# **Practical Machine Learning Project**

Ghadamiyan Lida 407 AI

In [1]:
# Loading the training data
#
# The file has no header row, so columns are addressed by position: later
# cells use columns 1 and 2 as regression targets and column 3 as the text.

import pandas as pd

# Forward slashes are accepted on Windows as well as on Linux/macOS, whereas
# the previous back-slash separator only resolved on Windows.
Train_Data = pd.read_csv("Project_Data/training.txt", header=None)

In [2]:
# source: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

# Building the vocabulary from the training texts and counting, for every
# document, how often each vocabulary entry occurs

from sklearn.feature_extraction.text import CountVectorizer

# Unigrams only. The fitted vectorizer is kept in `cvect` because later cells
# reuse it to encode other data sets.
cvect = CountVectorizer(ngram_range=(1, 1))
training_texts = Train_Data[3].values
Train_Data_F = cvect.fit_transform(training_texts)

In [3]:
# Turning the raw counts into normalised term frequencies: with use_idf=False
# no inverse-document-frequency weighting is applied, and norm='l2' rescales
# every document vector to unit Euclidean length

from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer(use_idf=False, norm='l2')
Train_Data_T = tfidf_transformer.fit_transform(Train_Data_F)

In [4]:
# Defining and fitting the model: one independent Ridge regressor per target

from sklearn.linear_model import Ridge

# Model_R1 learns column 1 and Model_R2 learns column 2 of the training frame;
# both are trained on the same feature matrix with the same regularisation.
Model_R1, Model_R2 = (
    Ridge(alpha=1).fit(Train_Data_T, Train_Data[target_column])
    for target_column in (1, 2)
)

In [5]:
# Loading the validation data and extracting features without including the data in the dictionary

# Forward slashes keep the path usable outside Windows.
# NOTE(review): this file is decoded as latin-1 while the training file is read
# with the pandas default encoding -- confirm which encoding the files really
# use, since a mismatch could make non-ASCII tokens miss the fitted vocabulary.
Validation_Data = pd.read_csv("Project_Data/validation.txt", header=None, encoding='latin-1')

# transform() only: the vocabulary and normalisation fitted on the training
# data are reused, nothing is re-fitted on the validation texts.
Validation_Data_F = cvect.transform(Validation_Data[3].values)
Validation_Data_T = tfidf_transformer.transform(Validation_Data_F)

In [6]:
# Prediction for validation data: each fitted model scores the same
# validation feature matrix

Prediction1, Prediction2 = (
    fitted_model.predict(Validation_Data_T)
    for fitted_model in (Model_R1, Model_R2)
)

In [7]:
# Evaluating the model using Mean Absolute Error and Mean Squared Error

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Ground truth for each model, taken from the validation frame.
expected1 = Validation_Data[1].values
expected2 = Validation_Data[2].values

# Absolute errors of both models first, then the squared errors.
MAE1, MAE2 = (
    mean_absolute_error(expected1, Prediction1),
    mean_absolute_error(expected2, Prediction2),
)
MSE1, MSE2 = (
    mean_squared_error(expected1, Prediction1),
    mean_squared_error(expected2, Prediction2),
)

print('MAE model 1 : ', MAE1)
print('MAE model 2 : ', MAE2)

print('\nMSE model 1 : ', MSE1)
print('MSE model 2 : ', MSE2)

MAE model 1 :  0.5221330793452222
MAE model 2 :  0.6974568486309981

MSE model 1 :  0.46333504055507857
MSE model 2 :  0.8202959928306802


In [8]:
# Concatenating the training and validation data in a new training file used for the final submission
#
# Both inputs are decoded as UTF-8, so the merged file is now written and read
# back as UTF-8 too. Previously it was written in the platform default encoding
# with errors='ignore' (silently dropping every character that encoding could
# not represent) and then re-read as latin-1, which did not round-trip the text.
# Forward slashes keep the paths usable outside Windows.

filenames = ['Project_Data/training.txt', 'Project_Data/validation.txt']
merged_path = 'Project_Data/my_training.txt'

with open(merged_path, 'w', encoding='utf-8') as outfile:
    for fname in filenames:
        line = ''  # stays empty when the input file has no lines at all
        with open(fname, encoding='utf-8') as infile:
            for line in infile:
                outfile.write(line)
        # If a file does not end with a newline, its last record would be
        # glued to the first record of the next file; terminate it explicitly.
        if line and not line.endswith('\n'):
            outfile.write('\n')

Data = pd.read_csv(merged_path, encoding='utf-8', header=None)

In [9]:
# Creating a new dictionary with the new training data

# Both calls re-fit the existing vectorizer and transformer objects, so from
# this cell on `cvect` and `tfidf_transformer` hold the state learned from the
# combined data set rather than from the training file alone.
combined_texts = Data[3].values
Train_DataF = cvect.fit_transform(combined_texts)
Train_DataFF = tfidf_transformer.fit_transform(Train_DataF)


In [10]:
# Defining and fitting the model

# Fresh Ridge regressors are fitted on the combined data; the names Model_R1
# and Model_R2 are rebound, replacing the models fitted earlier.
Model_R1, Model_R2 = [
    Ridge(alpha=1).fit(Train_DataFF, Data[target_column])
    for target_column in (1, 2)
]

In [11]:
# Loading test data

# Forward slashes keep the path usable outside Windows.
Test_Data = pd.read_csv("Project_Data/test.txt", header=None)

# In this file the text sits in column 1. It is encoded with the already
# fitted vectorizer and transformer (transform only, nothing is re-fitted).
Test_DataF = cvect.transform(Test_Data[1].values)
Test_DataFF = tfidf_transformer.transform(Test_DataF)

In [12]:
# Prediction

# One prediction array per model, both computed from the same test features.
P1, P2 = Model_R1.predict(Test_DataFF), Model_R2.predict(Test_DataFF)

In [13]:
# Creating the final submission in csv format

# Column order of the submission: id (column 0 of the test file), then the
# predictions of the first and of the second model.
submission_columns = {
    'id': Test_Data[0].values,
    'lat': P1,
    'long': P2,
}
final_df = pd.DataFrame(submission_columns)

# index=False keeps the DataFrame index out of the written file.
final_df.to_csv('FinalSubmission.csv', index=False)

In [15]:
# Grid search 

from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline 

# One pipeline (word counts -> tf / tf-idf weighting -> Ridge) serves as the
# template estimator for both searches.
text_clf = Pipeline([
    ('cvect', CountVectorizer()),
    ('tfidf_transformer', TfidfTransformer()),
    ('Model_LR1', Ridge()),
])

# Candidate values, keyed as <pipeline step>__<parameter name>.
parameters = {
    'cvect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf_transformer__use_idf': (True, False),
    'tfidf_transformer__norm': ('l1', 'l2'),
    'Model_LR1__alpha': [0.1, 1, 2.2, 2.29, 3],
}


def _search_best_parameters(targets):
    """Run the 10-fold grid search over `parameters` for one target column.

    Candidates are ranked by negative mean absolute error and the search is
    fitted on the text column of the combined data. Returns the fitted
    GridSearchCV object.
    """
    search = GridSearchCV(text_clf, parameters, cv=10, n_jobs=-1,
                          scoring='neg_mean_absolute_error')
    search.fit(Data[3].values, targets)
    return search


# Search for the first target (column 1).
grid_search = _search_best_parameters(Data[1].values)

print('Best score 1st model : ', grid_search.best_score_)
print('Best parameters 1st model : ', grid_search.best_params_)

# Search for the second target (column 2).
grid_search2 = _search_best_parameters(Data[2].values)

print('Best score 2nd model : ', grid_search2.best_score_)
print('Best parameters 2nd model : ', grid_search2.best_params_)

Best score 1st model :  -0.5136244050757868
Best parameters 1st model :  {'Model_LR1__alpha': 1, 'cvect__ngram_range': (1, 1), 'tfidf_transformer__norm': 'l2', 'tfidf_transformer__use_idf': False}
Best score 2nd model :  -0.6806272277821972
Best parameters 2nd model :  {'Model_LR1__alpha': 1, 'cvect__ngram_range': (1, 1), 'tfidf_transformer__norm': 'l2', 'tfidf_transformer__use_idf': False}
