In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [3]:
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV

import warnings
# Install a blanket "ignore" filter so warning messages do not clutter the cell outputs.
# NOTE(review): this also hides any diagnostics the estimators below may emit
# during fitting - consider narrowing the filter to specific warning categories.
warnings.filterwarnings("ignore")

In [4]:
# Location of the pre-processed auto-mpg CSV. Setting the AUTO_MPG_CSV
# environment variable overrides it, so the notebook can run on a machine
# other than the author's; when the variable is unset the original local
# path is used unchanged.
DATA_PATH = os.environ.get(
    'AUTO_MPG_CSV',
    'C:/Users/Mahalakshmi R/Desktop/Personal/Janani Ravi-Regression/automobiles/auto-mpg-processed.csv',
)

automobile_df = pd.read_csv(DATA_PATH)

# Preview the first five rows to confirm the expected columns were loaded.
automobile_df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,age
0,18.0,8,307.0,130,3504,12.0,50
1,15.0,8,350.0,165,3693,11.5,50
2,18.0,8,318.0,150,3436,11.0,50
3,16.0,8,304.0,150,3433,12.0,50
4,17.0,8,302.0,140,3449,10.5,50


In [5]:
# Fixed seed: without it train_test_split shuffles differently on every run,
# so the tuned hyperparameters and every score below would not be reproducible.
RANDOM_STATE = 42

# Features: every column except the target ('mpg') and 'age'.
X = automobile_df.drop(columns=['mpg', 'age'])
# Target: the 'mpg' column.
Y = automobile_df['mpg']

# Hold out 20% of the rows for the final evaluation of each tuned model.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=RANDOM_STATE
)

In [6]:
# Tune Lasso's regularization strength (the `alpha` hyperparameter) with an
# exhaustive grid search over the candidate values below.
parameters = {'alpha':[0.2,0.4,0.6,0.7,0.8,0.9,1.0]}

# cv=3: the training data passed to fit() is split into 3 folds. Each of the
# 7 candidate alphas is fitted 3 times, each time training on 2 folds and
# scoring on the remaining held-out fold.
grid_search = GridSearchCV(
    estimator=Lasso(),
    param_grid=parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

{'alpha': 1.0}

In [7]:
# Show how each candidate alpha scored, to see why the search picked its winner.
cv_results = grid_search.cv_results_
n_candidates = len(parameters['alpha'])

for params, mean_score, rank in zip(
    cv_results['params'][:n_candidates],
    cv_results['mean_test_score'][:n_candidates],
    cv_results['rank_test_score'][:n_candidates],
):
    print('Parameters', params)
    print('Mean Test Score: ', mean_score)
    print('Rank: ', rank)

Parameters {'alpha': 0.2}
Mean Test Score:  0.7042124424976194
Rank:  7
Parameters {'alpha': 0.4}
Mean Test Score:  0.7072316830490385
Rank:  6
Parameters {'alpha': 0.6}
Mean Test Score:  0.70994526454195
Rank:  5
Parameters {'alpha': 0.7}
Mean Test Score:  0.7111885644441528
Rank:  4
Parameters {'alpha': 0.8}
Mean Test Score:  0.7118765943722233
Rank:  3
Parameters {'alpha': 0.9}
Mean Test Score:  0.711926134994286
Rank:  2
Parameters {'alpha': 1.0}
Mean Test Score:  0.7119741676269329
Rank:  1


In [8]:
# Train the final Lasso model on the full training set, using the alpha
# selected by the grid search.
best_alpha = grid_search.best_params_['alpha']
lasso_model = Lasso(alpha=best_alpha).fit(x_train, y_train)

In [9]:
# Evaluate the tuned Lasso model: R^2 on the training data, then R^2 of its
# predictions on the held-out test data.
train_r2 = lasso_model.score(x_train, y_train)

y_pred = lasso_model.predict(x_test)
test_r2 = r2_score(y_test, y_pred)

print('Train Score', train_r2)
print('Test Score', test_r2)

Train Score 0.7239867509585522
Test Score 0.6174851279252587


In [10]:
# Tune the number of neighbours used by the KNN regressor.
# NOTE(review): the features reach KNN unscaled (StandardScaler is imported
# but never applied in the visible cells), so neighbour distances may be
# dominated by the columns with the largest values - consider scaling.
parameters = {'n_neighbors': [10,12,14,18,20,25,30,35,50]}

# 3-fold cross-validation on the training data for each candidate value.
grid_search = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

{'n_neighbors': 20}

In [11]:
# Show the mean cross-validated score and rank of each candidate n_neighbors.
cv_results = grid_search.cv_results_
n_candidates = len(parameters['n_neighbors'])

for params, mean_score, rank in zip(
    cv_results['params'][:n_candidates],
    cv_results['mean_test_score'][:n_candidates],
    cv_results['rank_test_score'][:n_candidates],
):
    print('Parameters', params)
    print('Mean Test Score: ', mean_score)
    print('Rank: ', rank)

Parameters {'n_neighbors': 10}
Mean Test Score:  0.7445507877505294
Rank:  6
Parameters {'n_neighbors': 12}
Mean Test Score:  0.7403824881353863
Rank:  7
Parameters {'n_neighbors': 14}
Mean Test Score:  0.738884983571239
Rank:  8
Parameters {'n_neighbors': 18}
Mean Test Score:  0.7457870358316833
Rank:  5
Parameters {'n_neighbors': 20}
Mean Test Score:  0.7513105087181943
Rank:  1
Parameters {'n_neighbors': 25}
Mean Test Score:  0.7509177387846002
Rank:  2
Parameters {'n_neighbors': 30}
Mean Test Score:  0.7493136114502118
Rank:  3
Parameters {'n_neighbors': 35}
Mean Test Score:  0.7485785662033345
Rank:  4
Parameters {'n_neighbors': 50}
Mean Test Score:  0.7362745703403195
Rank:  9


In [12]:
# Train the final KNN regressor on the full training set, using the
# n_neighbors value selected by the grid search.
best_n_neighbors = grid_search.best_params_['n_neighbors']
kneighbors_model = KNeighborsRegressor(n_neighbors=best_n_neighbors).fit(x_train, y_train)

In [13]:
# Evaluate the tuned KNN model: R^2 on the training data, then R^2 of its
# predictions on the held-out test data.
train_r2 = kneighbors_model.score(x_train, y_train)

y_pred = kneighbors_model.predict(x_test)
test_r2 = r2_score(y_test, y_pred)

print('Training Score: ', train_r2)
print('Test Score: ', test_r2)

Training Score:  0.7672997386801129
Test Score:  0.5441976217345832


In [16]:
# Tune the maximum depth of the decision tree regressor.
parameters = {'max_depth':[1,2,3,4,5,7,8]}

# random_state is fixed because the tree chooses at random between candidate
# splits that improve the criterion equally; without a seed the fitted trees
# (and therefore the selected max_depth) can differ between runs on the same data.
grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train,y_train)

# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

{'max_depth': 2}

In [17]:
# Train the final decision tree on the full training set, using the max_depth
# selected by the grid search. random_state is fixed so that ties between
# equally good splits are broken the same way on every run, which keeps the
# scores reported for this model reproducible.
decisiontree_model = DecisionTreeRegressor(
    max_depth=grid_search.best_params_['max_depth'],
    random_state=42,
).fit(x_train,y_train)

In [18]:
# Evaluate the tuned decision tree: R^2 on the training data, then R^2 of its
# predictions on the held-out test data.
train_r2 = decisiontree_model.score(x_train, y_train)

y_pred = decisiontree_model.predict(x_test)
test_r2 = r2_score(y_test, y_pred)

print('Training score: ', train_r2)
print('Test score: ', test_r2)

Training score:  0.7514886190806696
Test score:  0.5480941133045227


In [20]:
# Tune a linear-kernel SVR over every combination of epsilon and C below
# (4 x 2 = 8 candidates), using 3-fold cross-validation on the training data.
parameters = {'epsilon' : [0.05, 0.1,0.2,0.3],
             'C':[0.2,0.3]}

grid_search = GridSearchCV(
    estimator=SVR(kernel='linear'),
    param_grid=parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

{'C': 0.2, 'epsilon': 0.05}

In [21]:
# Train the final linear-kernel SVR on the full training set, using the
# epsilon and C values selected by the grid search.
best_svr_params = grid_search.best_params_
svr_model = SVR(
    kernel='linear',
    epsilon=best_svr_params['epsilon'],
    C=best_svr_params['C'],
).fit(x_train, y_train)

In [22]:
# Evaluate the tuned SVR model: R^2 on the training data, then R^2 of its
# predictions on the held-out test data.
train_r2 = svr_model.score(x_train, y_train)

y_pred = svr_model.predict(x_test)
test_r2 = r2_score(y_test, y_pred)

print('Training Score: ', train_r2)
print('Test Score: ', test_r2)

Training Score:  0.7217008370014333
Test Score:  0.6071377009122577
