## Imports

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['Automobile_data.csv']


## DataPreprocessing

In [2]:
df=pd.read_csv('../input/Automobile_data.csv')
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [3]:
print(df.columns)

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')


In [4]:
df['horsepower']=pd.to_numeric(df['horsepower'],errors='coerce')
df['curb-weight']=pd.to_numeric(df['curb-weight'],errors='coerce')
df['price']=pd.to_numeric(df['price'],errors='coerce')
df.dropna(subset=['horsepower','price','curb-weight'],inplace=True)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000,21,27,13495.0
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000,21,27,16500.0
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154.0,5000,19,26,16500.0
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102.0,5500,24,30,13950.0
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115.0,5500,18,22,17450.0


## Selecting a field with high correlation wrt price

In [5]:
from scipy.stats import pearsonr

In [6]:
pearsonr(df.horsepower,df.price)

(0.8105330821322063, 1.1891278276946011e-47)

In [7]:
pearsonr(df['curb-weight'],df.price)
#higher correlation than horsepower

(0.8350904472742752, 5.017116685970889e-53)

## Plotting the data

In [8]:
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, ColumnDataSource
output_notebook()

In [9]:
datsrc= ColumnDataSource(data=(dict(x=df['curb-weight'],y=df['price'],make=df.make)))
ttips=[('price','$y{$0}'),('make','@make'),('curb-weight','$x')]

pl= figure(plot_width=600,plot_height=400,tooltips=ttips)

pl.xaxis.axis_label='curb-weight'
pl.yaxis.axis_label='price'

pl.circle('x','y',source=datsrc,size=10,color='red',alpha=0.5)

show(pl)

## Training Single Linear Regression Model

In [10]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In [11]:
train_d,test_d=train_test_split(df,test_size=0.25)

In [12]:
mymodel=linear_model.LinearRegression()
#model is expecting a 2d matrix as an input at x. Thus x has to be reshaped.
trn_x=np.array(train_d['curb-weight']).reshape(-1,1)#returning a 2d matrix
trn_y=np.array(train_d['price'])

mymodel.fit(trn_x,trn_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [13]:
slope=np.asscalar(np.squeeze(mymodel.coef_))
intercept=mymodel.intercept_

print("Slope= ",slope,"Intercept= ",intercept)

Slope=  12.063495539878451 Intercept=  -17604.510442487655


In [14]:
from bokeh.models import Slope

best_fit=Slope(gradient=slope,y_intercept=intercept,line_color='black',line_width=3)
pl.add_layout(best_fit)
show(pl)

## Checking accuracy of our model

In [15]:
from sklearn.metrics import accuracy_score, mean_squared_error,mean_absolute_error,r2_score

In [16]:
predictions_train=np.array(mymodel.predict(trn_x))


maet=mean_absolute_error(trn_y,predictions_train)
mset=mean_squared_error(trn_y,predictions_train)
r2scoret=r2_score(trn_y,predictions_train)
print("For TRAIN DATA\nMean Absolute Error : ",maet,"\nMean Squared Error: ",mset,"\nR2 Score: ",r2scoret)

For TRAIN DATA
Mean Absolute Error :  2708.961203848124 
Mean Squared Error:  17137771.315300506 
R2 Score:  0.6937326881058172


In [17]:
test_x=np.array(test_d['curb-weight']).reshape(-1,1)
test_y=np.array(test_d['price'])

predictions=(mymodel.predict(test_x))

In [18]:
mae=mean_absolute_error(test_y,predictions)
mse=mean_squared_error(test_y,predictions)
r2score=r2_score(test_y,predictions)

print("For TEST DATA\nMean Absolute Error : ",mae,"\nMean Squared Error: ",mse,"\nR2 Score: ",r2score)

For TEST DATA
Mean Absolute Error :  2921.1809013170987 
Mean Squared Error:  25826911.51359897 
R2 Score:  0.6923933186825844


## Multiple variables for regression

In [19]:
cols=['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
        'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
        'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
        'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
        'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
        'highway-mpg', 'price']

for col in cols:
    df[col]=pd.to_numeric(df[col],errors='coerce')
df.dropna(subset=['price','horsepower','curb-weight'])

for col in cols:
    print(col,pearsonr(df[col],df.price))

symboling (-0.08246495783141179, 0.24688185056945727)
normalized-losses (nan, 1.0)
make (nan, 1.0)
fuel-type (nan, 1.0)
aspiration (nan, 1.0)
num-of-doors (nan, 1.0)
body-style (nan, 1.0)
drive-wheels (nan, 1.0)
engine-location (nan, 1.0)
wheel-base (0.5837967812893919, 1.4313694393587485e-19)
length (0.6939647745646871, 6.39831060305001e-30)
width (0.7538710519013427, 8.679834788813268e-38)
height (0.13499022754460993, 0.05730390719825449)
curb-weight (0.8350904472742752, 5.017116685970889e-53)
engine-type (nan, 1.0)
num-of-cylinders (nan, 1.0)
engine-size (0.8738869517981516, 1.2650674479074428e-63)
fuel-system (nan, 1.0)
bore (nan, 1.0)
stroke (nan, 1.0)
compression-ratio (0.06954883036213086, 0.3290105035533686)
horsepower (0.8105330821322063, 1.1891278276946011e-47)
peak-rpm (-0.10164886620219901, 0.15311824317199588)
city-mpg (-0.6892532361080423, 2.2048708080036233e-29)
highway-mpg (-0.7052298604211743, 3.008766425754632e-31)
price (1.0, 0.0)


### Select horsepower, curb-weight, length, width, engine-size, 

In [20]:
cols=['horsepower','curb-weight','length','width','engine-size']

multi_x=np.column_stack(tuple(df[col] for col in cols))

multi_train_x,multi_test_x,multi_train_y,multi_test_y=train_test_split(multi_x,df['price'],test_size=0.25)
multi_model=linear_model.LinearRegression()
multi_model.fit(multi_train_x,multi_train_y)
multi_coef=dict(zip(cols,multi_model.coef_))
multi_intercept=multi_model.intercept_


In [21]:
#for training data
pred= multi_model.predict(multi_train_x)

mae=mean_absolute_error(multi_train_y,pred)
mse=mean_squared_error(multi_train_y,pred)
r2score=r2_score(multi_train_y,pred)

print("For TRAIN DATA\nMean Absolute Error : ",mae,"\nMean Squared Error: ",mse,"\nR2 Score: ",r2score)

For TRAIN DATA
Mean Absolute Error :  2232.2684338212016 
Mean Squared Error:  10517960.858333364 
R2 Score:  0.8241314205158522


In [22]:
#for training data
pred= multi_model.predict(multi_test_x)

mae=mean_absolute_error(multi_test_y,pred)
mse=mean_squared_error(multi_test_y,pred)
r2score=r2_score(multi_test_y,pred)

print("For TRAIN DATA\nMean Absolute Error : ",mae,"\nMean Squared Error: ",mse,"\nR2 Score: ",r2score)

For TRAIN DATA
Mean Absolute Error :  2827.4334055583354 
Mean Squared Error:  15123075.270746147 
R2 Score:  0.7937035298659749


## Using Ridge Regression

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
ridge=linear_model.Ridge()
alpha = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]
parameters={'alpha':[1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}
ridgeReg=GridSearchCV(ridge,parameters,scoring='neg_mean_squared_error',cv=5)

In [25]:

ridgeReg.fit(multi_train_x,multi_train_y)
pred=ridgeReg.predict(multi_test_x)

r2=r2_score(pred,multi_test_y)
r2



0.7089911759537944

### For Training data

### For Test Data

## Using Lasso Regression

In [26]:
lasso= linear_model.Lasso()
alpha = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]
parameters={'alpha':[1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}
lassoreg=GridSearchCV(lasso,parameters,scoring='neg_mean_squared_error',cv=5)

lassoreg.fit(multi_train_x,multi_train_y)

lpred=lassoreg.predict(multi_test_x)

mae=mean_absolute_error(lpred,multi_test_y)
mse=mean_squared_error(lpred,multi_test_y)
r2score=r2_score(lpred,multi_test_y)
print("For TRAIN DATA\nMean Absolute Error : ",mae,"\nMean Squared Error: ",mse,"\nR2 Score: ",r2score)

For TRAIN DATA
Mean Absolute Error :  2828.541120172152 
Mean Squared Error:  15142578.917204786 
R2 Score:  0.7088071091847155




### For Training data

In [27]:
lpred= lassoreg.predict(multi_train_x)
mae=mean_absolute_error(lpred,multi_train_y)
mse=mean_squared_error(lpred,multi_train_y)
r2score=r2_score(lpred,multi_train_y)
print("For TRAIN DATA\nMean Absolute Error : ",mae,"\nMean Squared Error: ",mse,"\nR2 Score: ",r2score)

For TRAIN DATA
Mean Absolute Error :  2232.898975600231 
Mean Squared Error:  10518418.718685728 
R2 Score:  0.7864644644681663


### For Test data

In [28]:
lpred=lassoreg.predict(multi_test_x)

mae=mean_absolute_error(lpred,multi_test_y)
mse=mean_squared_error(lpred,multi_test_y)
r2score=r2_score(lpred,multi_test_y)
print("For TRAIN DATA\nMean Absolute Error : ",mae,"\nMean Squared Error: ",mse,"\nR2 Score: ",r2score)

For TRAIN DATA
Mean Absolute Error :  2828.541120172152 
Mean Squared Error:  15142578.917204786 
R2 Score:  0.7088071091847155
