X Cause/Explanatory
Y Effect

## Imports 

In [1]:
import sklearn
import pandas as pd
import numpy as np
import datetime

## Visualization modules 

In [3]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

## Offline mode

In [4]:
init_notebook_mode(connected=True)
cf.go_offline()

## Package versions 

In [12]:
print('SciKit-Learn :', sklearn.__version__)
print('Pandas :', pd.__version__)
print('Numpy :', np.__version__)
print('Cufflinks :', cf.__version__)

SciKit-Learn : 0.22.2.post1
Pandas : 0.25.3
Numpy : 1.18.1
Cufflinks : 0.17.3


## Read the data 

In [14]:
data = pd.read_csv(r'Datasets\auto-mpg.csv')

In [19]:
data.sample(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
325,44.3,4,90.0,48,2085,21.7,80,2,vw rabbit c (diesel)
298,23.0,8,350.0,125,3900,17.4,79,1,cadillac eldorado
219,25.5,4,122.0,96,2300,15.5,77,1,plymouth arrow gs
131,32.0,4,71.0,65,1836,21.0,74,3,toyota corolla 1200
52,30.0,4,88.0,76,2065,14.5,71,2,fiat 124b


In [18]:
data.shape

(398, 9)

## Replace null values 

In [20]:
data = data.replace('?', np.nan)

In [21]:
data = data.dropna()

In [22]:
data.shape

(392, 9)

In [23]:
data.drop(['origin', 'car name'], axis=1, inplace=True)

In [25]:
data.sample(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
303,31.8,4,85.0,65,2020,19.2,79
282,22.3,4,140.0,88,2890,17.3,79
340,25.8,4,156.0,92,2620,14.4,81
361,25.4,6,168.0,116,2900,12.6,81
128,15.0,6,250.0,100,3336,17.0,74


In [26]:
data['model year'] = '19' + data['model year'].astype(str)

In [27]:
data.sample(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
110,22.0,4,108.0,94,2379,16.5,1973
395,32.0,4,135.0,84,2295,11.6,1982
7,14.0,8,440.0,215,4312,8.5,1970
114,26.0,4,98.0,90,2265,15.5,1973
285,17.0,8,305.0,130,3840,15.4,1979


In [37]:
data['age'] = datetime.datetime.now().year - pd.to_numeric(data['model year'])

In [39]:
data.sample(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,age
213,13.0,8,350.0,145,4055,12.0,1976,44
150,26.0,4,108.0,93,2391,15.5,1974,46
235,26.0,4,97.0,75,2265,18.2,1977,43
157,15.0,8,350.0,145,4440,14.0,1975,45
131,32.0,4,71.0,65,1836,21.0,1974,46


## Converting all columns to numeric values

In [40]:
data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year       object
age               int64
dtype: object

In [46]:
data.drop(['model year'], axis=1, inplace=True)

In [47]:
data['horsepower'] = pd.to_numeric(data['horsepower'])

In [48]:
data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower        int64
weight            int64
acceleration    float64
age               int64
dtype: object

## Statistical information

In [49]:
data.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,age
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,44.020408
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737
min,9.0,3.0,68.0,46.0,1613.0,8.0,38.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,41.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,44.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,47.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,50.0


## Exploring relationships in data

In [58]:
data.iplot(kind='scatter', x='age', y='mpg', mode='markers', xTitle='Age', yTitle='Miles per gallon')

In [59]:
data.iplot(kind='scatter', x='acceleration', y='mpg', mode='markers', xTitle='Acceleration', yTitle='Miles per gallon')

In [60]:
data.iplot(kind='scatter', x='weight', y='mpg', mode='markers', xTitle='Weight', yTitle='Miles per gallon')

In [61]:
data.iplot(kind='scatter', x='displacement', y='mpg', mode='markers', xTitle='Displacement', yTitle='Miles per gallon')

In [62]:
data.iplot(kind='scatter', x='horsepower', y='mpg', mode='markers', xTitle='Horsepower', yTitle='Miles per gallon')

In [63]:
data.iplot(kind='scatter', x='cylinders', y='mpg', mode='markers', xTitle='Cylinders', yTitle='Miles per gallon')

In [64]:
data_corr = data.corr()
data_corr

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,age
mpg,1.0,-0.777618,-0.805127,-0.778427,-0.832244,0.423329,-0.580541
cylinders,-0.777618,1.0,0.950823,0.842983,0.897527,-0.504683,0.345647
displacement,-0.805127,0.950823,1.0,0.897257,0.932994,-0.5438,0.369855
horsepower,-0.778427,0.842983,0.897257,1.0,0.864538,-0.689196,0.416361
weight,-0.832244,0.897527,0.932994,0.864538,1.0,-0.416839,0.30912
acceleration,0.423329,-0.504683,-0.5438,-0.689196,-0.416839,1.0,-0.290316
age,-0.580541,0.345647,0.369855,0.416361,0.30912,-0.290316,1.0


In [65]:
data_corr.iplot(kind='heatmap')

In [66]:
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,age
0,23.7,3,70.0,100,2420,12.5,40
1,33.0,4,105.0,74,2190,14.2,39
2,19.4,6,232.0,90,3210,17.2,42
3,20.6,6,225.0,110,3360,16.6,41
4,43.4,4,90.0,48,2335,23.7,40


## Save the data in a new csv file

In [68]:
data.to_csv(r'Datasets\auto-mpg-processed.csv', index=False)

## Load the new dataframe

In [69]:
df = pd.read_csv(r'Datasets\auto-mpg-processed.csv')

In [70]:
df.sample(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,age
34,27.4,4,121.0,80,2670,15.0,41
255,27.0,4,140.0,86,2790,15.6,38
220,16.5,8,350.0,180,4380,12.1,44
301,13.0,8,350.0,150,4699,14.5,46
307,11.0,8,429.0,208,4633,11.0,48


In [71]:
from sklearn.model_selection import train_test_split

In [79]:
X = df[['horsepower']]
Y = df['mpg']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

## Perfom linear regression 

In [80]:
from sklearn.linear_model import LinearRegression

In [81]:
linear_model = LinearRegression(normalize=True).fit(x_train, y_train)

In [82]:
print('Training score: ', linear_model.score(x_train, y_train))

Training score:  0.6199023776401038


In [84]:
y_pred = linear_model.predict(x_test)

In [85]:
from sklearn.metrics import r2_score

print('Testing score: ', r2_score(y_test, y_pred))

Testing score:  0.5480481421105923


In [90]:
scores = x_test, y_test