# Problem Statement

### Import the necessary Packages

In [1]:
import numpy as np   
import pandas as pd    
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,origin,cylinders,displacement,horsepower,weight,acceleration,year,name,Kilometer_per_liter
0,1,8,307.0,130,3504,12.0,1970,chevrolet chevelle malibu,7.652587
1,1,8,350.0,165,3693,11.5,1970,buick skylark 320,6.377156
2,1,8,318.0,150,3436,11.0,1970,plymouth satellite,7.652587
3,1,8,304.0,150,3433,12.0,1970,amc rebel sst,6.802299
4,1,8,302.0,140,3449,10.5,1970,ford torino,7.227443


In [4]:
# dependent variable mileage i.e kilometre travelled per litre

In [5]:
df.origin.value_counts()

1    249
3     79
2     70
Name: origin, dtype: int64

In [6]:
# origin is basically a categorical data
df.origin = df.origin.astype('object')

In [7]:
df[["manufacturer","model"]] = df.name.str.split(" ", n = 1 , expand = True)
df.drop("name",axis = 1,inplace= True)
df

Unnamed: 0,origin,cylinders,displacement,horsepower,weight,acceleration,year,Kilometer_per_liter,manufacturer,model
0,1,8,307.0,130,3504,12.0,1970,7.652587,chevrolet,chevelle malibu
1,1,8,350.0,165,3693,11.5,1970,6.377156,buick,skylark 320
2,1,8,318.0,150,3436,11.0,1970,7.652587,plymouth,satellite
3,1,8,304.0,150,3433,12.0,1970,6.802299,amc,rebel sst
4,1,8,302.0,140,3449,10.5,1970,7.227443,ford,torino
...,...,...,...,...,...,...,...,...,...,...
393,1,4,140.0,86,2790,15.6,1982,11.478880,ford,mustang gl
394,2,4,97.0,52,2130,24.6,1982,18.706323,vw,pickup
395,1,4,135.0,84,2295,11.6,1982,13.604599,dodge,rampage
396,1,4,120.0,79,2625,18.6,1982,11.904024,ford,ranger


In [10]:
correction = {"chevy":"chevrolet",
             "toyouta":"toyota",
             "maxda":"mazda",
             "chevroelt":"chevrolet",
             "vw":"volkswagen",
             "mercedes-benz":"mercedes",
             "vokswagen":"volkswagen"}


In [12]:
correction.values()

dict_values(['chevrolet', 'toyota', 'mazda', 'chevrolet', 'volkswagen', 'mercedes', 'volkswagen'])

In [13]:
df.manufacturer.replace(['chevy', 'toyouta', 'maxda', 'chevroelt', 'vw', 'mercedes-benz', 'vokswagen'],['chevrolet', 'toyota', 'mazda', 'chevrolet', 'volkswagen', 'mercedes', 'volkswagen'], inplace= True)

In [14]:
np.unique(df.manufacturer)

array(['amc', 'audi', 'bmw', 'buick', 'cadillac', 'capri', 'chevrolet',
       'chrysler', 'datsun', 'dodge', 'fiat', 'ford', 'hi', 'honda',
       'mazda', 'mercedes', 'mercury', 'nissan', 'oldsmobile', 'opel',
       'peugeot', 'plymouth', 'pontiac', 'renault', 'saab', 'subaru',
       'toyota', 'triumph', 'volkswagen', 'volvo'], dtype=object)

In [15]:
df.model.value_counts()

pinto            6
corolla          5
rabbit           5
maverick         5
matador          5
                ..
zephyr 6         1
d100             1
280s             1
310 gx           1
sunbird coupe    1
Name: model, Length: 299, dtype: int64

In [None]:
# model has 299 unique values. 
# we can drop this feature

In [16]:
df.drop(columns = 'model', axis = 1, inplace= True)

In [None]:
df.dtypes

In [None]:
df.name.head()

In [None]:
df.isnull().sum()

In [None]:
# seems like no missing value

In [None]:
df.horsepower = df.horsepower.astype('float')

In [None]:
# could not convert string to float: '?'
# This means there are '?' in horsepower column. 
# we need to replace '?' with mean or median and to do so, first change '?' to nan

In [17]:
df.horsepower.replace('?', np.nan, inplace= True)

In [18]:
df.horsepower.isnull().sum()

6

In [19]:
df.horsepower.fillna(df.horsepower.median(), inplace= True)
#df.horsepower.fillna(df.horsepower.mean(), inplace= True)

In [None]:
df.horsepower.isnull().sum()

In [20]:
df.horsepower = df.horsepower.astype('float64')

In [21]:
df.dtypes

origin                  object
cylinders                int64
displacement           float64
horsepower             float64
weight                   int64
acceleration           float64
year                     int64
Kilometer_per_liter    float64
manufacturer            object
dtype: object

In [None]:
# Let us check all the features for presence of any missing value like '?'

In [None]:
df.cylinders.value_counts()

In [None]:
df.displacement.value_counts()

In [None]:
df.year.value_counts()

In [None]:
df.weight.value_counts()

In [None]:
# So we could see All missing values have been handled 

### EDA

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.describe(include='object')

In [None]:
sns.countplot(x=df.origin)

In [None]:
sns.boxplot(x = df.origin, y= df.Kilometer_per_liter)

In [None]:
sns.heatmap(data = df.corr(), annot=True)

In [None]:
df_modified = df.drop(columns=['cylinders','displacement','weight'], axis = 1)
sns.heatmap(data = df_modified.corr(), annot = True)

#### df_modified dataset is producing lesser score than df dataset.
#### sticking to df dataset is prefered.

In [None]:
df_num = df.drop(columns= 'origin', axis = 1)

In [None]:
sns.pairplot(data= df_num, kind = 'reg' )

In [None]:
# we can see instead of fitting with a straight line, a curve seems to best fit the relation between 
# kilometer_per_liter - Weight, horsepower, displacement

In [None]:
df.Kilometer_per_liter.hist()

In [None]:
# Seems like an imbalanced data
# let us try with logfunction

In [25]:
log_kmpl = np.log(df.Kilometer_per_liter)

In [None]:
log_kmpl.hist()

In [None]:
# More balanced, so let us use log_kmpl in model

In [22]:
df = pd.get_dummies(columns = ['origin', 'manufacturer'], data = df)

In [68]:
df['log_kilo'] = np.log(df.Kilometer_per_liter)

In [69]:
df.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,Kilometer_per_liter,origin_1,origin_2,origin_3,...,manufacturer_plymouth,manufacturer_pontiac,manufacturer_renault,manufacturer_saab,manufacturer_subaru,manufacturer_toyota,manufacturer_triumph,manufacturer_volkswagen,manufacturer_volvo,log_kilo
0,8,307.0,130.0,3504,12.0,1970,7.652587,1,0,0,...,0,0,0,0,0,0,0,0,0,2.035044
1,8,350.0,165.0,3693,11.5,1970,6.377156,1,0,0,...,0,0,0,0,0,0,0,0,0,1.852722
2,8,318.0,150.0,3436,11.0,1970,7.652587,1,0,0,...,1,0,0,0,0,0,0,0,0,2.035044
3,8,304.0,150.0,3433,12.0,1970,6.802299,1,0,0,...,0,0,0,0,0,0,0,0,0,1.917261
4,8,302.0,140.0,3449,10.5,1970,7.227443,1,0,0,...,0,0,0,0,0,0,0,0,0,1.977885


### Modeling

In [129]:
x = df.drop(columns= 'Kilometer_per_liter', axis = 1)
y = df.Kilometer_per_liter

In [130]:
model = LinearRegression()

In [131]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size= 0.3)

In [132]:
model.fit(x_train, y_train)

LinearRegression()

In [133]:
model.score(x_train, y_train)

0.9812732391750811

In [134]:
model.score(x_test, y_test)

0.9764071925998731

In [66]:
#### Polynomial curve


In [135]:
# let us try to fit a polynomial curve 
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

poly = PolynomialFeatures(degree=2, interaction_only=True)

x_train2 = poly.fit_transform(x_train)
x_test2 = poly.fit_transform(x_test)

poly_clf = linear_model.LinearRegression()

poly_clf.fit(x_train2, y_train)

y_pred = poly_clf.predict(x_test2)

#print(y_pred)

#In sample (training) R^2 will always improve with the number of variables!
print(poly_clf.score(x_train2, y_train))
print(poly_clf.score(x_test2, y_test))

0.9997631753926973
-0.3748014452348869
