In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

In [2]:
# Load 'auto-mpg.csv' (path is relative to the notebook's working directory).
df = pd.read_csv('auto-mpg.csv')
# Preview 20 random rows; the fixed seed keeps the preview identical on every
# Restart & Run All (same seed value as the train/test splits further down).
df.sample(20, random_state=0)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
142,26.0,4,79.0,67,1963,15.5,74,2,volkswagen dasher
117,29.0,4,68.0,49,1867,19.5,73,2,fiat 128
368,27.0,4,112.0,88,2640,18.6,82,1,chevrolet cavalier wagon
62,13.0,8,350.0,165,4274,12.0,72,1,chevrolet impala
78,21.0,4,120.0,87,2979,19.5,72,2,peugeot 504 (sw)
267,27.5,4,134.0,95,2560,14.2,78,3,toyota corona
252,19.2,6,231.0,105,3535,19.2,78,1,pontiac phoenix lj
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
255,25.1,4,140.0,88,2720,15.4,78,1,ford fairmont (man)
219,25.5,4,122.0,96,2300,15.5,77,1,plymouth arrow gs


In [3]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [4]:
# Count NaN entries per column.
# NOTE(review): this only detects real NaN values; string placeholders such as
# the '?' entries that a later cell filters out of 'horsepower' are not counted.
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [5]:
# Drop the rows whose 'horsepower' field contains the '?' placeholder, then
# convert the column (loaded as object dtype) to a numeric dtype.
# astype(str) keeps this cell re-runnable: once the column is numeric the
# `.str` accessor would otherwise raise.
hp_is_placeholder = df["horsepower"].astype(str).str.contains('?', regex=False)
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the original (avoids SettingWithCopyWarning).
df = df.loc[~hp_is_placeholder].copy()
df["horsepower"] = pd.to_numeric(df["horsepower"])

In [6]:
# Features: every column except the target ('mpg') and the free-text 'car name'.
# Selecting by name instead of by position keeps the cell correct even if the
# CSV column order changes.
X = df.drop(columns=["mpg", "car name"])
# Target: the 'mpg' column.
y = df["mpg"]

In [7]:
# Show which columns ended up in the feature matrix.
print(X.columns)

Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'model year', 'origin'],
      dtype='object')


In [8]:
# Min-max scale every feature (MinMaxScaler's default output range is [0, 1]).
# From here on X is the array returned by the scaler, not the DataFrame.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set minima/maxima influence the training features; consider fitting
# on the training rows only.
sc = MinMaxScaler()
X = sc.fit_transform(X)

In [9]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

## Linear Regression without decomposition or correlation-based feature selection

In [10]:
# Baseline: linear regression on all seven scaled features.
model = LinearRegression()
model.fit(X_train, y_train)
# Predictions for the held-out rows, scored in the next cell.
y_p = model.predict(X_test)

In [11]:
# Coefficient of determination of the baseline on the held-out rows.
print("r2_score = ", r2_score(y_test, y_p))

r2_score =  0.8363664681024056


## PCA

In [12]:
# Project the feature matrix onto its first three principal components.
# X is expected to be the min-max-scaled array produced by the scaling cell.
# NOTE(review): PCA is fitted on all rows before the split that follows, so the
# components are estimated with the test rows included.
pca = PCA(n_components=3)
pca_features = pca.fit_transform(X)

In [13]:
# Display the projected data: one row per car, one column per component.
pca_features

array([[ 0.76908765,  0.28294046,  0.20257292],
       [ 0.89603286,  0.34023057,  0.13209555],
       [ 0.81470799,  0.31577319,  0.17605204],
       ...,
       [-0.25168737, -0.57417248, -0.07884605],
       [-0.30071765, -0.64625521, -0.04314999],
       [-0.29393279, -0.64944444, -0.0463404 ]])

In [14]:
# Same 80/20 split and seed as the baseline, now on the PCA projection.
X_train, X_test, y_train, y_test = train_test_split(
    pca_features, y, test_size=0.2, random_state=0
)

In [15]:
# Linear regression on the three principal components, scored on the held-out rows.
model = LinearRegression()
model.fit(X_train, y_train)
y_p = model.predict(X_test)
print("r2_score with PCA = ", r2_score(y_test, y_p))

r2_score with PCA =  0.802647317929555


## Correlation

In [26]:
# Leave out the free-text 'car name' column so only the remaining columns
# enter the correlation matrix.
df_cor = df.drop(columns=['car name'])

In [27]:
# Pairwise correlation matrix of the columns (pandas' default method is Pearson).
df_cor.corr()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
mpg,1.0,-0.777618,-0.805127,-0.778427,-0.832244,0.423329,0.580541,0.565209
cylinders,-0.777618,1.0,0.950823,0.842983,0.897527,-0.504683,-0.345647,-0.568932
displacement,-0.805127,0.950823,1.0,0.897257,0.932994,-0.5438,-0.369855,-0.614535
horsepower,-0.778427,0.842983,0.897257,1.0,0.864538,-0.689196,-0.416361,-0.455171
weight,-0.832244,0.897527,0.932994,0.864538,1.0,-0.416839,-0.30912,-0.585005
acceleration,0.423329,-0.504683,-0.5438,-0.689196,-0.416839,1.0,0.290316,0.212746
model year,0.580541,-0.345647,-0.369855,-0.416361,-0.30912,0.290316,1.0,0.181528
origin,0.565209,-0.568932,-0.614535,-0.455171,-0.585005,0.212746,0.181528,1.0


In [18]:
# Hand-picked predictors, selected by name rather than by column position.
# Their correlations with mpg in the matrix above are roughly
# displacement -0.81, weight -0.83 and model year 0.58.
X = df_cor[["displacement", "weight", "model year"]]
# Target: the 'mpg' column.
y = df_cor["mpg"]

In [19]:
# Min-max scale the three selected features, then make the same 80/20 split
# (same seed) as in the earlier experiments.
# NOTE(review): the scaler is fitted on all rows before the split, so test-set
# minima/maxima influence the training features; consider fitting on the
# training rows only.
sc = MinMaxScaler()
X = sc.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [20]:
# Linear regression on the three selected, scaled features.
model = LinearRegression()
model.fit(X_train, y_train)
# Predictions for the held-out rows, scored in the next cell.
y_p = model.predict(X_test)

In [21]:
# Coefficient of determination of the three-feature linear model on the held-out rows.
print("r2_score = ", r2_score(y_test, y_p))

r2_score =  0.8117740295735567


## SVR on the correlation-selected features

In [22]:
# RBF-kernel support vector regression, trained on the same three scaled
# features and the same split as the preceding linear model.
model = SVR(kernel='rbf')
model.fit(X_train, y_train)
# Predictions for the held-out rows, scored in the next cell.
y_p = model.predict(X_test)

In [23]:
# Coefficient of determination of the SVR model on the held-out rows.
print(r2_score(y_test, y_p))

0.8726243726538053
