In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Data Preprocessing

In [2]:
data=pd.read_csv('Automobile_data.csv')

In [3]:
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [4]:
data.replace('?',np.nan,inplace=True)

In [5]:
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [6]:
data.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [7]:
data['num-of-doors'].value_counts()

four    114
two      89
Name: num-of-doors, dtype: int64

In [8]:
data['num-of-doors'].replace(np.nan,'four',inplace=True)
data['peak-rpm'].fillna(pd.to_numeric(data['peak-rpm']).mean(),inplace=True)
data['horsepower'].fillna(pd.to_numeric(data['horsepower']).mean(),inplace=True)
data.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          0
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            0
peak-rpm              0
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [9]:
mazda_bore=data[data['make']=='mazda']
mazda_bore_median=mazda_bore.loc[:,'bore'].median()
mazda_stroke=data[data['make']=='mazda']
mazda_stroke_median=mazda_stroke.loc[:,'stroke'].median()
for i  in range(len(data)):
    if data.loc[i,'make']=='mazda':
        if str(data.loc[i,'stroke'])=='nan':
            data.loc[i,'stroke']=mazda_stroke_median
        if str(data.loc[i,'bore'])=='nan':
            data.loc[i,'bore']=mazda_bore_median
data.loc[:,'stroke']=pd.to_numeric(data.loc[:,'stroke'])
data.loc[:,'bore']=pd.to_numeric(data.loc[:,'bore'])


In [10]:
data.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          0
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  0
stroke                0
compression-ratio     0
horsepower            0
peak-rpm              0
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [11]:
index_list=[]

for i in range(len(data)):
    if str(data['normalized-losses'][i])=='nan':
        index_list.append(i)

normalized_mis_values=data.iloc[index_list,:]
brands=normalized_mis_values['make'].unique()
brand_normalized={}
for brand in brands:
    
    values=data[data['make']==brand].loc[:,'normalized-losses']
    brand_normalized[brand]=values.median()
for i in range(len(data)):
    if data.loc[i,'make'] in brand_normalized.keys():
        if str(data.loc[i,'normalized-losses'])=='nan':
            data.loc[i,'normalized-losses']=brand_normalized[data.loc[i,'make']]
data['normalized-losses'].fillna(pd.to_numeric(data['normalized-losses']).mean(),inplace=True)

In [12]:
data.isna().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                4
dtype: int64

In [13]:
data.dropna(inplace=True)
data.isna().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          201 non-null    int64  
 1   normalized-losses  201 non-null    object 
 2   make               201 non-null    object 
 3   fuel-type          201 non-null    object 
 4   aspiration         201 non-null    object 
 5   num-of-doors       201 non-null    object 
 6   body-style         201 non-null    object 
 7   drive-wheels       201 non-null    object 
 8   engine-location    201 non-null    object 
 9   wheel-base         201 non-null    float64
 10  length             201 non-null    float64
 11  width              201 non-null    float64
 12  height             201 non-null    float64
 13  curb-weight        201 non-null    int64  
 14  engine-type        201 non-null    object 
 15  num-of-cylinders   201 non-null    object 
 16  engine-size        201 non

In [15]:
data['price']=pd.to_numeric(data['price'])
data['horsepower']=pd.to_numeric(data['horsepower'])
data['peak-rpm']=pd.to_numeric(data['peak-rpm'])
data=pd.get_dummies(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 204
Columns: 136 entries, symboling to fuel-system_spfi
dtypes: float64(9), int64(6), uint8(121)
memory usage: 48.9 KB


## Linear Regression From Scratch

In [50]:
y=data['price'].values
X=data.drop('price',axis=1)
sc = StandardScaler()
X = sc.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [51]:
def cost_function(X, y, a, b):
    cost = np.sum((((X.dot(a) + b) - y) ** 2) / (len(y)))
    return cost

In [52]:
def gradient_descent_function(X, y, a, b, alpha, epochs):
    m = len(y)
    costs = [0] * epochs
    
    for epoch in range(epochs):
        # Calculate the value -- Forward Propagation
        z = X.dot(a) + b
        
        # Calculate the losses
        loss = z - y
        
        # Calculate gradient descent
        a_gradient = X.T.dot(loss) / m
        b_gradient = np.sum(loss) / m
        
        # Update weights and bias
        a = a - alpha*a_gradient
        b = b - alpha*b_gradient
        
        # Store current lost
        cost = cost_function(X, y, a, b)
        costs[epoch] = cost
        
    return a, b, costs

In [57]:
a = np.random.randn(X_train.shape[1])
b = 0
a, b,costs = gradient_descent_function(X_train, y_train, a, b,alpha=0.15, epochs=25000)

In [58]:
print(a,'\n',b)

[-2824.07726036  -285.35820742 -1791.22322059  2748.71457256
 -2455.460867    5378.32366775  7558.28964878 -2772.53089277
  -849.08416054  3580.22550884  -773.38661913   260.17023997
  -330.48308966   609.25237515  -565.45935696  -386.32309623
    54.84924015   163.66260161  -413.54282853  -237.37536072
  -171.53575793  -314.02162409    13.48712204   429.73734774
   697.66230624  -214.16124845   667.93188553   197.28901293
  -227.58638958  -461.60967901  -112.92161254 -1133.1895497
   935.49079304   416.2108563    -87.89797309   316.54786105
 -1327.73917661   720.71356554   138.63393618    29.55756835
  -487.59515961  -350.64508911 -1341.61488674  -138.30018811
  -763.06894371  -499.0652978    738.68565655  -291.11413175
    66.9484089    566.53594909    71.99311328   -88.90173529
   543.27202934  -515.60139302   -83.98128857   -22.94188744
   515.1521095  -1603.97754184  -667.63013945 -1368.24801909
   172.37545773  -247.82829533   600.00613725   434.90398005
   462.49646686   765.519

In [35]:
def linear(X,a,b):
    y=X.dot(a)+b
    return y
def prediction(X_test,a,b):
    y_pred=linear(X_test,a,b)
    return y_pred
def r2score(y_pred, y):
    rss = np.sum((y_pred - y) ** 2)
    tss = np.sum((y-y.mean()) ** 2)
    
    r2 = 1 - (rss / tss)
    return r2
y_pred=prediction(X_test,a,b)
r2=r2score(y_pred,y_test)
print(r2)

0.13835731831862375


## Linear Regression by sklearn

In [59]:
model=LinearRegression()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

In [60]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
print("MAE : ",mean_absolute_error(y_pred,y_test))
print("MSE : ",mean_squared_error(y_pred,y_test))
print("r2-score : ",r2_score(y_pred,y_test))

MAE :  892165622378602.5
MSE :  7.986034639238404e+30
r2-score :  -0.08805296323388134


In [61]:
model.coef_,model.intercept_

(array([-2.81963172e+03, -2.20096995e+02, -1.59719171e+03,  2.63227429e+03,
        -2.49015553e+03,  5.25098454e+03,  7.65760533e+03, -2.91917012e+03,
        -8.21855889e+02,  3.73022833e+03, -7.36682076e+02,  2.66039744e+02,
        -3.70078430e+02,  6.57910226e+02,  1.96937204e+14, -1.08155719e+15,
         3.90909188e+14, -2.91743591e+14, -4.76247572e+14,  6.96547854e+13,
         3.51043420e+15,  3.39395431e+14,  4.76327549e+14, -2.15291912e+15,
        -1.51612046e+15,  3.39395431e+14,  4.35939084e+14,  4.35939084e+14,
        -5.00210402e+14,  3.90909188e+14,  1.96937204e+14,  2.77814114e+14,
        -2.91743591e+14, -2.91743591e+14, -3.56412569e+14, -4.10509202e+14,
        -9.92181152e+14, -2.21859304e+14,  3.90909188e+14,  3.39395431e+14,
         4.76327549e+14, -2.91743591e+14,  4.76327549e+14,  3.39395431e+14,
        -6.27583135e+14,  3.51043420e+15, -4.10509202e+14, -5.38901848e+14,
         2.77814114e+14,  1.20040883e+14,  2.77814114e+14,  6.36628259e+14,
         2.7