In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [5]:
df = pd.read_csv('../datasets/Automobile_data.csv')
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [6]:
# Converting price column type to numeric & Dropping '?' entries from price column 
df['price'] = df['price'].replace('?',np.nan)
df['price'] = pd.to_numeric(df['price'])
df = df[df['price'].notna()]

# Dropping '?' from num-of-doors column 
df['num-of-doors'] = df['num-of-doors'].replace('?',np.nan)
df = df[df['num-of-doors'].notna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          199 non-null    int64  
 1   normalized-losses  199 non-null    object 
 2   make               199 non-null    object 
 3   fuel-type          199 non-null    object 
 4   aspiration         199 non-null    object 
 5   num-of-doors       199 non-null    object 
 6   body-style         199 non-null    object 
 7   drive-wheels       199 non-null    object 
 8   engine-location    199 non-null    object 
 9   wheel-base         199 non-null    float64
 10  length             199 non-null    float64
 11  width              199 non-null    float64
 12  height             199 non-null    float64
 13  curb-weight        199 non-null    int64  
 14  engine-type        199 non-null    object 
 15  num-of-cylinders   199 non-null    object 
 16  engine-size        199 non

In [7]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [11]:
# Converting column type to numeric and replacing '?' with mean value
num_col = ['normalized-losses', 'bore',  'stroke', 'horsepower', 'peak-rpm']
for col in num_col:
    df[col] = df[col].replace('?', np.nan)
    df[col] = pd.to_numeric(df[col])
    df[col].fillna(df[col].mean(), inplace=True)
df.T.head(26)

Unnamed: 0,0,1,2,3,4,5,6,7,8,10,...,195,196,197,198,199,200,201,202,203,204
symboling,3,3,1,2,2,2,1,1,1,2,...,-1,-2,-1,-2,-1,-1,-1,-1,-1,-1
normalized-losses,121.84,121.84,121.84,164,164,121.84,158,121.84,158,192,...,74,103,74,103,74,95,95,95,95,95
make,alfa-romero,alfa-romero,alfa-romero,audi,audi,audi,audi,audi,audi,bmw,...,volvo,volvo,volvo,volvo,volvo,volvo,volvo,volvo,volvo,volvo
fuel-type,gas,gas,gas,gas,gas,gas,gas,gas,gas,gas,...,gas,gas,gas,gas,gas,gas,gas,gas,diesel,gas
aspiration,std,std,std,std,std,std,std,std,turbo,std,...,std,std,std,turbo,turbo,std,turbo,std,turbo,turbo
num-of-doors,two,two,two,four,four,two,four,four,four,two,...,four,four,four,four,four,four,four,four,four,four
body-style,convertible,convertible,hatchback,sedan,sedan,sedan,sedan,wagon,sedan,sedan,...,wagon,sedan,wagon,sedan,wagon,sedan,sedan,sedan,sedan,sedan
drive-wheels,rwd,rwd,rwd,fwd,4wd,fwd,fwd,fwd,fwd,rwd,...,rwd,rwd,rwd,rwd,rwd,rwd,rwd,rwd,rwd,rwd
engine-location,front,front,front,front,front,front,front,front,front,front,...,front,front,front,front,front,front,front,front,front,front
wheel-base,88.6,88.6,94.5,99.8,99.4,99.8,105.8,105.8,105.8,101.2,...,104.3,104.3,104.3,104.3,104.3,109.1,109.1,109.1,109.1,109.1


In [12]:
df.describe() 

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
count,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0
mean,0.844221,121.840491,98.822613,174.267839,65.896482,53.773869,2558.065327,127.045226,3.331949,3.255538,10.114171,103.604061,5118.020305,25.130653,30.633166,13242.613065
std,1.259651,32.103986,6.086103,12.323033,2.106373,2.44673,519.189151,41.703898,0.268538,0.317355,3.920762,37.448626,478.600274,6.408881,6.801746,7978.344857
min,-2.0,65.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,0.0,99.5,94.5,166.8,64.15,52.0,2157.0,97.5,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7775.0
50%,1.0,121.840491,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5118.020305,24.0,30.0,10295.0
75%,2.0,137.0,102.4,183.5,66.75,55.55,2930.5,143.0,3.585,3.41,9.4,116.0,5500.0,30.0,34.0,16501.5
max,3.0,256.0,120.9,208.1,72.0,59.8,4066.0,326.0,3.94,4.17,23.0,262.0,6600.0,49.0,54.0,45400.0


In [13]:
cleanup_nums = {"num-of-doors":     {"four": 4, "two": 2},
                "num-of-cylinders": {"four": 4, "six": 6, "five": 5, "eight": 8,
                                  "two": 2, "twelve": 12, "three":3 }}
df = df.replace(cleanup_nums)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,121.840491,alfa-romero,gas,std,2,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,121.840491,alfa-romero,gas,std,2,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,121.840491,alfa-romero,gas,std,2,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,4,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,4,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [20]:
df = pd.get_dummies(df, columns=["body-style", "drive-wheels"], prefix=["body", "drive"])
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,engine-location,wheel-base,length,width,...,highway-mpg,price,body_convertible,body_hardtop,body_hatchback,body_sedan,body_wagon,drive_4wd,drive_fwd,drive_rwd
0,3,121.840491,alfa-romero,gas,std,2,front,88.6,168.8,64.1,...,27,13495.0,1,0,0,0,0,0,0,1
1,3,121.840491,alfa-romero,gas,std,2,front,88.6,168.8,64.1,...,27,16500.0,1,0,0,0,0,0,0,1
2,1,121.840491,alfa-romero,gas,std,2,front,94.5,171.2,65.5,...,26,16500.0,0,0,1,0,0,0,0,1
3,2,164.0,audi,gas,std,4,front,99.8,176.6,66.2,...,30,13950.0,0,0,0,1,0,0,1,0
4,2,164.0,audi,gas,std,4,front,99.4,176.6,66.4,...,22,17450.0,0,0,0,1,0,1,0,0


In [21]:
df["OHC_Code"] = np.where(df["engine-type"].str.contains("ohc"), 1, 0)
df[["make", "engine-type", "OHC_Code"]].head()

Unnamed: 0,make,engine-type,OHC_Code
0,alfa-romero,dohc,1
1,alfa-romero,dohc,1
2,alfa-romero,ohcv,1
3,audi,ohc,1
4,audi,ohc,1


In [22]:
from sklearn.preprocessing import OrdinalEncoder

ord_enc = OrdinalEncoder()
df["make_code"] = ord_enc.fit_transform(df[["make"]])
df[["make", "make_code"]].head(11)

Unnamed: 0,make,make_code
0,alfa-romero,0.0
1,alfa-romero,0.0
2,alfa-romero,0.0
3,audi,1.0
4,audi,1.0
5,audi,1.0
6,audi,1.0
7,audi,1.0
8,audi,1.0
10,bmw,2.0


In [38]:
attrs = ['symboling', 'normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 
           'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price', 'num-of-doors', 'num-of-cylinders', 'body_convertible', 
           'body_hardtop', 'body_hatchback', 'body_sedan', 'body_wagon', 'drive_4wd', 'drive_fwd', 'drive_rwd', 'OHC_Code', 'make_code']
feed = df[attrs]
feed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 0 to 204
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          199 non-null    int64  
 1   normalized-losses  199 non-null    float64
 2   wheel-base         199 non-null    float64
 3   length             199 non-null    float64
 4   width              199 non-null    float64
 5   height             199 non-null    float64
 6   curb-weight        199 non-null    int64  
 7   engine-size        199 non-null    int64  
 8   bore               199 non-null    float64
 9   stroke             199 non-null    float64
 10  compression-ratio  199 non-null    float64
 11  horsepower         199 non-null    float64
 12  peak-rpm           199 non-null    float64
 13  city-mpg           199 non-null    int64  
 14  highway-mpg        199 non-null    int64  
 15  price              199 non-null    float64
 16  num-of-doors       199 non

In [39]:
feed.describe()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,...,body_convertible,body_hardtop,body_hatchback,body_sedan,body_wagon,drive_4wd,drive_fwd,drive_rwd,OHC_Code,make_code
count,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,...,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0
mean,0.844221,121.840491,98.822613,174.267839,65.896482,53.773869,2558.065327,127.045226,3.331949,3.255538,...,0.030151,0.040201,0.341709,0.462312,0.125628,0.040201,0.582915,0.376884,0.919598,12.361809
std,1.259651,32.103986,6.086103,12.323033,2.106373,2.44673,519.189151,41.703898,0.268538,0.317355,...,0.171433,0.196926,0.475479,0.499835,0.332266,0.196926,0.494321,0.485828,0.2726,6.24902
min,-2.0,65.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,99.5,94.5,166.8,64.15,52.0,2157.0,97.5,3.15,3.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,8.0
50%,1.0,121.840491,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,12.0
75%,2.0,137.0,102.4,183.5,66.75,55.55,2930.5,143.0,3.585,3.41,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,19.0
max,3.0,256.0,120.9,208.1,72.0,59.8,4066.0,326.0,3.94,4.17,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,21.0


In [41]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [42]:
feed[attrs]=scaler.fit_transform(feed[attrs])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feed[attrs]=scaler.fit_transform(feed[attrs])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [43]:
feed.describe()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,...,body_convertible,body_hardtop,body_hatchback,body_sedan,body_wagon,drive_4wd,drive_fwd,drive_rwd,OHC_Code,make_code
count,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,...,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0
mean,0.568844,0.297594,0.356344,0.495042,0.478332,0.497822,0.415076,0.249227,0.565678,0.564542,...,0.030151,0.040201,0.341709,0.462312,0.125628,0.040201,0.582915,0.376884,0.919598,0.588658
std,0.25193,0.168084,0.177437,0.183926,0.180032,0.203894,0.201392,0.157373,0.191813,0.151121,...,0.171433,0.196926,0.475479,0.499835,0.332266,0.196926,0.494321,0.485828,0.2726,0.297572
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.4,0.180628,0.230321,0.383582,0.32906,0.35,0.259503,0.137736,0.435714,0.495238,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.380952
50%,0.6,0.297594,0.303207,0.479104,0.444444,0.525,0.359193,0.222642,0.55,0.580952,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.571429
75%,0.8,0.376963,0.460641,0.632836,0.551282,0.645833,0.559542,0.309434,0.746429,0.638095,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.904762
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
df_train_x = feed.drop('price',axis = 1)
df_train_x.describe()
df_train_y = feed['price']
df_train_y.describe

<bound method NDFrame.describe of 0      0.207959
1      0.282558
2      0.282558
3      0.219254
4      0.306142
         ...   
200    0.291123
201    0.345738
202    0.406311
203    0.430763
204    0.434611
Name: price, Length: 199, dtype: float64>

In [47]:
x_train, x_test, y_train, y_test = train_test_split(df_train_x, df_train_y, test_size=0.15, random_state=42)

In [51]:
from treend import *

In [70]:
reg = M5regressor(smoothing=True, n_attr_leaf=4, max_depth=4, k=15.0)
reg.fit(np.array(x_train), np.array(y_train)[:,None])
predictions = reg.predict(np.array(x_test))
print("r2_score is : " , r2_score(y_test, predictions))

r2_score is :  -64.08184021669054


  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(
  ret = ret.dtype.type(ret / rcount)


In [73]:
predictions-y_test

87     0.013213
16    -0.285465
116   -0.026084
183    0.011082
81     0.047925
169    0.302687
73    -0.000149
72    -0.017268
125    9.285715
179   -0.123794
182    0.019707
154    0.053623
70    -0.127698
32     0.006045
91    -0.008860
90     0.035173
59     0.039520
65    -0.173317
95    -0.015066
165    0.031343
17    -0.105104
130    7.844827
101    0.041578
178   -0.146847
71     0.041097
195    0.176701
153    0.052320
10    -0.032775
19    -0.012417
134    0.251086
Name: price, dtype: float64

In [72]:
y_test

87     0.103297
16     0.898590
116    0.318554
183    0.070925
81     0.083933
169    0.120922
73     0.889777
72     0.743210
125    0.419542
179    0.270096
182    0.065960
154    0.069013
70     0.657415
32     0.006976
91     0.038007
90     0.049178
59     0.092523
65     0.326746
95     0.066556
165    0.103768
17     0.788491
130    0.103694
101    0.208058
178    0.283998
71     0.721563
195    0.205973
153    0.044685
10     0.280820
19     0.029219
134    0.246313
Name: price, dtype: float64

In [49]:
reg = LinearRegression().fit(x_train, y_train)
predictions = reg.predict(x_test)

In [50]:
print("r2_score is : " , r2_score(y_test, predictions))

r2_score is :  0.8262499679282982
