In [2]:
import numpy as np
import pandas as pd

In [3]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

--2023-10-02 10:07:25--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv.5’


2023-10-02 10:07:25 (56.2 MB/s) - ‘data.csv.5’ saved [1475504/1475504]



In [4]:
df = pd.read_csv('data.csv')

In [5]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

In [6]:
df.drop(['market_category','vehicle_size','popularity','number_of_doors','engine_fuel_type','driven_wheels'],axis = 1,inplace = True)

In [7]:
strings = list(df.dtypes[df.dtypes == 'object'].index)

In [8]:
for col in strings:
  df[col] = df[col].str.lower().str.replace(' ','_')

In [9]:
df.rename(columns = {'msrp':'price'},inplace = True)

In [10]:
df = df.fillna(0)

In [11]:
# Most frequent value of transmission_type.
df.transmission_type.mode()

0    automatic
Name: transmission_type, dtype: object

In [12]:
# Show the dtype of every remaining column.
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [13]:
numerical = ['year','engine_hp','engine_cylinders','highway_mpg','city_mpg','price']

In [14]:
df[numerical].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [15]:
# Binary target: 1 when a car's price is at or above the mean price, else 0.
mean_price = df['price'].mean()
df['above_average'] = (df['price'] >= mean_price).astype(int)
# Keep the raw prices as an array while removing the column from the frame.
price = df.pop('price').values

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
df_full_train,df_test = train_test_split(df, test_size = 0.2,random_state = 1)

In [18]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 1)

In [19]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [20]:
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [21]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [22]:
from sklearn.metrics import mutual_info_score

In [23]:
def mutual_info_price_avg_score(series):
  """Return the mutual information between `series` and the global `y_train`.

  `y_train` is looked up in the notebook namespace at call time, so the result
  reflects whatever target array is bound to that name when the call happens.
  """
  mi = mutual_info_score(series, y_train)
  return mi

In [24]:
# Show the column dtypes again; `above_average` now stands in for `price`.
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
above_average          int64
dtype: object

In [25]:
categorical = ['make','model','transmission_type','vehicle_style']

In [26]:
score = df_train[categorical].apply(mutual_info_price_avg_score)
score.round(2)

make                 0.24
model                0.47
transmission_type    0.02
vehicle_style        0.08
dtype: float64

In [27]:
from sklearn.feature_extraction import DictVectorizer

In [28]:
# sparse=False makes the vectorizer return a dense array instead of a sparse matrix.
dv = DictVectorizer(sparse = False)

In [29]:
# One dict per training row, keyed by column name.
train_dicts = df_train.to_dict(orient = 'records')

In [30]:
# Fit the vectorizer on the training rows and encode them in one step.
X_train = dv.fit_transform(train_dicts)

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [33]:
model.fit(X_train,y_train)

In [34]:
val_dicts = df_val.to_dict(orient = 'records')

In [35]:
X_val = dv.transform(val_dicts)

In [36]:
y_pred = model.predict_proba(X_val)[:,1]

In [37]:
above_average_decision = (y_pred >= 0.5).astype(int)

In [38]:
(y_val == above_average_decision).mean().round(2)

0.94

In [39]:
(y_val == above_average_decision).mean()

0.9353755770037767

In [40]:
no_year_train = df_train.drop('year',axis=1)
train_dicts = no_year_train.to_dict(orient = 'records')
X_train = dv.fit_transform(train_dicts)
model.fit(X_train,y_train)

no_year_val = df_val.drop('year',axis = 1)
val_dicts = no_year_val.to_dict(orient = 'records')
X_val = dv.transform(val_dicts)
y_pred = model.predict_proba(X_val)[:,1]
above_avg_decision = (y_pred >= 0.5).astype(int)
(y_val == above_avg_decision).mean()

0.9496433067561897

In [41]:
no_engine_hp_train = df_train.drop('engine_hp',axis=1)
train_dicts = no_engine_hp_train.to_dict(orient = 'records')
X_train = dv.fit_transform(train_dicts)
model.fit(X_train,y_train)

no_engine_hp_val = df_val.drop('engine_hp',axis = 1)
val_dicts = no_engine_hp_val.to_dict(orient = 'records')
X_val = dv.transform(val_dicts)
y_pred = model.predict_proba(X_val)[:,1]
above_avg_decision = (y_pred >= 0.5).astype(int)
(y_val == above_avg_decision).mean()

0.9290809903483005

In [42]:
no_city_mpg_train = df_train.drop('city_mpg',axis=1)
train_dicts = no_city_mpg_train.to_dict(orient = 'records')
X_train = dv.fit_transform(train_dicts)
model.fit(X_train,y_train)

no_city_mpg_val = df_val.drop('city_mpg',axis = 1)
val_dicts = no_city_mpg_val.to_dict(orient = 'records')
X_val = dv.transform(val_dicts)
y_pred = model.predict_proba(X_val)[:,1]
above_avg_decision = (y_pred >= 0.5).astype(int)
(y_val == above_avg_decision).mean()

0.936634494334872

In [43]:
no_transmission_type_train = df_train.drop('transmission_type',axis=1)
train_dicts = no_transmission_type_train.to_dict(orient = 'records')
X_train = dv.fit_transform(train_dicts)
model.fit(X_train,y_train)

no_transmission_type_val = df_val.drop('transmission_type',axis = 1)
val_dicts = no_transmission_type_val.to_dict(orient = 'records')
X_val = dv.transform(val_dicts)
y_pred = model.predict_proba(X_val)[:,1]
above_avg_decision = (y_pred >= 0.5).astype(int)
(y_val == above_avg_decision).mean()

0.9475451112043642

In [44]:
# Swap the binary target back out for the raw prices saved earlier.
df = df.drop(columns = ['above_average'])
df['price'] = price
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500


In [45]:
# Log-transform the target with log1p. Computed from the raw `price` array rather
# than from df.price so that re-running this cell cannot apply the log twice.
df['price'] = np.log1p(price)

In [46]:
df_full_train,df_test = train_test_split(df, test_size = 0.2,random_state = 1)

In [47]:
df_train,df_val = train_test_split(df_full_train, test_size = 0.25,random_state = 1)

In [48]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [49]:
del df_train['price']
del df_val['price']
del df_test['price']

In [50]:
from sklearn.linear_model import Ridge

In [52]:
# Row dicts for both splits, now without the price column.
train_dicts = df_train.to_dict(orient = 'records')
val_dicts = df_val.to_dict(orient = 'records')
# Refit the vectorizer on the training rows only; validation reuses that fit.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [62]:
# Ridge regularisation strengths (alpha) tried in the loop further down.
a = [0, 0.01, 0.1, 1, 10]

In [64]:
# Collects one validation RMSE per alpha.
scores = []

In [63]:
def rmse(y,y_pred):
  """Root-mean-squared error between targets `y` and predictions `y_pred`.

  Both arguments must support elementwise subtraction, and the difference must
  expose `.mean()` (for example NumPy arrays of matching shape).
  """
  residual = y - y_pred
  return np.sqrt((residual * residual).mean())

In [67]:
# Start from an empty list inside this cell so that re-running it does not
# append a second set of results to scores left over from an earlier run.
scores = []
for alpha in a:
  # Fit one Ridge model per regularisation strength and score it on validation.
  model = Ridge(alpha = alpha, max_iter = 2000, solver = 'sag', random_state = 42)
  model.fit(X_train,y_train)
  y_pred = model.predict(X_val)
  scores.append(rmse(y_val,y_pred))



In [71]:
# Validation RMSE for each alpha, in the same order as `a`.
scores

[0.4709573112436054,
 0.4709577801927823,
 0.47096200069291194,
 0.4710041929871678,
 0.4714248203933885]