In [38]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [None]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'

In [None]:
!wget $data -O data-week-3.csv 

In [None]:
df = pd.read_csv('data-week-3.csv')
df.head()

In [5]:
df = df[['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']]
df.head()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [6]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df.loc[:, c] = df[c].str.lower().str.replace(' ', '_')

In [7]:
df.isna().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [8]:
df = df.fillna(0)
df.isna().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [9]:
df.rename(columns={'msrp': 'price'}, inplace=True)

print(df)

          make       model  year  engine_hp  engine_cylinders  \
0          bmw  1_series_m  2011      335.0               6.0   
1          bmw    1_series  2011      300.0               6.0   
2          bmw    1_series  2011      300.0               6.0   
3          bmw    1_series  2011      230.0               6.0   
4          bmw    1_series  2011      230.0               6.0   
...        ...         ...   ...        ...               ...   
11909    acura         zdx  2012      300.0               6.0   
11910    acura         zdx  2012      300.0               6.0   
11911    acura         zdx  2012      300.0               6.0   
11912    acura         zdx  2013      300.0               6.0   
11913  lincoln      zephyr  2006      221.0               6.0   

      transmission_type  vehicle_style  highway_mpg  city_mpg  price  
0                manual          coupe           26        19  46135  
1                manual    convertible           28        19  40650  
2     

In [10]:
print(df.transmission_type.mode())

0    automatic
Name: transmission_type, dtype: object


In [11]:
numeric_df = df[['engine_hp', 'year', 'highway_mpg', 'engine_cylinders', 'city_mpg']]
# print(numeric_df)
correlation_matrix = numeric_df.corr()

print(correlation_matrix)

                  engine_hp      year  highway_mpg  engine_cylinders  city_mpg
engine_hp          1.000000  0.338714    -0.415707          0.774851 -0.424918
year               0.338714  1.000000     0.258240         -0.040708  0.198171
highway_mpg       -0.415707  0.258240     1.000000         -0.614541  0.886829
engine_cylinders   0.774851 -0.040708    -0.614541          1.000000 -0.587306
city_mpg          -0.424918  0.198171     0.886829         -0.587306  1.000000


In [12]:
average_price = df.price.mean()
print(average_price)

40594.737032063116


In [13]:
df['above_average'] = (df['price'] > average_price).astype(int)

print(df)

          make       model  year  engine_hp  engine_cylinders  \
0          bmw  1_series_m  2011      335.0               6.0   
1          bmw    1_series  2011      300.0               6.0   
2          bmw    1_series  2011      300.0               6.0   
3          bmw    1_series  2011      230.0               6.0   
4          bmw    1_series  2011      230.0               6.0   
...        ...         ...   ...        ...               ...   
11909    acura         zdx  2012      300.0               6.0   
11910    acura         zdx  2012      300.0               6.0   
11911    acura         zdx  2012      300.0               6.0   
11912    acura         zdx  2013      300.0               6.0   
11913  lincoln      zephyr  2006      221.0               6.0   

      transmission_type  vehicle_style  highway_mpg  city_mpg  price  \
0                manual          coupe           26        19  46135   
1                manual    convertible           28        19  40650   
2  

### Perform the train/validation/test split with Scikit-Learn

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# y = df['above_average']
# df = df.drop(columns=['above_average'])

In [16]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [17]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [18]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [19]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [20]:
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(7148, 10)
(2383, 10)
(2383, 10)


### Mutual information score

In [21]:
from sklearn.metrics import mutual_info_score

In [22]:
categorical = [
    'make',
    'model',
    'transmission_type',
    'vehicle_style'
]
numerical = [
    'engine_hp',
    'year',
    'highway_mpg',
    'engine_cylinders',
    'city_mpg'
]
# print(df_train[categorical])

In [23]:
def mutual_info_churn_score(series):
    my_value = mutual_info_score(series, y_train)
    return round(my_value, 2)

In [24]:
mi = df_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

model                0.46
make                 0.24
vehicle_style        0.08
transmission_type    0.02
dtype: float64

### One-hot encoding

In [25]:
from sklearn.feature_extraction import DictVectorizer

In [26]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

### Training logistic regression with Scikit-Learn

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [28]:
# TRAINING LOGISTIC REGRESSSION MODEL:

# model = LogisticRegression(solver='lbfgs')
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
# solver='lbfgs' is the default solver in newer version of sklearn
# for older versions, you need to specify it explicitly
model.fit(X_train, y_train)

In [33]:
# INITIALIZING THE MODEL PREDICTION:
y_pred = model.predict(X_val)

#@ INSPECTING THE ACCURACY:
accuracy = np.round(accuracy_score(y_val, y_pred),2)
print(f"Answer for question 4: {accuracy:.2f}")

Answer for question 4: 0.93


In [34]:
# INITIALIZING FEATURES:
features = categorical + numerical
features

['make',
 'model',
 'transmission_type',
 'vehicle_style',
 'engine_hp',
 'year',
 'highway_mpg',
 'engine_cylinders',
 'city_mpg']

In [35]:
# INSPECTING THE DIFFERENCE IN ACCURACY:
orig_score = accuracy

for c in features:
    subset = features.copy()
    subset.remove(c)
    
    train_dict = df_train[subset].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    X_train = dv.transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)

    score = accuracy_score(y_val, y_pred)
    print(c, orig_score - score, score)
     

make -0.01670583298363404 0.9467058329836341
model 0.005954678976080596 0.9240453210239195
transmission_type -0.013348720100713307 0.9433487201007134
vehicle_style -0.007054133445237087 0.9370541334452371
engine_hp -0.0003399076793956235 0.9303399076793957
year -0.01712547209399906 0.9471254720939991
highway_mpg -0.014607637431808596 0.9446076374318086
engine_cylinders -0.016286193873268906 0.946286193873269
city_mpg -0.002438103231221067 0.9324381032312211


In [None]:
# So the answer for question 5 is engine

In [37]:
df['price']=np.log1p(df['price'])

# SPLITTING THE DATASET:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# PREPARING THE DATASET:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']
del df_train['price']
del df_val['price']
del df_test['price']

### Ridge Regression

In [40]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

# RIDGE REGRESSION IMPLEMENTATION:
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    
    print(a, round(score, 3))



0 0.487




0.01 0.487




0.1 0.487




1 0.487
10 0.487


