In [1]:
import pandas as pd
import numpy as np

In [4]:
columns = ['Make',
'Model',
'Year',
'Engine HP',
'Engine Cylinders',
'Transmission Type',
'Vehicle Style',
'highway MPG',
'city mpg',
'MSRP']

In [5]:
df = pd.read_csv('./data.csv')
df = df[columns]
df.head().T

Unnamed: 0,0,1,2,3,4
Make,BMW,BMW,BMW,BMW,BMW
Model,1 Series M,1 Series,1 Series,1 Series,1 Series
Year,2011,2011,2011,2011,2011
Engine HP,335.0,300.0,300.0,230.0,230.0
Engine Cylinders,6.0,6.0,6.0,6.0,6.0
Transmission Type,MANUAL,MANUAL,MANUAL,MANUAL,MANUAL
Vehicle Style,Coupe,Convertible,Coupe,Coupe,Convertible
highway MPG,26,28,28,28,28
city mpg,19,19,20,18,18
MSRP,46135,40650,36350,29450,34500


#### Data Preparation

In [9]:
df.columns = df.columns.str.lower().str.replace(' ','_')
df['price'] = df['msrp']
del df['msrp']
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [10]:
# question 1
df['transmission_type'].value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

In [11]:
numerical = ['year','engine_hp','engine_cylinders','highway_mpg','city_mpg']

In [12]:
df[numerical].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.351794,-0.041479,0.25824,0.198171
engine_hp,0.351794,1.0,0.779988,-0.406563,-0.439371
engine_cylinders,-0.041479,0.779988,1.0,-0.621606,-0.600776
highway_mpg,0.25824,-0.406563,-0.621606,1.0,0.886829
city_mpg,0.198171,-0.439371,-0.600776,0.886829,1.0


In [14]:
price_mean = df['price'].mean()
price_mean


40594.737032063116

In [17]:
def average_price(price):
    if price >= price_mean:
        return 1
    else:
        return 0

In [18]:
df['above_average'] = df['price'].apply(average_price)

In [58]:
df.dropna(inplace = True)

In [59]:
# split data
from sklearn.model_selection import train_test_split

df_train_full, df_test = train_test_split(df,test_size=0.2,random_state=42)
df_train,df_val = train_test_split(df_train_full,test_size=0.25,random_state=42)

In [60]:
len(df_train),len(df_test),len(df_val)

(7089, 2364, 2363)

In [61]:
y_train = df_train['above_average']
y_test = df_test['above_average']
y_val = df_val['above_average']

In [62]:
del df_train['above_average']
del df_test['above_average']
del df_val['above_average']

del df_train['price']
del df_test['price']
del df_val['price']

In [63]:
df_train = df_train.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)


In [64]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7089 entries, 0 to 7088
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               7089 non-null   object 
 1   model              7089 non-null   object 
 2   year               7089 non-null   int64  
 3   engine_hp          7089 non-null   float64
 4   engine_cylinders   7089 non-null   float64
 5   transmission_type  7089 non-null   object 
 6   vehicle_style      7089 non-null   object 
 7   highway_mpg        7089 non-null   int64  
 8   city_mpg           7089 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 498.6+ KB


In [65]:
categorical = ['make','model','transmission_type','vehicle_style']

In [66]:
from sklearn.metrics import  mutual_info_score

# mutual information
def mutual_info_price_score(series):
    return mutual_info_score(series, df_train_full.above_average)

mi = df_train_full[categorical].apply(mutual_info_price_score)
mi.sort_values(ascending = False)

model                0.463040
make                 0.242789
vehicle_style        0.083382
transmission_type    0.020183
dtype: float64

In [67]:
df_train.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,Mercedes-Benz,GLA-Class,2015,355.0,4.0,AUTOMATED_MANUAL,4dr SUV,29,23
1,Dodge,Daytona,1992,141.0,6.0,MANUAL,2dr Hatchback,26,17
2,Saab,9-3 Griffin,2012,220.0,4.0,AUTOMATIC,Convertible,28,18
3,Cadillac,DTS,2009,275.0,8.0,AUTOMATIC,Sedan,23,15
4,Volkswagen,Tiguan,2017,200.0,4.0,AUTOMATIC,4dr SUV,24,20


In [68]:
from sklearn.feature_extraction import DictVectorizer

train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')

In [69]:
train_dicts[0]

{'make': 'Mercedes-Benz',
 'model': 'GLA-Class',
 'transmission_type': 'AUTOMATED_MANUAL',
 'vehicle_style': '4dr SUV',
 'year': 2015,
 'engine_hp': 355.0,
 'engine_cylinders': 4.0,
 'highway_mpg': 29,
 'city_mpg': 23}

In [70]:
dv = DictVectorizer(sparse = False)

X_train = dv.fit_transform(train_dicts)

In [71]:
val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.transform(val_dicts)

In [72]:
from sklearn.linear_model import LogisticRegression

In [73]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train,y_train)

In [78]:
y_pred = model.predict_proba(X_val)[ : ,1]
price_decision = (y_pred >= 0.4)
(y_val == price_decision).mean().round(2)

0.94

In [80]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = price_decision.astype('int')
df_pred['actual'] = y_val.values
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [81]:
df_pred

Unnamed: 0,probability,prediction,actual,correct
0,0.008940,0,0,True
1,0.985308,1,1,True
2,0.001974,0,0,True
3,0.999975,1,1,True
4,0.000221,0,0,True
...,...,...,...,...
2358,0.021960,0,0,True
2359,0.254923,0,1,False
2360,0.003069,0,0,True
2361,0.233327,0,0,True


In [113]:
subset = ['year',
'engine_hp',
'city_mpg']

train_dict_small = df_train[subset].to_dict(orient='records')
dv_small = DictVectorizer(sparse=False)
dv_small.fit(train_dict_small)

X_small_train = dv_small.transform(train_dict_small)

In [114]:
model_small = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model_small.fit(X_small_train,y_train)

In [115]:
val_dict_small = df_val[subset].to_dict(orient='records')
X_small_val = dv_small.transform(val_dict_small)

In [116]:
y_pred_small = model_small.predict_proba(X_small_val)[:, 1]
price_decision_small = (y_pred_small >= 0.4)
(y_val == price_decision_small).mean().round(2)

0.87

In [112]:
original_accuracy = 0.87
year = 0
engine = 0.14
transmission = 0.00
city = 0.01

SyntaxError: invalid syntax (Temp/ipykernel_10892/4162833874.py, line 5)