In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [2]:
df = pd.read_csv("car-price_data.csv")
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [3]:
df_fixed = df[["Make", "Model", "Year", "Engine HP", "Engine Cylinders", "Transmission Type", "Vehicle Style",
                "highway MPG", "city mpg", "MSRP"]]
df_fixed

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [4]:
df_fixed.columns = df_fixed.columns.str.replace(" ", "_").str.lower()
df_fixed = df_fixed.rename(columns={"msrp": "price"})
df_fixed

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [5]:
df_fixed.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
price                 0
dtype: int64

In [6]:
df_fixed = df_fixed.fillna(0)

df_fixed.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

In [8]:
df_fixed["transmission_type"].value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

### Question 1 ###
AUTOMATIC

In [9]:
df_fixed.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [10]:
categorical = list(df_fixed.dtypes[df_fixed.dtypes == 'object'].index)
categorical

['make', 'model', 'transmission_type', 'vehicle_style']

In [11]:
numeric = list(df_fixed.dtypes[df_fixed.dtypes != 'object'].index)
numeric

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'price']

In [12]:
df_fixed[numeric].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


## Question 2 ##
highway_mpg and city_mpg

In [7]:
average_price = df_fixed['price'].mean()
df_fixed['above_average'] = df_fixed['price'].apply(lambda x: 1 if x > average_price else 0)
df_fixed

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0
...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120,1
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670,1
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620,1
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920,1


In [7]:
df_fixed_with_price = df_fixed.copy()
df_fixed_with_price

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [15]:
average_price

40594.737032063116

In [16]:
del df_fixed['price']
df_fixed

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,0
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,1
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,1
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,1
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,1


In [17]:
df_train_full, df_test = train_test_split(df_fixed, test_size=0.2, random_state=42)

In [18]:
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [19]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [20]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [21]:
y_train = df_train['above_average'].values
y_val = df_val['above_average'].values
y_test = df_test['above_average'].values

In [22]:
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [24]:
df_val

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,Volkswagen,Beetle,2015,210.0,4.0,MANUAL,2dr Hatchback,31,23
1,Audi,SQ5,2015,354.0,6.0,AUTOMATIC,4dr SUV,24,17
2,Pontiac,Grand Am,2005,140.0,4.0,AUTOMATIC,Sedan,31,22
3,Nissan,350Z,2009,306.0,6.0,MANUAL,Convertible,24,17
4,Ford,E-150,1996,199.0,6.0,AUTOMATIC,Passenger Van,15,11
...,...,...,...,...,...,...,...,...,...
2378,Ford,Explorer Sport,2003,203.0,6.0,AUTOMATIC,2dr SUV,19,14
2379,Subaru,Outback,2016,175.0,4.0,AUTOMATIC,4dr SUV,33,25
2380,GMC,Sonoma,2003,190.0,6.0,MANUAL,Extended Cab Pickup,17,12
2381,Aston Martin,V8 Vantage,2015,430.0,8.0,AUTOMATED_MANUAL,Coupe,21,14


In [26]:
mi_scores = {}

for variable in categorical:
    mi_score = mutual_info_score(df_train_full['above_average'], df_train_full[variable])
    mi_scores[variable] = round(mi_score, 2)

print(mi_scores)

{'make': 0.24, 'model': 0.46, 'transmission_type': 0.02, 'vehicle_style': 0.08}


## Question 3 ##

transmission_type

In [30]:
numerical = numeric[:-1]
numerical

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [31]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')

In [32]:
dv = DictVectorizer(sparse=False)

In [33]:
dv.fit(train_dicts)

In [34]:
X_train = dv.transform(train_dicts)

In [35]:
X_train.shape

(7148, 943)

In [36]:
val_dicts = df_val[categorical + numerical].to_dict(orient="records")

In [37]:
X_val = dv.transform(val_dicts)

In [38]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [39]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([8.45426579e-04, 9.96341923e-01, 1.49816720e-04, ...,
       2.62411354e-04, 9.89700971e-01, 9.87620006e-01])

In [40]:
price_decision = (y_pred >= 0.5)
price_decision

array([False,  True, False, ..., False,  True,  True])

In [41]:
(y_val == price_decision).mean()

0.9450272765421738

## Question 4 ##
0.95

In [43]:
'''
year
engine_hp
transmission_type
city_mpg
'''

'\nyear\nengine_hp\ntransmission_type\ncity_mpg\n'

In [44]:
num_year = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

***Обучение без year***

0.9479647503147294

In [45]:
train_dicts = df_train[categorical + num_year].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)

X_train = dv.transform(train_dicts)
X_train.shape

(7148, 942)

In [46]:
val_dicts = df_val[categorical + num_year].to_dict(orient="records")
X_val = dv.transform(val_dicts)

In [47]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [48]:
y_pred = model.predict_proba(X_val)[:, 1]
price_decision = (y_pred >= 0.5)
(y_val == price_decision).mean()

0.9479647503147294

In [49]:
num_engine_hp = ['year', 'engine_cylinders', 'highway_mpg', 'city_mpg']

**Обучение без engine_hp**

0.9278220730172052

In [50]:
train_dicts = df_train[categorical + num_engine_hp].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)

X_train = dv.transform(train_dicts)
X_train.shape

(7148, 942)

In [51]:
val_dicts = df_val[categorical + num_engine_hp].to_dict(orient="records")
X_val = dv.transform(val_dicts)

In [52]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [53]:
y_pred = model.predict_proba(X_val)[:, 1]
price_decision = (y_pred >= 0.5)
(y_val == price_decision).mean()

0.9278220730172052

In [54]:
num_city_mpg = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg']

**Обучение без city_mpg**

0.9324381032312211

In [55]:
train_dicts = df_train[categorical + num_city_mpg].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)

X_train = dv.transform(train_dicts)
X_train.shape

(7148, 942)

In [57]:
val_dicts = df_val[categorical + num_city_mpg].to_dict(orient="records")
X_val = dv.transform(val_dicts)

In [58]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [59]:
y_pred = model.predict_proba(X_val)[:, 1]
price_decision = (y_pred >= 0.5)
(y_val == price_decision).mean()

0.9324381032312211

In [60]:
categorical_transmission_type = ['make', 'model', 'vehicle_style']

**Обучение без transmission_type**

0.9450272765421738

In [62]:
numerical

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [63]:
train_dicts = df_train[categorical_transmission_type + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)

X_train = dv.transform(train_dicts)
X_train.shape

(7148, 938)

In [64]:
val_dicts = df_val[categorical_transmission_type + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

In [65]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [66]:
y_pred = model.predict_proba(X_val)[:, 1]
price_decision = (y_pred >= 0.5)
(y_val == price_decision).mean()

0.9450272765421738

## Question 5 ##

transmission_type

In [8]:
df_fixed_with_price

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [9]:
df_fixed_with_price['price'] = np.log1p(df_fixed_with_price['price'])
df_fixed_with_price

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,10.739349
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,10.612779
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,10.500977
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,10.290483
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,10.448744
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.739024
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.945018
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.832122
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.838031


In [10]:
df_train_full, df_test = train_test_split(df_fixed_with_price, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [11]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [12]:
y_train = df_train['price'].values
y_val = df_val['price'].values
y_test = df_test['price'].values

In [13]:
del df_train['price']
del df_val['price']
del df_test['price']

In [16]:
y_train

array([10.42228135, 10.17526888, 12.42118806, ..., 10.25224121,
        7.60140233, 10.60214453])

In [14]:
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']
categorical

['make', 'model', 'transmission_type', 'vehicle_style']

In [15]:
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']
numerical

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [16]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=True)
dv.fit(train_dicts)

X_train = dv.transform(train_dicts)
X_train.shape

(7148, 943)

In [17]:
val_dicts = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

In [95]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [20]:
for alpha in [0, 0.01, 0.1, 1, 10]:
    # Обучаем модель Ridge regression
    model = Ridge(alpha=alpha, solver='sag', random_state=42)
    model.fit(X_train, y_train)
    
    # Предсказываем значения 'price' для валидационной выборки
    y_pred = model.predict(X_val)
    
    # Вычисляем RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    print(f'alpha: {alpha}, RMSE: {round(rmse, 3)}')

alpha: 0, RMSE: 0.251
alpha: 0.01, RMSE: 0.255
alpha: 0.1, RMSE: 0.251
alpha: 1, RMSE: 0.258
alpha: 10, RMSE: 0.331


In [25]:
min(y_val), max(y_val)

(7.601402334583733, 14.349527179656565)