In [1]:
import pandas as pd

In [2]:
# Load the raw dataset from the CSV file next to the notebook.
data_path = 'car_fuel_efficiency.csv'
df = pd.read_csv(data_path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


## Preparing the dataset

In [3]:
# Number of missing values in each column.
df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [4]:
# Replace every missing value with 0.
df = df.fillna(value=0)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for test, then 25% of the remainder for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Restart each split's index from zero, discarding the shuffled original index.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# pop() removes the target column from the feature frame and returns it,
# so the target cannot leak into the features.
y_train = df_train.pop('fuel_efficiency_mpg').to_numpy()
y_val = df_val.pop('fuel_efficiency_mpg').to_numpy()
y_test = df_test.pop('fuel_efficiency_mpg').to_numpy()

## Question 1

In [22]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

# Turn each row into a dict so DictVectorizer can build the feature matrix;
# the vectorizer is fitted on the training rows only.
dv = DictVectorizer(sparse=False)

train_dicts = df_train.fillna(0).to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# The validation rows go through the already-fitted vectorizer.
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

# A depth-1 tree makes exactly one split.
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train)

In [23]:
# Render the fitted tree as text, labelling splits with the vectorizer's feature names.
feature_names = list(dv.get_feature_names_out())
tree_rules = export_text(dt, feature_names=feature_names)
print(tree_rules)

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



`vehicle_weight` is the feature used for splitting the data.

## Question 2

In [32]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [35]:
# Baseline forest with 10 trees, scored by RMSE on the validation set.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

y_pred = rf.predict(X_val)
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
print(rmse)

0.45997775573361477


## Question 3

In [38]:
# Refit the forest for n_estimators = 10, 20, ..., 200 and print the
# validation RMSE, rounded to 3 decimals, for each size.
for n in range(10, 201, 10):
    rf = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1,
    ).fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    val_mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(val_mse)
    print(n, round(rmse, 3))

10 0.46
20 0.454
30 0.451
40 0.448
50 0.446
60 0.445
70 0.445
80 0.445
90 0.445
100 0.444
110 0.443
120 0.444
130 0.443
140 0.443
150 0.443
160 0.443
170 0.443
180 0.442
190 0.443
200 0.443


At n_estimators = 110, RMSE stops improving: from there on it stays at about 0.443, changing by no more than 0.001.

## Question 4

In [46]:
# Grid over tree depth and forest size. Each entry appended to `scores` is
# (max_depth, n_estimators, validation RMSE).
scores = []
for max_depth in (10, 15, 20, 25):
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=max_depth,
            random_state=1,
            n_jobs=-1,
        ).fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        val_mse = mean_squared_error(y_val, y_pred)
        rmse = np.sqrt(val_mse)
        scores.append((max_depth, n, rmse))

In [47]:
# One row per (max_depth, n_estimators) combination from the grid search.
score_columns = ['max_depth', 'n_estimators', 'rmse']
df_scores = pd.DataFrame(data=scores, columns=score_columns)

In [48]:
# Lowest validation RMSE first.
df_scores.sort_values(by='rmse', ascending=True)

Unnamed: 0,max_depth,n_estimators,rmse
17,10,180,0.439837
19,10,200,0.439942
18,10,190,0.439982
15,10,160,0.439997
14,10,150,0.440128
...,...,...,...
41,20,20,0.453368
61,25,20,0.453949
20,15,10,0.457160
40,20,10,0.459109


## Question 5

In [49]:
# Forest whose feature importances are inspected in the next cell.
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
)
rf = rf.fit(X_train, y_train)  # fit() returns the estimator itself

# Show the fitted estimator as the cell output.
rf

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [53]:
# Pair each importance with its feature name from the vectorizer.
importance_values = rf.feature_importances_
importance_index = list(dv.get_feature_names_out())
features = pd.Series(importance_values, index=importance_index)

# Most important feature first.
features.sort_values(ascending=False)

vehicle_weight                  0.959162
horsepower                      0.016040
acceleration                    0.011471
engine_displacement             0.003269
model_year                      0.003182
num_cylinders                   0.002359
num_doors                       0.001591
origin=USA                      0.000555
origin=Europe                   0.000520
origin=Asia                     0.000476
drivetrain=All-wheel drive      0.000382
fuel_type=Diesel                0.000344
fuel_type=Gasoline              0.000337
drivetrain=Front-wheel drive    0.000312
dtype: float64

## Question 6

In [54]:
import xgboost as xgb

In [55]:
# Wrap the train and validation matrices for XGBoost, keeping the
# vectorizer's feature names attached to the columns.
features = list(dv.get_feature_names_out())

dtrain = xgb.DMatrix(
    data=X_train,
    label=y_train,
    feature_names=features,
)
dval = xgb.DMatrix(
    data=X_val,
    label=y_val,
    feature_names=features,
)

# Datasets evaluated after boosting rounds, with the labels used in the log.
watchlist = [
    (dtrain, 'train'),
    (dval, 'val'),
]

In [60]:
%%capture output

xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100,
                  verbose_eval=5,
                  evals=watchlist)

In [61]:
# Validation RMSE of the trained booster, shown as the cell output.
y_pred = model.predict(dval)
val_mse = mean_squared_error(y_val, y_pred)
np.sqrt(val_mse)

np.float64(0.42622800553359225)

- For eta = 0.3, RMSE = 0.45
- For eta = 0.1, RMSE = 0.426

eta = 0.1 gives the lower RMSE on the validation set.