In [1]:
import pandas as pd

In [2]:
# Load the raw car fuel-efficiency table from a CSV file in the working directory.
df = pd.read_csv('car_fuel_efficiency.csv')

In [3]:
# Count missing values per column (summing the boolean mask down the rows)
# before any imputation is applied.
df.isnull().sum(axis=0)

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [4]:
# Impute every missing value with the constant 0. A constant fill uses no
# statistics from the data, so doing it before the train/val/test split does
# not leak information between partitions.
df = df.fillna(value=0)

In [5]:
# Sanity check: after the fill, every column should report zero missing values.
df.isnull().sum(axis=0)

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Step 1: hold out 20% of all rows as the final test set.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

# Step 2: take 25% of the remaining 80% (i.e. 20% of the full data) as the
# validation set, which leaves a 60/20/20 train/validation/test split.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [8]:
# Row counts of the train, validation and test partitions, in that order.
tuple(len(part) for part in (df_train, df_val, df_test))

(5822, 1941, 1941)

In [9]:
# Show the first two records transposed, so each column reads as a row.
df.head(n=2).transpose()

Unnamed: 0,0,1
engine_displacement,170,130
num_cylinders,3.0,5.0
horsepower,159.0,97.0
vehicle_weight,3413.433759,3149.664934
acceleration,17.7,17.8
model_year,2003,2007
origin,Europe,USA
fuel_type,Gasoline,Gasoline
drivetrain,All-wheel drive,Front-wheel drive
num_doors,0.0,0.0


In [10]:
# Distinct categories present in the fuel_type column.
df['fuel_type'].unique()

array(['Gasoline', 'Diesel'], dtype=object)

In [11]:
from sklearn.feature_extraction import DictVectorizer

In [12]:
# Drop the shuffled original row labels so every partition is indexed 0..n-1.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [13]:
# Separate the target from the features for each partition. ``pop`` returns the
# target column and removes it from the frame in one step, so the frames that
# remain contain only model inputs.
# NOTE: this mutates the frames in place, so the cell cannot be re-run on its
# own; re-run the split cells first.
y_train = df_train.pop('fuel_efficiency_mpg').values
y_val = df_val.pop('fuel_efficiency_mpg').values
y_test = df_test.pop('fuel_efficiency_mpg').values

In [14]:
# Turn each partition into a list of {column: value} records, the input format
# DictVectorizer expects.
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')
test_dicts = df_test.to_dict(orient='records')

# Fit the vectorizer on the training records only, then reuse the learned
# feature mapping for validation and test so all three matrices share the
# same columns. String-valued fields become one indicator column per
# category (e.g. "origin=USA"); numeric fields are passed through.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)
X_test = dv.transform(test_dicts)

In [15]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text

In [16]:
# Depth-1 tree (a "decision stump") used to find which single feature gives
# the best first split. scikit-learn permutes the candidate features at every
# split even with splitter='best', so the seed is pinned to make the chosen
# split reproducible. random_state=1 matches the other estimators in this
# notebook.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [17]:
# Print the fitted tree as text rules, labelling splits with the vectorizer's
# feature names instead of positional column indices.
print(
    export_text(
        dt,
        feature_names=list(dv.get_feature_names_out()),
    )
)

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [19]:
# Baseline random forest: 10 trees, fixed seed for reproducibility, and all
# available CPU cores (n_jobs=-1).
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
# Predict fuel efficiency for the validation rows with the baseline forest.
y_pred = rf.predict(X=X_val)

In [21]:
# Validation RMSE of the 10-tree baseline forest.
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

0.4595777223092726

In [22]:
# Sweep the forest size from 10 to 200 trees in steps of 10, recording the
# validation RMSE for each size as an (n_estimators, rmse) tuple.
scores = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1,
    )
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    scores.append((n, rmse))

    # Right-align the tree count in a 3-character column so the log lines up.
    print(f'{n:3} -> {rmse:.3f}')

 10 -> 0.460
 20 -> 0.454
 30 -> 0.452
 40 -> 0.449
 50 -> 0.447
 60 -> 0.445
 70 -> 0.445
 80 -> 0.445
 90 -> 0.445
100 -> 0.445
110 -> 0.444
120 -> 0.444
130 -> 0.444
140 -> 0.443
150 -> 0.443
160 -> 0.443
170 -> 0.443
180 -> 0.442
190 -> 0.442
200 -> 0.442


In [23]:
# Tabulate the sweep results: one row per forest size with its validation RMSE.
df_scores = pd.DataFrame(
    data=scores,
    columns=['n_estimators', 'rmse'],
)

In [24]:
import plotly.express as px

# Plot validation RMSE against the number of trees. The title and axis labels
# are set explicitly so the figure can be read without the surrounding code.
fig = px.scatter(
    df_scores,
    x="n_estimators",
    y="rmse",
    title="Random forest: validation RMSE vs. number of trees",
    labels={
        "n_estimators": "Number of trees (n_estimators)",
        "rmse": "Validation RMSE",
    },
)
fig.show()

In [25]:
# Grid search over tree depth and forest size. For each max_depth, sweep
# n_estimators from 10 to 200 in steps of 10 and record the validation RMSE
# as a (max_depth, n_estimators, rmse) tuple.
scores = []

for d in (10, 15, 20, 25):
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        scores.append((d, n, rmse))

        print(f"{d}, {n}, {rmse:.3f}")

10, 10, 0.450
10, 20, 0.447
10, 30, 0.445
10, 40, 0.443
10, 50, 0.442
10, 60, 0.442
10, 70, 0.441
10, 80, 0.441
10, 90, 0.442
10, 100, 0.441
10, 110, 0.441
10, 120, 0.441
10, 130, 0.441
10, 140, 0.440
10, 150, 0.440
10, 160, 0.440
10, 170, 0.440
10, 180, 0.440
10, 190, 0.440
10, 200, 0.440
15, 10, 0.458
15, 20, 0.453
15, 30, 0.451
15, 40, 0.449
15, 50, 0.446
15, 60, 0.445
15, 70, 0.445
15, 80, 0.445
15, 90, 0.445
15, 100, 0.444
15, 110, 0.443
15, 120, 0.444
15, 130, 0.444
15, 140, 0.443
15, 150, 0.443
15, 160, 0.443
15, 170, 0.443
15, 180, 0.442
15, 190, 0.442
15, 200, 0.442
20, 10, 0.459
20, 20, 0.454
20, 30, 0.452
20, 40, 0.449
20, 50, 0.447
20, 60, 0.446
20, 70, 0.445
20, 80, 0.446
20, 90, 0.446
20, 100, 0.445
20, 110, 0.444
20, 120, 0.444
20, 130, 0.444
20, 140, 0.444
20, 150, 0.443
20, 160, 0.443
20, 170, 0.443
20, 180, 0.443
20, 190, 0.443
20, 200, 0.443
25, 10, 0.459
25, 20, 0.454
25, 30, 0.452
25, 40, 0.449
25, 50, 0.447
25, 60, 0.446
25, 70, 0.445
25, 80, 0.445
25, 90, 0.445
2

In [26]:
# Tabulate the grid-search results, one row per (max_depth, n_estimators) pair.
columns = ['max_depth', 'n_estimators', 'rmse']
df_scores = pd.DataFrame(data=scores, columns=columns)

In [27]:
# Average validation RMSE over all forest sizes for each max_depth; the lowest
# mean identifies the best-performing depth.
df_scores.groupby(by='max_depth')['rmse'].mean()

max_depth
10    0.441808
15    0.445417
20    0.446253
25    0.445910
Name: rmse, dtype: float64

In [28]:
# Separate forest (10 trees, depth capped at 20) fitted only so its feature
# importances can be inspected.
rf2 = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
)
rf2.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
# Per-feature importance scores from the fitted forest, one entry per column
# of X_train (i.e. in the same order as dv.get_feature_names_out()).
importances = rf2.feature_importances_
importances

array([1.14797006e-02, 3.57108549e-04, 3.45384113e-04, 3.27279191e-03,
       3.25424323e-04, 3.60383601e-04, 1.59978977e-02, 3.21230009e-03,
       2.34334695e-03, 1.63498954e-03, 4.62246496e-04, 5.18739639e-04,
       5.39721689e-04, 9.59149965e-01])

In [30]:
# Column names of the vectorized matrix, used to label the importance scores.
feature_names = list(dv.get_feature_names_out())
feature_names

['acceleration',
 'drivetrain=All-wheel drive',
 'drivetrain=Front-wheel drive',
 'engine_displacement',
 'fuel_type=Diesel',
 'fuel_type=Gasoline',
 'horsepower',
 'model_year',
 'num_cylinders',
 'num_doors',
 'origin=Asia',
 'origin=Europe',
 'origin=USA',
 'vehicle_weight']

In [31]:
# Pair each feature name with its importance score and rank them from most to
# least important.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

print(feature_importance_df)

                         Feature  Importance
13                vehicle_weight    0.959150
6                     horsepower    0.015998
0                   acceleration    0.011480
3            engine_displacement    0.003273
7                     model_year    0.003212
8                  num_cylinders    0.002343
9                      num_doors    0.001635
12                    origin=USA    0.000540
11                 origin=Europe    0.000519
10                   origin=Asia    0.000462
5             fuel_type=Gasoline    0.000360
1     drivetrain=All-wheel drive    0.000357
2   drivetrain=Front-wheel drive    0.000345
4               fuel_type=Diesel    0.000325


In [32]:
import xgboost as xgb

In [33]:
# Wrap the training and validation matrices in XGBoost's DMatrix container,
# attaching the targets and the vectorizer's column names.
features = list(dv.get_feature_names_out())

dtrain = xgb.DMatrix(
    X_train,
    label=y_train,
    feature_names=features,
)
dval = xgb.DMatrix(
    X_val,
    label=y_val,
    feature_names=features,
)

In [34]:
# Datasets XGBoost evaluates while boosting, each paired with the name that
# prefixes its metric in the training log (e.g. "train-rmse", "val-rmse").
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [35]:
# XGBoost configuration for the first run (eta = 0.3).
xgb_params = {
    # Tree-booster settings.
    'eta': 0.3,               # learning rate (step-size shrinkage per round)
    'max_depth': 6,           # maximum depth of each tree
    'min_child_weight': 1,

    # Learning task: squared-error regression, evaluated with RMSE.
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'nthread': 8,             # hard-coded thread count; adjust to the machine

    # Reproducibility and logging.
    'seed': 1,
    'verbosity': 1,
}

In [36]:
# Train for 100 boosting rounds with the current xgb_params, logging the RMSE
# of every watchlist dataset every 5 rounds.
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=5,
)

[0]	train-rmse:1.81393	val-rmse:1.85444
[5]	train-rmse:0.51381	val-rmse:0.55664
[10]	train-rmse:0.37115	val-rmse:0.43896
[15]	train-rmse:0.34666	val-rmse:0.43362
[20]	train-rmse:0.33553	val-rmse:0.43376
[25]	train-rmse:0.32268	val-rmse:0.43683
[30]	train-rmse:0.31475	val-rmse:0.43752
[35]	train-rmse:0.30960	val-rmse:0.43784
[40]	train-rmse:0.30202	val-rmse:0.43968
[45]	train-rmse:0.29126	val-rmse:0.44024
[50]	train-rmse:0.28456	val-rmse:0.44140
[55]	train-rmse:0.27618	val-rmse:0.44225
[60]	train-rmse:0.26768	val-rmse:0.44290
[65]	train-rmse:0.26174	val-rmse:0.44352
[70]	train-rmse:0.25489	val-rmse:0.44531
[75]	train-rmse:0.24792	val-rmse:0.44628
[80]	train-rmse:0.24254	val-rmse:0.44689
[85]	train-rmse:0.23644	val-rmse:0.44749
[90]	train-rmse:0.23193	val-rmse:0.44839
[95]	train-rmse:0.22475	val-rmse:0.44904
[99]	train-rmse:0.21950	val-rmse:0.45018


In [37]:
# XGBoost configuration for the second run: identical to the first except for
# a smaller learning rate (eta = 0.1), so the two runs can be compared.
xgb_params = {
    # Tree-booster settings.
    'eta': 0.1,               # learning rate (step-size shrinkage per round)
    'max_depth': 6,           # maximum depth of each tree
    'min_child_weight': 1,

    # Learning task: squared-error regression, evaluated with RMSE.
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'nthread': 8,             # hard-coded thread count; adjust to the machine

    # Reproducibility and logging.
    'seed': 1,
    'verbosity': 1,
}

In [38]:

# Retrain for 100 boosting rounds with the updated xgb_params, logging the
# RMSE of every watchlist dataset every 5 rounds. This replaces the `model`
# from the previous run.
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=5,
)

[0]	train-rmse:2.28944	val-rmse:2.34561
[5]	train-rmse:1.41247	val-rmse:1.44988
[10]	train-rmse:0.91008	val-rmse:0.94062
[15]	train-rmse:0.63402	val-rmse:0.66672
[20]	train-rmse:0.48983	val-rmse:0.53064
[25]	train-rmse:0.41881	val-rmse:0.46891
[30]	train-rmse:0.38342	val-rmse:0.44289
[35]	train-rmse:0.36435	val-rmse:0.43250
[40]	train-rmse:0.35343	val-rmse:0.42746
[45]	train-rmse:0.34621	val-rmse:0.42595
[50]	train-rmse:0.33998	val-rmse:0.42498
[55]	train-rmse:0.33480	val-rmse:0.42449
[60]	train-rmse:0.33054	val-rmse:0.42456
[65]	train-rmse:0.32602	val-rmse:0.42493
[70]	train-rmse:0.32202	val-rmse:0.42503
[75]	train-rmse:0.31895	val-rmse:0.42526
[80]	train-rmse:0.31667	val-rmse:0.42563
[85]	train-rmse:0.31440	val-rmse:0.42574
[90]	train-rmse:0.31059	val-rmse:0.42586
[95]	train-rmse:0.30625	val-rmse:0.42611
[99]	train-rmse:0.30419	val-rmse:0.42623
