In [83]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [84]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')
df

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [85]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
numerical = list(df.dtypes[df.dtypes != 'object'].index)

for c in categorical_columns:
    df[c] = df[c].fillna('NA')
for n in numerical:
    df[n] = df[n].fillna('0.0')

df.isnull().sum()

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [86]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

In [87]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

y_train = df_train['fuel_efficiency_mpg'].astype(float).values
y_val = df_val['fuel_efficiency_mpg'].astype(float).values
y_test = df_test['fuel_efficiency_mpg'].astype(float).values

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']


In [88]:
dv = DictVectorizer(sparse=True)
train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [89]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

print(export_text(dt, feature_names=dv.get_feature_names_out()))

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [90]:
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1
)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
rmse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(rmse)
print("Validation RMSE:", round(rmse, 3))

Validation RMSE: 0.46


In [91]:
rmse_scores = {}

for n in range(10, 201, 10):  # 10 → 200, step 10
    rf = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1
    )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores[n] = rmse
    print(f"n_estimators={n:3d} --> RMSE: {rmse:.3f}")

best_n = min(rmse_scores, key=rmse_scores.get)
print("\nLowest RMSE:", round(rmse_scores[best_n], 3), "at n_estimators =", best_n)

n_estimators= 10 --> RMSE: 0.460
n_estimators= 20 --> RMSE: 0.453
n_estimators= 30 --> RMSE: 0.451
n_estimators= 40 --> RMSE: 0.448
n_estimators= 50 --> RMSE: 0.446
n_estimators= 60 --> RMSE: 0.446
n_estimators= 70 --> RMSE: 0.445
n_estimators= 80 --> RMSE: 0.445
n_estimators= 90 --> RMSE: 0.445
n_estimators=100 --> RMSE: 0.445
n_estimators=110 --> RMSE: 0.444
n_estimators=120 --> RMSE: 0.444
n_estimators=130 --> RMSE: 0.444
n_estimators=140 --> RMSE: 0.444
n_estimators=150 --> RMSE: 0.443
n_estimators=160 --> RMSE: 0.443
n_estimators=170 --> RMSE: 0.443
n_estimators=180 --> RMSE: 0.443
n_estimators=190 --> RMSE: 0.443
n_estimators=200 --> RMSE: 0.443

Lowest RMSE: 0.443 at n_estimators = 180


In [92]:
depth_values = [10, 15, 20, 25]
rmse_summary = {}

for depth in depth_values:
    rmse_list = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=depth,
            random_state=1,
            n_jobs=-1
        )
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)
    mean_rmse = np.mean(rmse_list)
    rmse_summary[depth] = mean_rmse
    print(f"max_depth={depth:2d} --> mean RMSE: {mean_rmse:.3f}")

best_depth = min(rmse_summary, key=rmse_summary.get)
print("\nBest max_depth:", best_depth, "with mean RMSE =", round(rmse_summary[best_depth], 3))

max_depth=10 --> mean RMSE: 0.442
max_depth=15 --> mean RMSE: 0.445
max_depth=20 --> mean RMSE: 0.445
max_depth=25 --> mean RMSE: 0.445

Best max_depth: 10 with mean RMSE = 0.442


In [93]:
importances = rf.feature_importances_

feature_names = dv.get_feature_names_out()
feat_imp = list(zip(feature_names, importances))

feat_imp = sorted(feat_imp, key=lambda x: x[1], reverse=True)
for f, imp in feat_imp[:10]:
    print(f"{f}: {imp:.3f}")

vehicle_weight: 0.959
horsepower: 0.015
acceleration: 0.011
model_year: 0.003
engine_displacement: 0.003
num_cylinders: 0.002
num_doors: 0.002
origin=USA: 0.001
origin=Europe: 0.000
acceleration=0.0: 0.000


In [95]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

In [96]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [97]:
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model_03 = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=False
)

y_pred = model_03.predict(dval)
rmse_03 = np.sqrt(mean_squared_error(y_val, y_pred))
print("RMSE (eta=0.3):", round(rmse_03, 3))

RMSE (eta=0.3): 0.449


In [98]:
xgb_params['eta'] = 0.1

model_01 = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=False
)

y_pred = model_01.predict(dval)
rmse_01 = np.sqrt(mean_squared_error(y_val, y_pred))
print("RMSE (eta=0.1):", round(rmse_01, 3))

RMSE (eta=0.1): 0.427
