In [1]:
# Decision Trees and Ensemble Learning

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
# Load the car fuel-efficiency CSV from GitHub into a DataFrame.
url = (
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/"
    "master/car_fuel_efficiency.csv"
)
df = pd.read_csv(filepath_or_buffer=url)
# Leave the preview as the last expression so the notebook renders it.
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [4]:
# Prepare the dataset: replace every missing value with 0.
df = df.fillna(value=0)

In [5]:
# Separate the `fuel_efficiency_mpg` target column from the feature columns.
target_column = 'fuel_efficiency_mpg'
X = df.drop(columns=[target_column])
y = df[target_column]

In [6]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed: hold out 20% of the rows as the test set,
# then take 25% of what remains as validation (roughly 60/20/20 overall).
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.25, random_state=1
)

In [8]:
from sklearn.feature_extraction import DictVectorizer

# Turn each row into a {column: value} record so DictVectorizer can encode it.
train_records = X_train.to_dict(orient='records')
val_records = X_val.to_dict(orient='records')

# Fit the vectorizer on the training records only, then reuse the fitted
# vocabulary for validation. Despite the `_dict` suffix, these variables hold
# the vectorized (sparse) feature matrices, not dictionaries.
dv = DictVectorizer(sparse=True)
X_train_dict = dv.fit_transform(train_records)
X_val_dict = dv.transform(val_records)

In [9]:
# Q1: fit a depth-1 decision tree and report which feature its root splits on
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train_dict, y_train)

# Node 0 of the fitted tree is the root; map its feature index back to the
# column name recorded by the vectorizer.
root_feature_index = dt.tree_.feature[0]
root_feature_name = dv.feature_names_[root_feature_index]
print(root_feature_name)

vehicle_weight


In [15]:
# Q2: validation RMSE of a 10-tree random forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train_dict, y_train)
y_pred = rf.predict(X_val_dict)

# RMSE is taken as the square root of the MSE.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(rmse)

0.45957772230927263


In [16]:
# Q3: track validation RMSE while the forest grows from 10 to 200 trees (step 10)
for n_trees in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n_trees, random_state=1, n_jobs=-1)
    rf.fit(X_train_dict, y_train)
    y_pred = rf.predict(X_val_dict)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(n_trees, rmse)

10 0.4595777223092726
20 0.4535906725124706
30 0.4516867257545712
40 0.4487208301736997
50 0.4466568972416094
60 0.445459702608112
70 0.4451263244986996
80 0.4449843119777284
90 0.4448614906399875
100 0.4446518680868042
110 0.4435787643986023
120 0.4439118681233817
130 0.443702590396687
140 0.4433549955101689
150 0.44289761494219454
160 0.4427612219659299
170 0.44280146504730905
180 0.44236195357041347
190 0.44249397112206923
200 0.4424785084688597


In [26]:
# Q4: choose max_depth by its mean validation RMSE across all forest sizes
max_depths = [10, 15, 20, 25]
n_estimators_range = range(10, 201, 10)
best_depth, best_rmse = None, float("inf")

for depth in max_depths:
    rmses = []
    for n_trees in n_estimators_range:
        rf = RandomForestRegressor(
            max_depth=depth, n_estimators=n_trees, random_state=1, n_jobs=-1
        )
        rf.fit(X_train_dict, y_train)
        y_pred = rf.predict(X_val_dict)
        rmses.append(np.sqrt(mean_squared_error(y_val, y_pred)))

    # One score per depth: the average RMSE over every n_estimators value.
    mean_rmse = np.mean(rmses)
    print(depth, mean_rmse)

    # Strict '<' means the earliest depth in the list wins a tie.
    if mean_rmse < best_rmse:
        best_depth, best_rmse = depth, mean_rmse

print("Best max_depth:", best_depth)

10 0.4418078609323356
15 0.4454166445638107
20 0.44625292424422536
25 0.44590993626161624
Best max_depth: 10


In [19]:
# Q5: feature importances from a 10-tree forest capped at depth 20
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train_dict, y_train)

# The importances are paired with the vectorizer's feature names by position.
feature_names = dv.feature_names_
importances = rf.feature_importances_
for name_and_score in zip(feature_names, importances):
    print(*name_and_score)

# The feature with the largest importance score.
top_index = np.argmax(importances)
best_feature = feature_names[top_index]
print("Most important feature:", best_feature)

acceleration 0.01147970063142936
drivetrain=All-wheel drive 0.0003571085493021933
drivetrain=Front-wheel drive 0.0003453841126318372
engine_displacement 0.003272791913609506
fuel_type=Diesel 0.00032542432286973745
fuel_type=Gasoline 0.00036038360069172833
horsepower 0.015997897714266237
model_year 0.0032123000947946716
num_cylinders 0.0023433469524512004
num_doors 0.001634989543930702
origin=Asia 0.0004622464955097426
origin=Europe 0.000518739638586969
origin=USA 0.0005397216891829172
vehicle_weight 0.9591499647407432
Most important feature: vehicle_weight


In [23]:
uv add xgboost

Note: you may need to restart the kernel to use updated packages.


C:\Users\sudwa\Desktop\ML-Zoomcamp\.venv\Scripts\python.exe: No module named uv


In [25]:
# Q6: compare validation RMSE of XGBoost for two values of eta

import xgboost as xgb

# Wrap the vectorized train/validation matrices and their targets as DMatrix.
dtrain = xgb.DMatrix(X_train_dict, label=y_train)
dval = xgb.DMatrix(X_val_dict, label=y_val)
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Settings shared by every run; only eta changes between runs.
base_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

for eta in [0.3, 0.1]:
    xgb_params = {'eta': eta, **base_params}
    # 100 boosting rounds per run; verbose_eval=False silences per-round logs.
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=100,
        evals=watchlist,
        verbose_eval=False,
    )
    y_pred = model.predict(dval)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    print(f"eta={eta}, RMSE={rmse:.4f}")

eta=0.3, RMSE=0.4502
eta=0.1, RMSE=0.4262
