6. Decision Trees and Ensemble Learning

6.1. Clean up and split data

In [2]:
     
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

url='https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(url)

df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head(5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [3]:
df.dtypes

engine_displacement      int64
num_cylinders          float64
horsepower             float64
vehicle_weight         float64
acceleration           float64
model_year               int64
origin                  object
fuel_type               object
drivetrain              object
num_doors              float64
fuel_efficiency_mpg    float64
dtype: object

In [4]:
df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [5]:
df.columns= df.columns.str.lower().str.replace(' ', '_')
numerical_columns = list(df.dtypes[df.dtypes == 'float64'].index)

for column in numerical_columns:
    df.fillna({column: 0}, inplace=True)
df.isna().sum()

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train,test_size=0.25, random_state=1) #20% of the orignal df
df_train= df_train.reset_index(drop= True)
df_val= df_val.reset_index(drop= True)
df_test= df_test.reset_index(drop= True)

In [7]:
y_train=df_train.fuel_efficiency_mpg.values
y_val=df_val.fuel_efficiency_mpg.values
y_test=df_test.fuel_efficiency_mpg.values
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [8]:
from sklearn.feature_extraction import DictVectorizer
dv= DictVectorizer(sparse=True)

In [9]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

question 1.

In [10]:
train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [12]:
feature_names = dv.get_feature_names_out()
split_feature_index = dt.feature_importances_.argmax()
split_feature_name = feature_names[split_feature_index]

print(f"Feature used for splitting the data: {split_feature_name}")


Feature used for splitting the data: vehicle_weight


question 2.

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)

# Step 2: Train the model
rf.fit(X_train, y_train)

# Step 3: Make predictions on the validation set
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_pred = rf.predict(X_val)

# Step 4: Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

# Step 5: Print the RMSE value
print(f"RMSE on validation data: {rmse:.4f}")

RMSE on validation data: 0.4596


question 3.

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

for n in range(10, 200, 10):

    rf = RandomForestRegressor(n_estimators=n, random_state=1)

    # Step 2: Train the model
    rf.fit(X_train, y_train)

    # Step 3: Make predictions on the validation set
    val_dicts = df_val.fillna(0).to_dict(orient='records')
    X_val = dv.transform(val_dicts)
    y_pred = rf.predict(X_val)

    # Step 4: Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    # Step 5: Print the RMSE value
    print(f"RMSE on validation data: {rmse:.3f} if n_estimator is {n}")

RMSE on validation data: 0.460 if n_estimator is 10
RMSE on validation data: 0.454 if n_estimator is 20
RMSE on validation data: 0.452 if n_estimator is 30
RMSE on validation data: 0.449 if n_estimator is 40
RMSE on validation data: 0.447 if n_estimator is 50
RMSE on validation data: 0.445 if n_estimator is 60
RMSE on validation data: 0.445 if n_estimator is 70
RMSE on validation data: 0.445 if n_estimator is 80
RMSE on validation data: 0.445 if n_estimator is 90
RMSE on validation data: 0.445 if n_estimator is 100
RMSE on validation data: 0.444 if n_estimator is 110
RMSE on validation data: 0.444 if n_estimator is 120
RMSE on validation data: 0.444 if n_estimator is 130
RMSE on validation data: 0.443 if n_estimator is 140
RMSE on validation data: 0.443 if n_estimator is 150
RMSE on validation data: 0.443 if n_estimator is 160
RMSE on validation data: 0.443 if n_estimator is 170
RMSE on validation data: 0.442 if n_estimator is 180
RMSE on validation data: 0.442 if n_estimator is 190


Question 4.

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
for depth in [10, 15, 20, 25]:
    for n in range(10, 200, 10):

        rf = RandomForestRegressor(n_estimators=n, random_state=1, max_depth=depth)

        # Step 2: Train the model
        rf.fit(X_train, y_train)

        # Step 3: Make predictions on the validation set
        val_dicts = df_val.fillna(0).to_dict(orient='records')
        X_val = dv.transform(val_dicts)
        y_pred = rf.predict(X_val)

        # Step 4: Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        # Step 5: Print the RMSE value
        print(f"RMSE on validation data: {rmse:.3f} if n_estimator is {n} and max_depth is {depth}")

RMSE on validation data: 0.450 if n_estimator is 10 and max_depth is 10
RMSE on validation data: 0.447 if n_estimator is 20 and max_depth is 10
RMSE on validation data: 0.445 if n_estimator is 30 and max_depth is 10
RMSE on validation data: 0.443 if n_estimator is 40 and max_depth is 10
RMSE on validation data: 0.442 if n_estimator is 50 and max_depth is 10
RMSE on validation data: 0.442 if n_estimator is 60 and max_depth is 10
RMSE on validation data: 0.441 if n_estimator is 70 and max_depth is 10
RMSE on validation data: 0.441 if n_estimator is 80 and max_depth is 10
RMSE on validation data: 0.442 if n_estimator is 90 and max_depth is 10
RMSE on validation data: 0.441 if n_estimator is 100 and max_depth is 10
RMSE on validation data: 0.441 if n_estimator is 110 and max_depth is 10
RMSE on validation data: 0.441 if n_estimator is 120 and max_depth is 10
RMSE on validation data: 0.441 if n_estimator is 130 and max_depth is 10
RMSE on validation data: 0.440 if n_estimator is 140 and max

Question 5.

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1, max_depth=20)

# Step 2: Train the model
rf.fit(X_train, y_train)

# Step 3: Make predictions on the validation set
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_pred = rf.predict(X_val)

feature_names = dv.get_feature_names_out()
split_feature_index = dt.feature_importances_.argmax()
split_feature_name = feature_names[split_feature_index]

print(f"Feature used for splitting the data: {split_feature_name}")

Feature used for splitting the data: vehicle_weight


question 6.

In [23]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Define a range of eta values to try
eta_values = [0.1, 0.3]

best_eta = None
best_rmse = float('inf')  # Start with a very high RMSE
best_model = None

# Loop over each eta value
for eta in eta_values:
    print(f"Training with eta = {eta}")
    
    # Define the parameters
    xgb_params = {
        'eta': eta,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 0,
    }
    
    # Create DMatrix for train and validation data
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    # Create watchlist for monitoring
    watchlist = [(dtrain, 'train'), (dval, 'eval')]
    
    # Train the model for 100 rounds
    bst = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)
    
    # Get predictions for the validation set
    y_pred = bst.predict(dval)
    
    # Calculate RMSE on the validation set
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    # Check if this RMSE is the best so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_eta = eta
        best_model = bst
    
    print(f"RMSE for eta={eta}: {rmse:.4f}")
    
# Output the best eta and corresponding RMSE
print(f"\nBest eta: {best_eta} with RMSE: {best_rmse:.4f}")


Training with eta = 0.1
[0]	train-rmse:2.28944	eval-rmse:2.34561
[1]	train-rmse:2.07396	eval-rmse:2.12434
[2]	train-rmse:1.88066	eval-rmse:1.92597
[3]	train-rmse:1.70730	eval-rmse:1.74987
[4]	train-rmse:1.55163	eval-rmse:1.59059
[5]	train-rmse:1.41247	eval-rmse:1.44988
[6]	train-rmse:1.28796	eval-rmse:1.32329
[7]	train-rmse:1.17660	eval-rmse:1.20930
[8]	train-rmse:1.07736	eval-rmse:1.10830
[9]	train-rmse:0.98883	eval-rmse:1.02009
[10]	train-rmse:0.91008	eval-rmse:0.94062
[11]	train-rmse:0.84030	eval-rmse:0.87100
[12]	train-rmse:0.77874	eval-rmse:0.80916


[13]	train-rmse:0.72417	eval-rmse:0.75465
[14]	train-rmse:0.67626	eval-rmse:0.70780
[15]	train-rmse:0.63402	eval-rmse:0.66672
[16]	train-rmse:0.59690	eval-rmse:0.63062
[17]	train-rmse:0.56447	eval-rmse:0.60016
[18]	train-rmse:0.53619	eval-rmse:0.57383
[19]	train-rmse:0.51138	eval-rmse:0.55044
[20]	train-rmse:0.48983	eval-rmse:0.53064
[21]	train-rmse:0.47135	eval-rmse:0.51451
[22]	train-rmse:0.45501	eval-rmse:0.49998
[23]	train-rmse:0.44120	eval-rmse:0.48790
[24]	train-rmse:0.42929	eval-rmse:0.47773
[25]	train-rmse:0.41881	eval-rmse:0.46891
[26]	train-rmse:0.40953	eval-rmse:0.46151
[27]	train-rmse:0.40173	eval-rmse:0.45551
[28]	train-rmse:0.39470	eval-rmse:0.45043
[29]	train-rmse:0.38873	eval-rmse:0.44621
[30]	train-rmse:0.38342	eval-rmse:0.44289
[31]	train-rmse:0.37876	eval-rmse:0.43989
[32]	train-rmse:0.37450	eval-rmse:0.43754
[33]	train-rmse:0.37073	eval-rmse:0.43553
[34]	train-rmse:0.36743	eval-rmse:0.43390
[35]	train-rmse:0.36435	eval-rmse:0.43250
[36]	train-rmse:0.36178	eval-rmse: