In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [5]:
# Read the car fuel-efficiency CSV directly from its GitHub URL into a DataFrame.
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(url)

# Show the first five rows as a quick check of the parsed columns.
df.head(n=5)


Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [6]:

# Replace every missing value in the frame with 0.
df = df.fillna(0)

# Three-way split with a fixed seed: 20% is held out for test, then 25% of the
# remaining 80% (i.e. 20% of all rows) becomes validation, leaving 60% for training.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Renumber the rows of each partition from 0, discarding the shuffled index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Pull the target out of each partition: pop() returns the column and removes
# it from the frame in one step, so the target cannot leak into the features.
y_train, y_val, y_test = (
    part.pop('fuel_efficiency_mpg').values for part in (df_train, df_val, df_test)
)

# One {column: value} dict per row, the input format DictVectorizer expects.
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')
test_dicts = df_test.to_dict(orient='records')

# Learn the feature mapping from the training rows only, then apply that same
# mapping to the validation and test rows.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
X_val, X_test = (dv.transform(rows) for rows in (val_dicts, test_dicts))

In [7]:
# Fit a regression tree limited to a single split (max_depth=1).
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

# tree_.feature holds, per node, the index of the column that node splits on;
# node 0 is the root. The index is mapped back to a name via the vectorizer.
feature_names = dv.get_feature_names_out()
tree_feature = dt.tree_.feature[0]

# If the root has the same id for both children it is a leaf, i.e. the tree
# made no split at all. In that case tree_.feature[0] is not a real column
# index, and indexing feature_names with it would silently print an unrelated
# feature, so report the situation instead.
root_is_leaf = dt.tree_.children_left[0] == dt.tree_.children_right[0]
if root_is_leaf:
    print("Tree did not split: the root node is a leaf")
else:
    print(f"Feature used for splitting: {feature_names[tree_feature]}")

Feature used for splitting: vehicle_weight


In [8]:
# Baseline forest: 10 trees with a fixed seed, fitted on the training split.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Score on the validation split; RMSE is the square root of the mean squared error.
y_pred = rf.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.3f}")

RMSE: 0.460


In [9]:
# Sweep the number of trees from 10 to 200 in steps of 10, refitting a forest
# for each value and recording its validation RMSE as an (n_estimators, rmse) pair.
scores = []
tree_counts = range(10, 201, 10)
for n in tree_counts:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1).fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores += [(n, rmse)]
    print(f"n_estimators: {n}, RMSE: {rmse:.3f}")

# Walk consecutive (n_estimators, rmse) entries of `scores` and report the first
# step whose RMSE gain over the previous step is below MIN_IMPROVEMENT.
# The gain is signed (previous - current): a step where RMSE gets *worse* is
# also "not improving", which an absolute difference would fail to flag when
# the increase is larger than the threshold.
MIN_IMPROVEMENT = 0.001
for (_, prev_rmse), (n_trees, curr_rmse) in zip(scores, scores[1:]):
    if prev_rmse - curr_rmse < MIN_IMPROVEMENT:
        print(f"Stops improving at n_estimators: {n_trees}")
        break
else:
    # No step fell below the threshold (also reached when `scores` has fewer
    # than two entries); say so rather than printing nothing.
    print(f"RMSE improved by at least {MIN_IMPROVEMENT} at every step")

n_estimators: 10, RMSE: 0.460
n_estimators: 20, RMSE: 0.454
n_estimators: 30, RMSE: 0.452
n_estimators: 40, RMSE: 0.449
n_estimators: 50, RMSE: 0.447
n_estimators: 60, RMSE: 0.445
n_estimators: 70, RMSE: 0.445
n_estimators: 80, RMSE: 0.445
n_estimators: 90, RMSE: 0.445
n_estimators: 100, RMSE: 0.445
n_estimators: 110, RMSE: 0.444
n_estimators: 120, RMSE: 0.444
n_estimators: 130, RMSE: 0.444
n_estimators: 140, RMSE: 0.443
n_estimators: 150, RMSE: 0.443
n_estimators: 160, RMSE: 0.443
n_estimators: 170, RMSE: 0.443
n_estimators: 180, RMSE: 0.442
n_estimators: 190, RMSE: 0.442
n_estimators: 200, RMSE: 0.442
Stops improving at n_estimators: 70


In [10]:
# For each candidate max_depth, average the validation RMSE over forests of
# 10, 20, ..., 200 trees, and remember the depth with the lowest average.
best_score = float('inf')
best_depth = None

for depth in (10, 15, 20, 25):
    scores_depth = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=depth,
            random_state=1,
            n_jobs=-1,
        ).fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        scores_depth += [rmse]

    mean_rmse = np.mean(scores_depth)
    print(f"max_depth: {depth}, mean RMSE: {mean_rmse:.3f}")

    # Strict "<" means that on a tie the depth seen first is kept.
    if mean_rmse < best_score:
        best_score, best_depth = mean_rmse, depth

print(f"Best max_depth: {best_depth}")

max_depth: 10, mean RMSE: 0.442
max_depth: 15, mean RMSE: 0.445
max_depth: 20, mean RMSE: 0.446
max_depth: 25, mean RMSE: 0.446
Best max_depth: 10


In [11]:
# Forest used to inspect feature importance: 10 trees, depth capped at 20.
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# One importance value per input column; the names come from the same
# DictVectorizer that produced X_train, so the two are paired by position.
feature_importances = rf.feature_importances_
feature_names = dv.get_feature_names_out()

# (feature, importance) pairs ordered from most to least important.
feature_importance_pairs = sorted(
    zip(feature_names, feature_importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Display the ten most important features.
for feature, importance in feature_importance_pairs[:10]:
    print(f"{feature}: {importance:.4f}")

# Report the specific features asked about, still in descending-importance order.
# Names are matched exactly: a substring test would also pick up any other
# feature whose name merely contains one of these, and would print a feature
# once per matching target.
target_features = ['vehicle_weight', 'horsepower', 'acceleration', 'engine_displacement']
wanted = set(target_features)
for feature, importance in feature_importance_pairs:
    if feature in wanted:
        print(f"{feature}: {importance:.4f}")

vehicle_weight: 0.9591
horsepower: 0.0160
acceleration: 0.0115
engine_displacement: 0.0033
model_year: 0.0032
num_cylinders: 0.0023
num_doors: 0.0016
origin=USA: 0.0005
origin=Europe: 0.0005
origin=Asia: 0.0005
vehicle_weight: 0.9591
horsepower: 0.0160
acceleration: 0.0115
engine_displacement: 0.0033


In [14]:
# DMatrix wants the feature names as a plain list.
feature_names = list(dv.get_feature_names_out())

# Wrap the training and validation matrices (with their targets) for XGBoost.
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=feature_names)

watchlist = [(dtrain, 'train'), (dval, 'val')]

# Settings shared by both runs; only the learning rate (eta) differs between them.
XGB_SHARED_PARAMS = {
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}


def fit_and_score_xgb(eta):
    """Train a 100-round booster with the given eta and score it on validation.

    Returns a tuple (params, model, y_pred, rmse): the full parameter dict that
    was used, the trained booster, its predictions on `dval`, and the RMSE of
    those predictions against `y_val`.
    """
    params = {'eta': eta, **XGB_SHARED_PARAMS}
    model = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)
    y_pred = model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return params, model, y_pred, rmse


# Run eta=0.3 first, then eta=0.1.
xgb_params_03, model_03, y_pred_03, rmse_03 = fit_and_score_xgb(0.3)
xgb_params_01, model_01, y_pred_01, rmse_01 = fit_and_score_xgb(0.1)

# Report both validation scores side by side.
print(f"eta=0.3 RMSE: {rmse_03:.3f}")
print(f"eta=0.1 RMSE: {rmse_01:.3f}")

# eta=0.3 wins only when strictly better; a tie goes to eta=0.1.
if rmse_03 < rmse_01:
    print("Best eta: 0.3")
else:
    print("Best eta: 0.1")


eta=0.3 RMSE: 0.450
eta=0.1 RMSE: 0.426
Best eta: 0.1
