In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-11-03 19:02:36--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-11-03 19:02:37 (2.65 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [3]:
import pandas as pd

# Name of the column the models are trained to predict.
TARGET_COLUMN = 'fuel_efficiency_mpg'

# Read the CSV and replace every missing value with 0 in a single chain.
df = pd.read_csv('car_fuel_efficiency.csv').fillna(0)

# Target vector.
y = df[TARGET_COLUMN]

# Feature frame: every column except the target.
X = df.drop(columns=[TARGET_COLUMN])


In [4]:
from sklearn.model_selection import train_test_split

# Stage 1: keep 60% of the rows for training and hold back the remaining 40%.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.4,
)

# Stage 2: cut the held-back rows in half -> validation and test,
# each 20% of the full dataset.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    random_state=1,
    test_size=0.5,
)


In [5]:
from sklearn.feature_extraction import DictVectorizer

def _to_records(frame):
    """Return the rows of ``frame`` as a list of ``{column: value}`` dicts."""
    return frame.to_dict(orient='records')


dv = DictVectorizer(sparse=True)

# The vectorizer learns its feature mapping from the training rows only.
X_train_dict = _to_records(X_train)
X_train_matrix = dv.fit_transform(X_train_dict)

# Validation and test rows are encoded with the already-fitted mapping.
X_val_matrix, X_test_matrix = (
    dv.transform(_to_records(part)) for part in (X_val, X_test)
)


In [6]:
from sklearn.tree import DecisionTreeRegressor

# A depth-1 tree (a single split) fitted on the training matrix.
dt = DecisionTreeRegressor(random_state=1, max_depth=1)
dt.fit(X_train_matrix, y_train)

# Node 0 is the root; look up which vectorizer column it splits on.
root_feature_index = dt.tree_.feature[0]
vectorizer_columns = dv.get_feature_names_out()
first_split_feature = vectorizer_columns[root_feature_index]
first_split_feature


'vehicle_weight'

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline forest: 10 trees, fixed seed, all available cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train_matrix, y_train)

# Score the forest on the validation split.
y_pred = rf.predict(X_val_matrix)
val_mse = mean_squared_error(y_val, y_pred)

# RMSE is the square root of the mean squared error.
rmse = np.sqrt(val_mse)
rmse

np.float64(0.4602815367032659)

In [8]:
# Validation RMSE for forests of 10, 20, ..., 200 trees, as (n_estimators, rmse).
scores = []

# One forest is grown incrementally instead of refitting from scratch for
# every size: with warm_start=True each fit() keeps the existing trees and
# only adds the missing ones, so the sweep trains 200 trees in total rather
# than 10 + 20 + ... + 200 = 2,100.
# NOTE(review): with a fixed integer random_state the added trees are expected
# to match those of a from-scratch fit, so the scores should agree with the
# previous output up to floating-point rounding -- confirm on first run.
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
    warm_start=True,
)

for n in range(10, 201, 10):
    rf.set_params(n_estimators=n)
    rf.fit(X_train_matrix, y_train)
    y_pred = rf.predict(X_val_matrix)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append((n, rmse))

scores

[(10, np.float64(0.4602815367032658)),
 (20, np.float64(0.44615674589110027)),
 (30, np.float64(0.4397780761280069)),
 (40, np.float64(0.4383939265191818)),
 (50, np.float64(0.4371703249467453)),
 (60, np.float64(0.4355914081920472)),
 (70, np.float64(0.43611238591302576)),
 (80, np.float64(0.43605455887808786)),
 (90, np.float64(0.43541008234407647)),
 (100, np.float64(0.4352773655478666)),
 (110, np.float64(0.434896815770466)),
 (120, np.float64(0.43546652508605704)),
 (130, np.float64(0.43492336206666454)),
 (140, np.float64(0.4351068229164201)),
 (150, np.float64(0.4351910645153306)),
 (160, np.float64(0.43523690427566636)),
 (170, np.float64(0.43520773900215154)),
 (180, np.float64(0.43524040934995967)),
 (190, np.float64(0.4353979933811757)),
 (200, np.float64(0.4350031248889441))]

In [9]:
# For each max_depth, average the validation RMSE over forests of
# 10, 20, ..., 200 trees. `results` maps max_depth -> mean RMSE.
max_depth_values = [10, 15, 20, 25]
results = {}

for depth in max_depth_values:
    rmse_list = []

    # One warm-started forest per depth: each fit() below keeps the trees
    # already built and only adds the missing ones, instead of retraining a
    # complete forest for every value of n.
    # NOTE(review): with a fixed integer random_state the added trees are
    # expected to match those of a from-scratch fit, so the means should agree
    # with the previous output up to floating-point rounding -- confirm.
    rf = RandomForestRegressor(
        n_estimators=10,
        max_depth=depth,
        random_state=1,
        n_jobs=-1,
        warm_start=True,
    )

    for n in range(10, 201, 10):
        rf.set_params(n_estimators=n)
        rf.fit(X_train_matrix, y_train)
        y_pred = rf.predict(X_val_matrix)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)

    results[depth] = np.mean(rmse_list)

results


{10: np.float64(0.43624733022811624),
 15: np.float64(0.4378245115127723),
 20: np.float64(0.43769343549884143),
 25: np.float64(0.43765343428485853)}

In [10]:
# Forest used for the importance ranking: 10 trees, each capped at depth 20.
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train_matrix, y_train)

# Pair every vectorizer column name with the importance the forest gave it.
feature_names = dv.get_feature_names_out()
importances = rf.feature_importances_
feature_importance_dict = {
    name: score for name, score in zip(feature_names, importances)
}

# Report only the four candidate features; a name missing from the
# vectorizer's columns is printed with importance 0.
features_of_interest = (
    'vehicle_weight',
    'horsepower',
    'acceleration',
    'engine_displacement',
)
for feature in features_of_interest:
    print(feature, feature_importance_dict.get(feature, 0))


vehicle_weight 0.9598782143148441
horsepower 0.015933481489766168
acceleration 0.011442313735237557
engine_displacement 0.003159424030350312


In [12]:
!pip install xgboost
import xgboost as xgb

# Create DMatrix
dtrain = xgb.DMatrix(X_train_matrix, label=y_train)
dval = xgb.DMatrix(X_val_matrix, label=y_val)

# Watchlist allows XGBoost to evaluate train & val at each iteration
watchlist = [(dtrain, 'train'), (dval, 'val')]

def train_and_evaluate(eta):
    """Train a 100-round XGBoost regressor with the given ``eta`` and score it.

    Relies on the notebook-level ``dtrain``, ``dval``, ``watchlist`` and
    ``y_val`` objects rather than taking them as arguments.

    Parameters
    ----------
    eta
        Value passed to XGBoost as the ``eta`` booster parameter.

    Returns
    -------
    The square root of the mean squared error between ``y_val`` and the
    trained booster's predictions on ``dval``.
    """
    # Everything except eta is held fixed so that runs differ only in eta.
    booster_config = dict(
        eta=eta,
        max_depth=6,
        min_child_weight=1,
        objective='reg:squarederror',
        nthread=8,
        seed=1,
        verbosity=0,
    )
    booster = xgb.train(booster_config, dtrain, num_boost_round=100, evals=watchlist)
    val_mse = mean_squared_error(y_val, booster.predict(dval))
    return np.sqrt(val_mse)

# Validation RMSE for eta=0.3 and then eta=0.1 (evaluated in that order).
rmse_eta_03, rmse_eta_01 = (train_and_evaluate(eta) for eta in (0.3, 0.1))

(rmse_eta_03, rmse_eta_01)


Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl (296.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.8/296.8 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [xgboost]━━━[0m [32m1/2[0m [xgboost]
[1A[2KSuccessfully installed nvidia-nccl-cu12-2.28.7 xgboost-3.1.1

[1m[[0

(np.float64(0.44340462733166064), np.float64(0.4167428683326873))