In [2]:
#pip -q install scikit-learn xgboost pandas numpy
!wget -q https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv -O car_fuel_efficiency.csv

In [3]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

df = pd.read_csv("car_fuel_efficiency.csv")
df = df.fillna(0)
y = df["fuel_efficiency_mpg"].values
X = df.drop(columns=["fuel_efficiency_mpg"])
X_train_full, X_temp, y_train_full, y_temp = train_test_split(X, y, test_size=0.4, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(X_train_full.to_dict(orient="records"))
X_val_m = dv.transform(X_val.to_dict(orient="records"))
X_test_m = dv.transform(X_test.to_dict(orient="records"))
feature_names = dv.get_feature_names_out().tolist()
print(len(X_train_full), len(X_val), len(X_test))

5822 1941 1941


In [7]:
#Q1
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text

dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train_full)
tree_report = export_text(dt, feature_names=feature_names)
first_line = tree_report.strip().splitlines()[0]
split_idx = dt.tree_.feature[0]
split_feature = feature_names[split_idx]
print(split_feature)
print(tree_report)

vehicle_weight
|--- vehicle_weight <= 3028.82
|   |--- value: [16.86]
|--- vehicle_weight >  3028.82
|   |--- value: [12.87]



In [10]:
#Q2
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train_full)
pred_val = rf.predict(X_val_m)
rmse_val = mean_squared_error(y_val, pred_val)
print(round(rmse_val, 3))

0.212


In [11]:
#Q3
best_rmse = float("inf")
best_n = None
history = []
for n in range(10, 201, 10):
    m = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    m.fit(X_train, y_train_full)
    p = m.predict(X_val_m)
    r = mean_squared_error(y_val, p)
    r3 = round(r, 3)
    history.append((n, r3))
    if r3 < round(best_rmse, 3):
        best_rmse = r
        best_n = n
stop_after = None
for i in range(len(history)-1):
    if history[i][0] >= best_n and history[i+1][1] >= history[i][1]:
        stop_after = history[i][0]
        break
if stop_after is None:
    stop_after = history[-1][0]
print(history)
print(stop_after)

[(10, 0.212), (20, 0.199), (30, 0.193), (40, 0.192), (50, 0.191), (60, 0.19), (70, 0.19), (80, 0.19), (90, 0.19), (100, 0.189), (110, 0.189), (120, 0.19), (130, 0.189), (140, 0.189), (150, 0.189), (160, 0.189), (170, 0.189), (180, 0.189), (190, 0.19), (200, 0.189)]
100


In [13]:
#Q4
depths = [10, 15, 20, 25]
summary = []
for d in depths:
    rmses = []
    for n in range(10, 201, 10):
        m = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=1, n_jobs=-1)
        m.fit(X_train, y_train_full)
        p = m.predict(X_val_m)
        r = mean_squared_error(y_val, p)
        rmses.append(r)
    mean_rmse = float(np.mean(rmses))
    summary.append((d, round(mean_rmse, 3)))
best_depth = min(summary, key=lambda x: x[1])[0]
print(summary)
print(best_depth)

[(10, 0.19), (15, 0.192), (20, 0.192), (25, 0.192)]
10


In [14]:
#Q5
candidates = ["vehicle_weight","horsepower","acceleration","engine_displacement"]
m = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
m.fit(X_train, y_train_full)
imp = m.feature_importances_
imap = dict(zip(feature_names, imp))
scores = {c: sum(v for k,v in imap.items() if k.startswith(c+"=") or k==c) for c in candidates}
top = max(scores.items(), key=lambda x: x[1])[0]
print(scores)
print(top)

{'vehicle_weight': np.float64(0.9598782143148441), 'horsepower': np.float64(0.015933481489766168), 'acceleration': np.float64(0.011442313735237557), 'engine_displacement': np.float64(0.003159424030350312)}
vehicle_weight


In [17]:
#Q6
import xgboost as xgb
from sklearn.metrics import mean_squared_error

dtrain = xgb.DMatrix(X_train, label=y_train_full, feature_names=feature_names)
dval = xgb.DMatrix(X_val_m, label=y_val, feature_names=feature_names)

def run_eta(eta):
    params = {'eta': eta, 'max_depth': 6, 'min_child_weight': 1, 'objective': 'reg:squarederror', 'nthread': 8, 'seed': 1, 'verbosity': 1}
    evallist = [(dtrain, 'train'), (dval, 'val')]
    model = xgb.train(params, dtrain, num_boost_round=100, evals=evallist, verbose_eval=False)
    pred = model.predict(dval)
    rmse = mean_squared_error(y_val, pred)
    return rmse

rmse_03 = run_eta(0.3)
rmse_01 = run_eta(0.1)
winner = "0.3" if rmse_03 < rmse_01 else ("0.1" if rmse_01 < rmse_03 else "equal")
print(round(rmse_03,3), round(rmse_01,3), winner)

0.197 0.174 0.1
