In [3]:
import pandas as pd
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb

In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/72.0 MB 2.2 MB/s eta 0:00:32
    --------------------------------------- 1.3/72.0 MB 2.5 MB/s eta 0:00:29
   - -------------------------------------- 2.1/72.0 MB 2.8 MB/s eta 0:00:25
   - -------------------------------------- 3.1/72.0 MB 3.2 MB/s eta 0:00:22
   -- ------------------------------------- 3.9/72.0 MB 3.5 MB/s eta 0:00:20
   -- ------------------------------------- 5.2/72.0 MB 3.8 MB/s eta 0:00:18
   --- ------------------------------------ 6.3/72.0 MB 4.0 MB/s eta 0:00:17
   ---- ----------------------------------- 7.6/72.0 MB 4.2 MB/s eta 0:00:16
   ---- ----------------------------------- 8.9/72.0 MB 4.4 MB/s eta 0:00:15
   ----- -----------

In [5]:
# ----------------------------------------------------------------------
# 1. Data Loading and Preprocessing
# ----------------------------------------------------------------------

# Read the raw dataset from the CSV file (path is relative to the working
# directory of the kernel).
df = pd.read_csv(filepath_or_buffer='car_fuel_efficiency.csv')

In [6]:
# Normalise the column names: lower-case, with spaces turned into underscores.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_')
)

# Collect the names of all columns whose dtype compares equal to 'object';
# these are the ones treated as string columns further down.
string_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

In [7]:
# Normalise the values of every string column: lower-case first, then apply
# the substitutions below in order (space -> '_', '-' -> '_', drop apostrophes).
substitutions = ((' ', '_'), ('-', '_'), ("'", ''))

for col in string_columns:
    cleaned = df[col].str.lower()
    for old, new in substitutions:
        cleaned = cleaned.str.replace(old, new)
    df[col] = cleaned

In [8]:
# Name of the column used as the regression target.
target = "fuel_efficiency_mpg"

In [9]:
# Hold out 20% of the rows as the test set, then take 25% of the remaining
# 80% as the validation set (0.25 * 0.8 = 0.2), which leaves 60% for training.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Drop the shuffled original index so each split is numbered from zero.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [10]:
# Pull the target column out of each split. `pop` returns the column and
# removes it from the frame in one step, so the frames no longer contain it.
y_train = df_train.pop(target).values
y_val = df_val.pop(target).values
y_test = df_test.pop(target).values

In [11]:
# Feature columns grouped by kind; `base` is the numerical columns followed by
# the categorical ones.
categorical = ['origin', 'fuel_type', 'drivetrain']
numerical = [
    'engine_displacement',
    'num_cylinders',
    'horsepower',
    'vehicle_weight',
    'acceleration',
    'model_year',
    'num_doors',
]
base = [*numerical, *categorical]

In [12]:
# Replace missing values in the feature columns with 0 for every split
# (as required by the context of this problem set).
for split in (df_train, df_val, df_test):
    split[base] = split[base].fillna(0)

In [13]:
# Turn the feature rows into numeric matrices with DictVectorizer.
# A dense output (sparse=False) is requested for easier feature importance access.
dv = DictVectorizer(sparse=False)

# One dict per row, restricted to the feature columns.
train_dict = df_train[base].to_dict(orient='records')
val_dict = df_val[base].to_dict(orient='records')

# The vectorizer is fitted on the training records only and then reused,
# unchanged, to transform the validation records.
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [14]:
# ----------------------------------------------------------------------
# 2. Question 1: Decision Tree Regressor (max_depth=1)
# ----------------------------------------------------------------------

print("--- Question 1 ---")

# Fit a depth-1 tree on the training matrix.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

# Label each importance with the feature name produced by the vectorizer,
# then pick the label of the largest one.
feature_importances_q1 = pd.Series(
    dt.feature_importances_,
    index=dv.get_feature_names_out(),
)
most_important_q1 = feature_importances_q1.idxmax()
print(f"Most important feature for max_depth=1: {most_important_q1}")

--- Question 1 ---
Most important feature for max_depth=1: vehicle_weight


In [15]:
# ----------------------------------------------------------------------
# 3. Question 2: Random Forest (n_estimators=10, max_depth=20)
# ----------------------------------------------------------------------

print("\n--- Question 2 ---")

# Train the forest on the training split.
rf_q2 = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
)
rf_q2.fit(X_train, y_train)

# Score on the validation split: RMSE is the square root of the MSE.
y_pred_q2 = rf_q2.predict(X_val)
rmse_q2 = np.sqrt(mean_squared_error(y_val, y_pred_q2))
print(f"RMSE on validation for n_estimators=10, max_depth=20: {rmse_q2:.4f}")


--- Question 2 ---
RMSE on validation for n_estimators=10, max_depth=20: 0.4591


In [16]:
# ----------------------------------------------------------------------
# 4. Question 3: Tuning max_depth for Random Forest (n_estimators=10)
# ----------------------------------------------------------------------

print("\n--- Question 3 ---")
max_depths = [10, 15, 20, 25]
scores_q3 = []

# Train one 10-tree forest per candidate depth and record its validation RMSE.
for depth in max_depths:
    rf_q3 = RandomForestRegressor(
        n_estimators=10,
        max_depth=depth,
        random_state=1,
        n_jobs=-1,
    )
    rf_q3.fit(X_train, y_train)
    y_pred_q3 = rf_q3.predict(X_val)
    rmse_q3 = np.sqrt(mean_squared_error(y_val, y_pred_q3))
    scores_q3.append((depth, rmse_q3))

scores_df_q3 = pd.DataFrame(scores_q3, columns=['max_depth', 'rmse'])

# `best_depth` is the whole row (max_depth and rmse) with the lowest RMSE;
# `best_max_depth` is just its depth as a plain int.
lowest_rmse_label = scores_df_q3['rmse'].idxmin()
best_depth = scores_df_q3.loc[lowest_rmse_label]
best_max_depth = int(best_depth['max_depth'])

print("RMSE scores for different max_depth:")
print(scores_df_q3)
print(f"Best max_depth: {best_max_depth} with RMSE: {best_depth['rmse']:.4f}")


--- Question 3 ---
RMSE scores for different max_depth:
   max_depth      rmse
0         10  0.451895
1         15  0.457160
2         20  0.459109
3         25  0.459621
Best max_depth: 10 with RMSE: 0.4519


In [17]:
# ----------------------------------------------------------------------
# 5. Question 4: Tuning n_estimators for Random Forest (with best max_depth)
# ----------------------------------------------------------------------

print("\n--- Question 4 ---")
n_estimators_values = [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
scores_q4 = []

# Keep max_depth fixed at the best value from Q3 and vary the number of trees,
# recording the validation RMSE for each setting.
for n in n_estimators_values:
    rf_q4 = RandomForestRegressor(
        n_estimators=n,
        max_depth=best_max_depth,
        random_state=1,
        n_jobs=-1,
    )
    rf_q4.fit(X_train, y_train)
    y_pred_q4 = rf_q4.predict(X_val)
    rmse_q4 = np.sqrt(mean_squared_error(y_val, y_pred_q4))
    scores_q4.append((n, rmse_q4))

scores_df_q4 = pd.DataFrame(scores_q4, columns=['n_estimators', 'rmse'])
smallest_rmse_q4 = scores_df_q4['rmse'].min()

print("RMSE scores for different n_estimators:")
print(scores_df_q4)
print(f"Smallest RMSE on validation: {smallest_rmse_q4:.4f}")


--- Question 4 ---
RMSE scores for different n_estimators:
    n_estimators      rmse
0             10  0.451895
1             15  0.448851
2             20  0.448719
3             25  0.446442
4             30  0.446225
5             35  0.445145
6             40  0.443877
7             45  0.443214
8             50  0.442682
9             55  0.442713
10            60  0.442350
Smallest RMSE on validation: 0.4423


In [18]:
# ----------------------------------------------------------------------
# 6. Question 5: Feature Importance in Random Forest (n_estimators=10, max_depth=20)
# ----------------------------------------------------------------------

print("\n--- Question 5 ---")

# Fit a fresh forest with the same hyper-parameters and seed as the Q2 model
# (n_estimators=10, max_depth=20, random_state=1).
rf_q5 = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
)
rf_q5.fit(X_train, y_train)

# Label the importances with the vectorizer's feature names, keep only the
# four candidate features, and order them from most to least important.
feature_importances_q5 = pd.Series(
    rf_q5.feature_importances_,
    index=dv.get_feature_names_out(),
)
target_features_q5 = ['vehicle_weight', 'horsepower', 'acceleration', 'engine_displacement']
candidate_importances = feature_importances_q5[target_features_q5]
importances_q5 = candidate_importances.sort_values(ascending=False)

print("Importances (among the 4 features):")
print(importances_q5)

# After the descending sort the first label is the most important feature.
most_important_q5 = importances_q5.index[0]
print(f"Most important feature (among the 4) for Q5: {most_important_q5}")


--- Question 5 ---
Importances (among the 4 features):
vehicle_weight         0.959162
horsepower             0.016040
acceleration           0.011471
engine_displacement    0.003269
dtype: float64
Most important feature (among the 4) for Q5: vehicle_weight


In [20]:
# ----------------------------------------------------------------------
# 7. Question 6: XGBoost (eta=0.3 vs eta=0.1)
# ----------------------------------------------------------------------

print("\n--- Question 6 ---")

# Wrap the train/validation matrices in DMatrix objects, passing the
# vectorizer's feature names as a plain Python list.
feature_names = dv.get_feature_names_out().tolist()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=feature_names)
watchlist = [(dtrain, 'train'), (dval, 'val')]
num_boost_round = 100

# Settings shared by both runs; only `eta` differs between them.
shared_xgb_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 0,
}

# XGBoost with eta=0.3
params_03 = {'eta': 0.3, **shared_xgb_params}
model_03 = xgb.train(
    params_03,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    verbose_eval=False,
)
y_pred_03 = model_03.predict(dval)
rmse_03 = np.sqrt(mean_squared_error(y_val, y_pred_03))
print(f"RMSE on validation for eta=0.3: {rmse_03:.4f}")

# XGBoost with eta=0.1
params_01 = {'eta': 0.1, **shared_xgb_params}
model_01 = xgb.train(
    params_01,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    verbose_eval=False,
)
y_pred_01 = model_01.predict(dval)
rmse_01 = np.sqrt(mean_squared_error(y_val, y_pred_01))
print(f"RMSE on validation for eta=0.1: {rmse_01:.4f}")

# eta=0.3 wins only when its RMSE is strictly lower; otherwise 0.1 is chosen.
if rmse_03 < rmse_01:
    best_eta = 0.3
else:
    best_eta = 0.1
print(f"Best eta: {best_eta}")


--- Question 6 ---
RMSE on validation for eta=0.3: 0.4502
RMSE on validation for eta=0.1: 0.4262
Best eta: 0.1
