<a href="https://colab.research.google.com/github/Gon-Frecces/ML-zoomcamp/blob/main/fuel_efficiency_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/car_fuel_efficiency.csv')
len(df)
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [4]:
for key in df.keys():
    if df[key].isnull().any():
        print(key)


# print(df['horsepower'].isnull().sum())



num_cylinders
horsepower
acceleration
num_doors


In [5]:
df['num_cylinders'].fillna(0)
df['horsepower'].fillna(0)
df['acceleration'].fillna(0)
df['num_doors'].fillna(0)

Unnamed: 0,num_doors
0,0.0
1,0.0
2,0.0
3,2.0
4,2.0
...,...
9699,0.0
9700,0.0
9701,-1.0
9702,1.0


In [6]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [7]:
X = df.drop(columns=['fuel_efficiency_mpg'])
y = df['fuel_efficiency_mpg']

X['num_cylinders'].fillna(0)
X['horsepower'].fillna(0)
X['acceleration'].fillna(0)
X['num_doors'].fillna(0)
y.fillna(0)

Unnamed: 0,fuel_efficiency_mpg
0,13.231729
1,13.688217
2,14.246341
3,16.912736
4,12.488369
...,...
9699,15.101802
9700,17.962326
9701,17.186587
9702,15.331551


In [8]:
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=1)

In [9]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=True)

X_train = X_train.fillna(0)
X_val = X_val.fillna(0)
X_test = X_test.fillna(0)


train_dicts = X_train.to_dict(orient='records')
val_dicts = X_val.to_dict(orient='records')
test_dicts = X_test.to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)

X_val = dv.transform(val_dicts)
X_test = dv.transform(test_dicts)


In [10]:
from sklearn.tree import DecisionTreeRegressor,  export_text


In [11]:
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

tree_rules = export_text(dt, feature_names=dv.get_feature_names_out())
print(tree_rules)

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np


rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(f"RMSE on validation data: {rmse:.3f}")

RMSE on validation data: 0.460


In [19]:
n_estimators_list = list(range(10, 210, 10))
rmse_scores = []

for n in n_estimators_list:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)
    print(f"n_estimators={n:3d} --> RMSE: {rmse:.3f}")

n_estimators= 10 --> RMSE: 0.460
n_estimators= 20 --> RMSE: 0.454
n_estimators= 30 --> RMSE: 0.452
n_estimators= 40 --> RMSE: 0.449
n_estimators= 50 --> RMSE: 0.447
n_estimators= 60 --> RMSE: 0.445
n_estimators= 70 --> RMSE: 0.445
n_estimators= 80 --> RMSE: 0.445
n_estimators= 90 --> RMSE: 0.445
n_estimators=100 --> RMSE: 0.445
n_estimators=110 --> RMSE: 0.444
n_estimators=120 --> RMSE: 0.444
n_estimators=130 --> RMSE: 0.444
n_estimators=140 --> RMSE: 0.443
n_estimators=150 --> RMSE: 0.443
n_estimators=160 --> RMSE: 0.443
n_estimators=170 --> RMSE: 0.443
n_estimators=180 --> RMSE: 0.442
n_estimators=190 --> RMSE: 0.442
n_estimators=200 --> RMSE: 0.442


In [20]:
depths = [10, 15, 20, 25]
n_estimators_list = list(range(10, 210, 10))
rmse_scores = []
mean_rmse_scores = []

for d in depths:
  for n in n_estimators_list:
      rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1, max_depth=d)
      rf.fit(X_train, y_train)
      y_pred = rf.predict(X_val)
      rmse = np.sqrt(mean_squared_error(y_val, y_pred))
      rmse_scores.append(rmse)
      # print(f"n_estimators={n:3d} depth={d:3d} --> RMSE: {rmse:.3f}")


  mean_rmse = np.mean(rmse_scores)
  mean_rmse_scores.append(mean_rmse)
  print(f"max_depth={d:3d} --> mean RMSE: {mean_rmse:.3f}")

max_depth= 10 --> RMSE: 0.442
max_depth= 15 --> RMSE: 0.444
max_depth= 20 --> RMSE: 0.444
max_depth= 25 --> RMSE: 0.445


In [21]:
depths = [10, 15, 20, 25]
n_estimators_list = list(range(10, 210, 10))

results = {}
mean_rmse_scores = []

for d in depths:
  rmse_scores = []

  for n in n_estimators_list:
      rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1, max_depth=d)
      rf.fit(X_train, y_train)
      y_pred = rf.predict(X_val)
      rmse = np.sqrt(mean_squared_error(y_val, y_pred))
      rmse_scores.append(rmse)
      # print(f"n_estimators={n:3d} depth={d:3d} --> RMSE: {rmse:.3f}")


  mean_rmse = np.mean(rmse_scores)
  results[d] = mean_rmse
  print(f"max_depth={d} --> mean RMSE: {mean_rmse:.3f}")



max_depth=10 --> mean RMSE: 0.442
max_depth=15 --> mean RMSE: 0.445
max_depth=20 --> mean RMSE: 0.446
max_depth=25 --> mean RMSE: 0.446


In [25]:
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)

rf.fit(X_train, y_train)

# Get feature names from DictVectorizer
feature_names = dv.get_feature_names_out()

# Create feature importance DataFrame
importances = pd.Series(rf.feature_importances_, index=feature_names)
importances = importances.sort_values(ascending=False)

# Display top features
print(importances.head(10))

# If you only want to see among specific features:
important_subset = importances[
    importances.index.isin([
        'vehicle_weight',
        'horsepower',
        'acceleration',
        'engine_displacement'
    ])
]

print("\nSelected feature importances:")
print(important_subset)

vehicle_weight         0.959150
horsepower             0.015998
acceleration           0.011480
engine_displacement    0.003273
model_year             0.003212
num_cylinders          0.002343
num_doors              0.001635
origin=USA             0.000540
origin=Europe          0.000519
origin=Asia            0.000462
dtype: float64

Selected feature importances:
vehicle_weight         0.959150
horsepower             0.015998
acceleration           0.011480
engine_displacement    0.003273
dtype: float64


In [26]:
!pip install xgboost



In [27]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Convert data into DMatrix format (optimized for XGBoost)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Watchlist for monitoring performance
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Common parameters
common_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}


In [28]:
# Train with eta = 0.3
xgb_params_03 = dict(common_params)
xgb_params_03['eta'] = 0.3

model_03 = xgb.train(xgb_params_03, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)

y_pred_03 = model_03.predict(dval)
rmse_03 = np.sqrt(mean_squared_error(y_val, y_pred_03))
print(f"RMSE (eta=0.3): {rmse_03:.4f}")

RMSE (eta=0.3): 0.4502


In [29]:
# Train with eta = 0.1
xgb_params_01 = dict(common_params)
xgb_params_01['eta'] = 0.1

model_01 = xgb.train(xgb_params_01, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)

y_pred_01 = model_01.predict(dval)
rmse_01 = np.sqrt(mean_squared_error(y_val, y_pred_01))
print(f"RMSE (eta=0.1): {rmse_01:.4f}")

RMSE (eta=0.1): 0.4262
