In [1]:
#import requests
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from tqdm import tqdm

In [2]:
#link = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
#response = requests.get(link)
#with open('C:/Users/Abdullahi Mujaheed/Desktop/mlzoom/mlzoomcamp/tree based/fuel_data.csv', 'wb') as file:
#    file.write(response.content)

In [3]:
data = pd.read_csv(r'C:\Users\Abdullahi Mujaheed\Desktop\mlzoom\mlzoomcamp\tree based\fuel_data.csv')
data

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [4]:
data.dtypes

engine_displacement      int64
num_cylinders          float64
horsepower             float64
vehicle_weight         float64
acceleration           float64
model_year               int64
origin                  object
fuel_type               object
drivetrain              object
num_doors              float64
fuel_efficiency_mpg    float64
dtype: object

In [5]:
data = data.fillna('0')
data.isna().sum()


engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [6]:
train, val = train_test_split(data, test_size = .4, random_state =1)
val, test = train_test_split(val, test_size = .5, random_state= 1)

In [7]:
y_tr = train['fuel_efficiency_mpg']
y_val = val['fuel_efficiency_mpg']
y_ts = test['fuel_efficiency_mpg']
train = train.drop(['fuel_efficiency_mpg'], axis=1)
val = val.drop(['fuel_efficiency_mpg'], axis=1)
test = test.drop(['fuel_efficiency_mpg'], axis=1)

In [8]:
data.shape, train.shape, val.shape, test.shape

((9704, 11), (5822, 10), (1941, 10), (1941, 10))

In [9]:
vect = DictVectorizer(sparse= True)

In [10]:
def data_dict(frame):
    return frame.to_dict(orient= 'records')

In [11]:
train_dict = data_dict(train)
train_trans = vect.fit_transform(train_dict)
val_dict = data_dict(val)
val_trans = vect.transform(val_dict)
test_dict = data_dict(test)
test_trans = vect.transform(test_dict)

In [12]:
tree_reg = DecisionTreeRegressor(max_depth=1)

In [13]:
tree_reg.fit(train_trans, y_tr.values)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [14]:
importance = tree_reg.feature_importances_

In [15]:
tree = tree_reg.tree_
tree.feature

array([17, -2, -2])

In [16]:
feature_names = vect.get_feature_names_out()

In [17]:
feature_importance_dict = dict(zip(feature_names, importance))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Display
for feature, importance in sorted_features:
    print(f"Feature: {feature}, Importance: {importance}")

Feature: vehicle_weight, Importance: 1.0
Feature: acceleration, Importance: 0.0
Feature: acceleration=0, Importance: 0.0
Feature: drivetrain=All-wheel drive, Importance: 0.0
Feature: drivetrain=Front-wheel drive, Importance: 0.0
Feature: engine_displacement, Importance: 0.0
Feature: fuel_type=Diesel, Importance: 0.0
Feature: fuel_type=Gasoline, Importance: 0.0
Feature: horsepower, Importance: 0.0
Feature: horsepower=0, Importance: 0.0
Feature: model_year, Importance: 0.0
Feature: num_cylinders, Importance: 0.0
Feature: num_cylinders=0, Importance: 0.0
Feature: num_doors, Importance: 0.0
Feature: num_doors=0, Importance: 0.0
Feature: origin=Asia, Importance: 0.0
Feature: origin=Europe, Importance: 0.0
Feature: origin=USA, Importance: 0.0


In [18]:
forest = RandomForestRegressor(n_estimators= 10, random_state= 1, n_jobs= -1)

In [19]:
forest.fit(train_trans, y_tr)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
forest_val = forest.predict(val_trans)
root_mean_squared_error(y_val, forest_val)

0.463593877615232

In [21]:
depths = [10, 15, 20, 25]
for depth in tqdm(depths):
    print(f'for depth : {depth}')
    for value in (range(10, 210, 10)):
        fr_model = RandomForestRegressor(n_estimators= value, max_depth= depth, random_state= 1)
        fr_model.fit(train_trans, y_tr.values)
        fr_val = fr_model.predict(val_trans)
        rmse = root_mean_squared_error(y_val, fr_val)
        print(f'{value}: {rmse}')

  0%|          | 0/4 [00:00<?, ?it/s]

for depth : 10
10: 0.4539750031415985
20: 0.44341384305813664
30: 0.4391983754142612
40: 0.43797571600980145
50: 0.4367926371804551
60: 0.4357026990243875
70: 0.43558555565029294
80: 0.43567880186846564
90: 0.435435235690929
100: 0.43513175362429
110: 0.43450744508089745
120: 0.4347276462411842
130: 0.434450540783713
140: 0.43447518644599625
150: 0.43473761986894316
160: 0.4347633722517319
170: 0.43472030689103136
180: 0.4346961343785545
190: 0.434654292027172


 25%|██▌       | 1/4 [05:21<16:04, 321.45s/it]

200: 0.43444387880781166
for depth : 15
10: 0.460407043148769
20: 0.4467177725831503
30: 0.44152674291809474
40: 0.43996996161130675
50: 0.43832157900228463
60: 0.4367581737752201
70: 0.43671662706058395
80: 0.4368277555661091
90: 0.436286764501982
100: 0.4359116889847878
110: 0.43556274062514727
120: 0.43591200679935177
130: 0.4354526646086125
140: 0.43571096055475644
150: 0.43570692270088474
160: 0.4357376446117587
170: 0.43578363520376184
180: 0.43551653895200554
190: 0.4356359108874546


 50%|█████     | 2/4 [16:02<16:59, 509.54s/it]

200: 0.43527645485818117
for depth : 20
10: 0.463997329652161
20: 0.4480289982419928
30: 0.44241668026992553
40: 0.44106904243567224
50: 0.4389812172030433
60: 0.43709588610670863
70: 0.4371703001832074
80: 0.4372086821562933
90: 0.43638455411002197
100: 0.43607291843625157
110: 0.4355897478633808
120: 0.43611287419514344
130: 0.43579965274227883
140: 0.4359067793899847
150: 0.43596741423745977
160: 0.436005975778077
170: 0.4361906289891253
180: 0.43604721994642404
190: 0.4361190030376701


 75%|███████▌  | 3/4 [28:51<10:27, 627.74s/it]

200: 0.4358451958115517
for depth : 25
10: 0.4635938776152321
20: 0.4477513753721792
30: 0.44269936613288646
40: 0.4410584749018763
50: 0.4394282072019057
60: 0.4379977543939522
70: 0.4382919586994107
80: 0.43815842851351955
90: 0.4374497329231701
100: 0.43713345631574774
110: 0.4366903479239228
120: 0.4371844716788016
130: 0.4366023023037728
140: 0.4367303832728525
150: 0.436800058689207
160: 0.436773235485905
170: 0.4367657690725709
180: 0.43657532605867266
190: 0.4367372037073646


100%|██████████| 4/4 [41:12<00:00, 618.02s/it]

200: 0.43625834935629476





In [22]:
model = RandomForestRegressor(n_estimators=10,
max_depth=20,
random_state=1,
n_jobs=-1)
model.fit(train_trans, y_tr)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [23]:
importances = model.feature_importances_

In [24]:
i = np.argmax(importances)
i

np.int64(17)

In [25]:
importances[i]

np.float64(0.9597952777077449)

In [20]:
vect.get_feature_names_out()[i]

NameError: name 'i' is not defined

In [94]:
xg_train = xgb.DMatrix(train_trans, label= y_tr)
xg_val = xgb.DMatrix(val_trans, y_val)

In [103]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

In [104]:
wishlist = [(xg_train, 'trained'), (xg_val,  'evaluation')]

In [105]:
#%%capture output

model = xgb.train(xgb_params, xg_train, num_boost_round=100)

In [106]:
val_preds = model.predict(xg_val)

In [107]:
val_rmse = root_mean_squared_error(y_val, val_preds)

In [108]:
val_rmse

0.43983716636225784