In [1]:
# %pip uninstall seaborn matplotliby
%pip install seaborn matplotlib

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('data/cleaned_data.csv')
data.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,temp_diff,AC,rain,sun,snow,gas_type,litter_per_km,time_spent
0,28.0,5.0,26,21.5,12,-9.5,0,0,0,0,E10,0.178571,1.076923
1,12.0,4.2,30,21.5,13,-8.5,0,0,0,0,E10,0.35,0.4
2,11.2,5.5,38,21.5,15,-6.5,0,0,0,0,E10,0.491071,0.294737
3,12.9,3.9,36,21.5,14,-7.5,0,0,0,0,E10,0.302326,0.358333
4,18.5,4.5,46,21.5,15,-6.5,0,0,0,0,E10,0.243243,0.402174


In [4]:
# explod the gas_type columnn
data = pd.get_dummies(data, columns=['gas_type'], drop_first=True)


In [5]:
# Convert the gas_type_SP98 as int type
data['gas_type_SP98'] = data['gas_type_SP98'].astype(int)
data.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,temp_diff,AC,rain,sun,snow,litter_per_km,time_spent,gas_type_SP98
0,28.0,5.0,26,21.5,12,-9.5,0,0,0,0,0.178571,1.076923,0
1,12.0,4.2,30,21.5,13,-8.5,0,0,0,0,0.35,0.4,0
2,11.2,5.5,38,21.5,15,-6.5,0,0,0,0,0.491071,0.294737,0
3,12.9,3.9,36,21.5,14,-7.5,0,0,0,0,0.302326,0.358333,0
4,18.5,4.5,46,21.5,15,-6.5,0,0,0,0,0.243243,0.402174,0


In [6]:
corr = data.corr()
corr

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,temp_diff,AC,rain,sun,snow,litter_per_km,time_spent,gas_type_SP98
distance,1.0,-0.128967,0.562299,0.077636,0.088175,0.080453,-0.025738,-0.019791,0.089808,-0.033712,-0.256357,0.884942,-0.053411
consume,-0.128967,1.0,-0.227866,-0.154953,-0.320811,-0.311697,0.096591,0.248118,-0.1702,0.072961,0.779053,-0.177825,-0.015327
speed,0.562299,-0.227866,1.0,0.060039,0.015411,0.007143,-0.035408,0.009489,0.081895,0.032481,-0.355459,0.245061,-0.09736
temp_inside,0.077636,-0.154953,0.060039,1.0,0.357431,0.224108,0.300407,-0.035199,0.238954,0.09823,-0.120525,0.069588,0.005771
temp_outside,0.088175,-0.320811,0.015411,0.357431,1.0,0.990287,0.167562,-0.186315,0.338967,-0.162367,-0.080493,0.107179,0.148705
temp_diff,0.080453,-0.311697,0.007143,0.224108,0.990287,1.0,0.130129,-0.189179,0.318138,-0.184053,-0.066052,0.101481,0.154314
AC,-0.025738,0.096591,-0.035408,0.300407,0.167562,0.130129,1.0,0.242915,0.084694,0.065984,0.050622,-0.00926,0.105285
rain,-0.019791,0.248118,0.009489,-0.035199,-0.186315,-0.189179,0.242915,1.0,-0.086497,0.271633,0.090824,-0.0273,0.060328
sun,0.089808,-0.1702,0.081895,0.238954,0.338967,0.318138,0.084694,-0.086497,1.0,-0.031118,-0.062719,0.073659,0.030185
snow,-0.033712,0.072961,0.032481,0.09823,-0.162367,-0.184053,0.065984,0.271633,-0.031118,1.0,-0.001807,-0.056517,0.033665


In [7]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# corr=np.abs(data.corr())

# #Set up mask for triangle representation
# mask = np.zeros_like(corr, dtype=bool)
# mask[np.triu_indices_from(mask)] = True

# # Set up the matplotlib figure
# f, ax = plt.subplots(figsize=(20, 15))
# # Generate a custom diverging colormap
# cmap = sns.diverging_palette(220, 10, as_cmap=True)
# # Draw the heatmap with the mask and correct aspect ratio
# sns.heatmap(corr, mask=mask,  vmax=1,square=True, linewidths=.5, cbar_kws={"shrink": .5},annot = corr)

# plt.show()

In [8]:
# Because we are using consume as our target, we can drop the columns that have a  high Multicollinearity 
data = data.drop(columns=['temp_outside','time_spent'])
data.head()


Unnamed: 0,distance,consume,speed,temp_inside,temp_diff,AC,rain,sun,snow,litter_per_km,gas_type_SP98
0,28.0,5.0,26,21.5,-9.5,0,0,0,0,0.178571,0
1,12.0,4.2,30,21.5,-8.5,0,0,0,0,0.35,0
2,11.2,5.5,38,21.5,-6.5,0,0,0,0,0.491071,0
3,12.9,3.9,36,21.5,-7.5,0,0,0,0,0.302326,0
4,18.5,4.5,46,21.5,-6.5,0,0,0,0,0.243243,0


In [10]:
# Train Test Split
X = data.drop(columns=['consume'])
y = data['consume']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=0)


In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)

In [12]:
X_train_scaled = scaler.transform(X_train)

X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
X_train_scaled.head()

Unnamed: 0,distance,speed,temp_inside,temp_diff,AC,rain,sun,snow,litter_per_km,gas_type_SP98
0,0.126399,0.276316,0.461538,0.558824,0.0,0.0,1.0,0.0,0.017523,1.0
1,0.047108,0.131579,0.384615,0.338235,0.0,0.0,0.0,0.0,0.061742,1.0
2,0.014925,0.157895,0.384615,0.25,0.0,0.0,0.0,0.0,0.202515,0.0
3,0.044776,0.315789,0.461538,0.617647,0.0,0.0,0.0,0.0,0.056255,1.0
4,0.01166,0.157895,0.384615,0.279412,0.0,0.0,0.0,0.0,0.221864,1.0


In [13]:
X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns)
X_test_scaled.head()

Unnamed: 0,distance,speed,temp_inside,temp_diff,AC,rain,sun,snow,litter_per_km,gas_type_SP98
0,0.020522,0.184211,0.384615,0.367647,0.0,0.0,0.0,0.0,0.161913,1.0
1,0.047108,0.315789,0.615385,0.088235,0.0,1.0,0.0,1.0,0.057725,1.0
2,0.011194,0.131579,0.538462,0.485294,0.0,0.0,0.0,0.0,0.204245,0.0
3,0.181437,0.394737,0.307692,0.235294,0.0,0.0,0.0,0.0,0.01325,0.0
4,0.066698,0.342105,0.461538,0.352941,0.0,0.0,0.0,0.0,0.033654,1.0


In [16]:
# K Nearest Neighbors Test for scaled data

# Import necessary libraries
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error

# Define our KNN regressor, using 10 neighbors
knnregress = KNeighborsRegressor(n_neighbors=10)

# Fit KNN to scaled data
knnregress.fit(X_train_scaled, y_train)
pred = knnregress.predict(X_test_scaled)

# Test model's performance
print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", knnregress.score(X_test_scaled, y_test))

MAE 0.566794871794872
RMSE 0.9807447457979354
R2 score 0.44279167952101817


In [17]:
# linear regression
# import packages and measure
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# define linear regression model
lin_reg = LinearRegression()

# fits data to a linear regression
lin_reg.fit(X_train_scaled, y_train)

pred = lin_reg.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", lin_reg.score(X_test_scaled, y_test))


MAE 0.43607136619531783
RMSE 0.6073161995630046
R2 score 0.7863340373066566


In [18]:
# Decision Tree
# Import package
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor(max_depth=10)

# fit data to tree model
tree.fit(X_train_scaled, y_train)

pred = tree.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", tree.score(X_test_scaled, y_test))


MAE 0.48511574074074076
RMSE 0.9793169518299687
R2 score 0.44441289558180364


In [20]:
# Random Patches
# Initialize Random Forest
forest = RandomForestRegressor(n_estimators=100,
                             max_depth=20)

# Train model
forest.fit(X_train_scaled, y_train)

# Set predictor
pred = forest.predict(X_test_scaled)

# Evaluate Model
print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", forest.score(X_test_scaled, y_test))

MAE 0.2841025641025644
RMSE 0.4652586267266733
R2 score 0.8746008820941737


In [22]:
# AdaBoost
# Initialize AdaBoost model
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=20),
                            n_estimators=100)

# Train Model 1
ada_reg.fit(X_train_scaled, y_train)

# Set predictor
pred = ada_reg.predict(X_test_scaled)
print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", ada_reg.score(X_test_scaled, y_test))


MAE 0.32179487179487165
RMSE 0.5888929204828934
R2 score 0.7991007666906617


### Look like that Random Patches has the best R2 result with 
- MAE 0.2841025641025644
- RMSE 0.4652586267266733
- R2 score 0.8746008820941737