In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split

In [25]:
# IMPORT THE DATASET

df = pd.read_csv('df_processed.csv',sep=',',header=0,index_col=0)
df.head()

Unnamed: 0,Counter_ID1,Counter_ID2,Counter_site_name,Hourly_counting,Latitude,Longitude,counting_year,counting_month,counting_day,counting_hour,counting_day_name,weekday,weekend,installation_year,installation_month,installation_day
0,100003096,353242251,0,80,48.83504,2.33314,2023,4,1,7,0,0,1,2012.0,2.0,22.0
1,100003096,353242251,0,4,48.83504,2.33314,2023,4,1,3,0,0,1,2012.0,2.0,22.0
2,100003096,353242251,0,17,48.83504,2.33314,2023,4,1,5,0,0,1,2012.0,2.0,22.0
3,100003096,353242251,0,58,48.83504,2.33314,2023,4,1,8,0,0,1,2012.0,2.0,22.0
4,100003096,353242251,0,5,48.83504,2.33314,2023,4,1,4,0,0,1,2012.0,2.0,22.0


In [21]:
df.columns

Index(['Counter_ID1', 'Counter_ID2', 'Counter_site_name', 'Hourly_counting',
       'Latitude', 'Longitude', 'counting_year', 'counting_month',
       'counting_day', 'counting_hour', 'counting_day_name', 'weekday',
       'weekend', 'installation_year', 'installation_month',
       'installation_day'],
      dtype='object')

In [3]:
# SPLITTING FEATURE / TARGET VARIABLE AND TRAIN / TEST SETS

feats = df.drop(['Hourly_counting'],axis=1)
target = df['Hourly_counting']

X_train,X_test,y_train,y_test = train_test_split(feats,target,test_size=0.2)

In [4]:
# TRAINING THE RF MODEL

import time
start_time_train = time.time()
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train,y_train)
end_time_train = time.time()
training_time = end_time_train - start_time_train

# score of the model for train and test data
print('Training time is:', training_time, 'seconds')
print(f"The score of train set {rf_model.score(X_train,y_train)}")
print(f"The score of test set {rf_model.score(X_test,y_test)}")

Training time is: 629.8908522129059 seconds
The score of train set 0.9800549377192965
The score of test set 0.8993199015586812


In [5]:
# RANDOM FOREST METRICS

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# predict the Randomforest regressor
yrf_pred_train = rf_model.predict(X_train)
yrf_pred_test = rf_model.predict(X_test)

# calculate R2 for the model
rfmodel_r2_train = r2_score(y_train,yrf_pred_train)
rfmodel_r2_test = r2_score(y_test,yrf_pred_test)

# calculate the Mean absolute error and mean_squared error
mae_train = mean_absolute_error(y_train,yrf_pred_train)
mae_test = mean_absolute_error(y_test,yrf_pred_test)

# calculate the mean squared error
mse_train = mean_squared_error(y_train,yrf_pred_train)
mse_test = mean_squared_error(y_test,yrf_pred_test)

#Print R2 score
print(f"The R2 score for train is {rfmodel_r2_train}")
print(f"The R2 score for test is {rfmodel_r2_test}")
print(f"The mean_absolute_error for train set is {mae_train}")
print(f"The mean_absolute_error for test set is {mae_test}")
print(f"The mean_squared_error for train set is {mse_train}")
print(f"The mean_squared_error for test set is {mse_test}")

The R2 score for train is 0.9800549377192965
The R2 score for test is 0.8993199015586812
The mean_absolute_error for train set is 5.997724102466067
The mean_absolute_error for test set is 14.529167397565752
The mean_squared_error for train set is 219.82423850706402
The mean_squared_error for test set is 1096.5786548410651


In [26]:
# Make the model training simpler

df_2 = df[['Counter_site_name', 'Hourly_counting','Latitude','Longitude','counting_year','counting_month','counting_day','counting_hour', 'counting_day_name', 'weekday','weekend']]

df.to_csv("C:/Users/loris/Desktop\DA_project/Streamlit_app/df_model2.csv")


In [27]:
# SPLITTING FEATURE / TARGET VARIABLE AND TRAIN / TEST SETS

feats = df_2.drop(['Hourly_counting'],axis=1)
target = df_2['Hourly_counting']

X_train,X_test,y_train,y_test = train_test_split(feats,target,test_size=0.2)

In [28]:
# TRAINING THE RF MODEL 

import time
start_time_train = time.time()
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train,y_train)
end_time_train = time.time()
training_time = end_time_train - start_time_train

# score of the model for train and test data
print('Training time is:', training_time, 'seconds')

Training time is: 547.5059809684753 seconds
The score of train set 0.9340630986380993
The score of test set 0.7730210052725188


In [29]:
# predict the Randomforest regressor
yrf_pred_train = rf_model.predict(X_train)
yrf_pred_test = rf_model.predict(X_test)

# calculate R2 for the model
rfmodel_r2_train = r2_score(y_train,yrf_pred_train)
rfmodel_r2_test = r2_score(y_test,yrf_pred_test)

# calculate the Mean absolute error and mean_squared error
mae_train = mean_absolute_error(y_train,yrf_pred_train)
mae_test = mean_absolute_error(y_test,yrf_pred_test)

# calculate the mean squared error
mse_train = mean_squared_error(y_train,yrf_pred_train)
mse_test = mean_squared_error(y_test,yrf_pred_test)

#Print R2 score
print(f"The R2 score for train is {rfmodel_r2_train}")
print(f"The R2 score for test is {rfmodel_r2_test}")
print(f"The mean_absolute_error for train set is {mae_train}")
print(f"The mean_absolute_error for test set is {mae_test}")
print(f"The mean_squared_error for train set is {mse_train}")
print(f"The mean_squared_error for test set is {mse_test}")

The R2 score for train is 0.9340630986380993
The R2 score for test is 0.7730210052725188
The mean_absolute_error for train set is 11.599194261440738
The mean_absolute_error for test set is 22.187904652994728
The mean_squared_error for train set is 724.3430638295722
The mean_squared_error for test set is 2504.9544350438596


In [30]:
import pickle
import os

# Define the relative path to the desired save location
relative_path = os.path.join(os.getcwd(), 'rf_model.pkl')

# Create directory if it does not exist
os.makedirs(os.path.dirname(relative_path), exist_ok=True)

# Save the model using pickle
with open(relative_path, 'wb') as best_rfmodel:
    pickle.dump(rf_model, best_rfmodel)