# Import libraries

In [None]:
# Random Forest Regression Model (Ensemble learning technique)

# Import necessary libraries for data handling, visualization, and model evaluation
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
!pip install memory_profiler # Install memory profiler to track memory usage
from google.colab import files
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from memory_profiler import memory_usage
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score,mean_absolute_percentage_error,mean_squared_error

# Step 1: Dataset upload

In [None]:
# Step 1: Dataset upload
# Open the Colab upload dialog; the returned mapping is kept in `upload`.
upload = files.upload()
# The file name is hard-coded, so the uploaded file must be named
# 'All_Div_BD.csv' for this read to succeed.
dataset = pd.read_csv('All_Div_BD.csv')  # Load dataset into a DataFrame
# A bare expression is only rendered when it is the last statement of a cell,
# so the shape has to be printed explicitly to be visible.
print(dataset.shape)
dataset.head(5)  # Display the first 5 rows for preview (cell output)

# Step 2: Exclude nighttime data (hours outside 7 AM - 7 PM)

In [None]:
# Step 2: Exclude nighttime data (hours outside 7 AM - 7 PM)
# Select the rows whose 'hour' is below 7 and those whose 'hour' is above 19.
hour_col = dataset['hour']
before_seven_am = dataset[hour_col < 7]
after_seven_pm = dataset[hour_col > 19]
before_seven_am.head(10)
after_seven_pm.head(10)
# Drop both groups in one call by combining their index labels.
night_labels = before_seven_am.index.append(after_seven_pm.index)
dataset = dataset.drop(index=night_labels)

# Step 3: Correlation matrix visualization

In [None]:
# Step 3: Correlation matrix visualization
# Compute the pairwise column correlations once and reuse them for the plot.
corr_matrix = dataset.corr()
plt.figure(figsize=(10, 10))
# annot=True writes each coefficient inside its heatmap cell.
sns.heatmap(corr_matrix, annot=True)

# Step 4: Define dependent (target) and independent (features) variables

In [None]:
# Step 4: Define dependent (target) and independent (features) variables
# Features: every column except the date/time fields and the target itself.
x = dataset.drop(columns=['year', 'month', 'day', 'hour', 'wbgt'])
print(x.shape)  # Print the shape of the independent variables
# Number of feature columns; the adjusted R2 computation in Step 8 reads it.
colms = x.shape[1]
# Printed explicitly: a bare name in the middle of a cell is not rendered.
print(colms)
y = dataset['wbgt']  # Target variable: WBGT
y.shape  # Shape of the dependent variable (rendered as the cell output)

# Step 5: Train, test, and validation split

In [None]:
# Step 5: Train, test, and validation split
# Target proportions of the full dataset: 70% train, 10% validation, 20% test.
# First hold out the 20% test set; the remaining 80% goes to X_main/y_main.
X_main, X_test, y_main, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
# The second split operates on the remaining 80%, so the validation share must
# be expressed relative to it: 0.10 / 0.80 = 0.125. (A value of 0.1 here would
# yield 8% validation / 72% training instead of the intended 10% / 70%.)
X_train, X_val, y_train, y_val = train_test_split(
    X_main, y_main, test_size=0.125, random_state=0
)
print(X_val.shape)  # Print the shape of the validation set
print(X_test.shape)  # Print the shape of the test set
val_rows = X_val.shape[0]  # Number of rows in the validation set
test_rows = X_test.shape[0]  # Number of rows in the test set
print(val_rows)
print(test_rows)

# Step 6: Model training with Random Forest

In [None]:
# Step 6: Model training with Random Forest
# Create a Random Forest Regressor with 50 decision trees and Out-of-Bag score.
# random_state is fixed (matching the seed used for the data splits) so that
# the fitted forest, and every metric derived from it, is reproducible between
# runs; without it each run draws different bootstrap samples.
reg_obj = RandomForestRegressor(n_estimators=50, oob_score=True, random_state=0)
reg_obj.fit(X_train, y_train)  # Fit the model on the training data
print("OOB score: %f" % reg_obj.oob_score_)  # Print Out-of-Bag score

# Step 7: Predictions for test and validation sets

In [None]:
# Step 7: Predictions for test and validation sets
# Run the fitted forest over both held-out splits.
y_val_pred = reg_obj.predict(X_val)    # WBGT predictions, validation set
y_test_pred = reg_obj.predict(X_test)  # WBGT predictions, test set

# Step 8: Performance and error metrics

In [None]:
# Step 8: Performance and error metrics
def _report_metrics(set_name, y_true, y_pred, n_features):
    """Print the regression metrics for one data split.

    Prints a header naming the split, then R2, adjusted R2, MAE, MAPE, MSE
    and RMSE, so the output of the two splits can be told apart.

    Args:
        set_name: Label printed above the metrics (e.g. "Test set").
        y_true: Observed target values.
        y_pred: Predicted target values, same length as ``y_true``.
        n_features: Number of predictors, used for the adjusted R2 penalty.

    Returns:
        Tuple ``(r2, adjusted_r2)``.
    """
    n_rows = len(y_true)
    r2 = r2_score(y_true, y_pred)
    # Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - 1 - p)
    adjusted_r2 = 1 - ((1 - r2) * (n_rows - 1) / (n_rows - 1 - n_features))
    # MSE is computed once and reused for the RMSE.
    mse = mean_squared_error(y_true, y_pred)
    print(set_name)
    print("R2 score: %f" % r2)
    print("Adjusted R2 score: %f" % adjusted_r2)
    print("MAE: %f" % mean_absolute_error(y_true, y_pred))
    print("MAPE: %f" % mean_absolute_percentage_error(y_true, y_pred))
    print("MSE: %f" % mse)
    print("RMSE: %f" % np.sqrt(mse))
    return r2, adjusted_r2


# Test set
r2_test, adjusted_r2_test = _report_metrics("Test set", y_test, y_test_pred, colms)
# Validation set
r2_val, adjusted_r2_val = _report_metrics("Validation set", y_val, y_val_pred, colms)

# Step 9: Customize font for plots

In [None]:
# Step 9: Customize font for plots
font_upload = files.upload()  # Upload dialog for the custom font file
# The font is read from this fixed path.
font_path = '/content/arial.ttf'
fm.fontManager.addfont(font_path)  # Register the font with Matplotlib
# Set the global defaults used by every subsequent figure in one update.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial'],
    'font.size': 15,
})

# Step 10: Visualize the importance of each feature

In [None]:
# Step 10: Visualize the importance of each feature
importances = reg_obj.feature_importances_  # One importance value per feature
feature_names = x.columns

plt.figure(figsize=(7, 6))
axes = plt.gca()
axes.bar(feature_names, importances)  # One bar per feature column
axes.set_ylabel('Feature importance')
# Write each importance value slightly above its bar.
for bar_position, score in enumerate(importances):
    axes.text(bar_position, score + 0.02, str(score), ha='center')
plt.show()

# Step 11: Scatter plot of Actual vs Predicted values

In [None]:
# Step 11: Scatter plot of Actual vs Predicted values
plt.scatter(y_test, y_test_pred, alpha=1)
# Reference line y = x over the range of the actual values: a point lies on it
# exactly when the prediction equals the observation. (Joining
# (min actual, min predicted) to (max actual, max predicted) is not a
# meaningful reference unless the model is perfect.)
actual_low, actual_high = min(y_test), max(y_test)
plt.plot([actual_low, actual_high], [actual_low, actual_high], color='red', linewidth=2)
# Legend entries follow plotting order: scatter points first, then the line.
plt.legend(["Predicted", "Actual"], loc="upper left")
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('WBGT')

# Step 12: Inference time and memory usage

In [None]:
# Step 12: Inference time and memory usage
# For each split, one random row is predicted; the prediction is timed and its
# memory usage sampled with memory_profiler.

# Test set
X_test_temp_index = random.randrange(0, len(X_test))  # Choose a random row
# Double brackets keep the row as a one-row DataFrame, so the column names the
# model was fitted with are preserved (a bare ndarray drops them).
X_test_temp = X_test.iloc[[X_test_temp_index]]
# Only the predict call sits between the two clock reads; row selection and
# printing are kept outside so they do not inflate the measurement.
# perf_counter is the clock intended for measuring short durations; its
# readings are only meaningful as differences.
test_initial_time = time.perf_counter()
y_pred = reg_obj.predict(X_test_temp)  # Perform prediction
test_final_time = time.perf_counter()
print(test_initial_time)
print(test_final_time)
test_inference_time = test_final_time - test_initial_time  # Calculate inference time
print('Test set inference time:', test_inference_time)


# Calculate average memory usage for prediction
def y_pred_func():
    """Predict the sampled test row; wrapped so memory_usage can call it."""
    return reg_obj.predict(X_test_temp)


test_mem_usage = memory_usage(y_pred_func)  # Memory samples taken during the call
test_avg_mem_usage = np.mean(test_mem_usage)  # Calculate average memory usage
print('Test set avg. memory usage:', test_avg_mem_usage)

# Validation set
X_val_temp_index = random.randrange(0, len(X_val))  # Choose a random row
X_val_temp = X_val.iloc[[X_val_temp_index]]  # One-row DataFrame, names preserved
val_initial_time = time.perf_counter()
# Stored under its own name: reusing `y_val_pred` would overwrite the full
# validation predictions from Step 7 and break a re-run of Step 8.
y_val_single_pred = reg_obj.predict(X_val_temp)  # Perform prediction
val_final_time = time.perf_counter()
print(val_initial_time)
print(val_final_time)
val_inference_time = val_final_time - val_initial_time  # Calculate inference time
print('Validation set inference time:', val_inference_time)


# Calculate average memory usage for prediction
def y_val_pred_func():
    """Predict the sampled validation row; wrapped so memory_usage can call it."""
    return reg_obj.predict(X_val_temp)


val_mem_usage = memory_usage(y_val_pred_func)  # Memory samples taken during the call
val_avg_mem_usage = np.mean(val_mem_usage)  # Calculate average memory usage
print('Validation set avg memory usage:', val_avg_mem_usage)

# Step 13: K-Fold cross-validation (K=10)

In [None]:
# Step 13: K-Fold cross-validation (K=10)
# cross_val_score is not among the notebook's top-level imports, so it is
# imported here; without this the call below raises NameError.
from sklearn.model_selection import cross_val_score

# 10 consecutive folds (KFold does not shuffle unless asked to).
k_fold = KFold(n_splits=10)  # Create K-Fold cross-validator
# Scores one fit per fold on the full feature/target data; with no `scoring`
# argument the estimator's own score method is used.
cv_score = cross_val_score(reg_obj, x, y, cv=k_fold)  # Perform cross-validation
print("Cross-validation scores:", cv_score)  # Print cross-validation scores