# Import libraries

In [None]:
# K-Nearest Neighbors (KNN) Regression Model

# Import necessary libraries for data handling, visualization, and model evaluation
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import files
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from memory_profiler import memory_usage
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import (
    r2_score, mean_absolute_error,
    mean_squared_error, mean_absolute_percentage_error
)

# Step 1: Dataset upload

In [None]:
# Step 1: Dataset upload
upload = files.upload()  # Prompt the user to upload the dataset
df = pd.read_csv('All_Div_BD.csv')  # Load dataset into a DataFrame
# The DataFrame is named `df` (the original referenced an undefined `dataset`).
# print() is needed because a bare expression in the middle of a cell is not displayed.
print(df.shape)  # Print the shape of the dataset
df.head(5)  # Display the first 5 rows for preview (last expression of the cell)

# Step 2: Exclude nighttime data (hours outside 7 AM - 7 PM)

In [None]:
# Step 2: Exclude nighttime data (hours outside 7 AM - 7 PM)
# Select the rows whose hour is below 7 or above 19, then remove them by index label
is_before_7 = df['hour'] < 7
is_after_19 = df['hour'] > 19
after_midnight_data = df.loc[is_before_7]
till_midnight_data = df.loc[is_after_19]
# Drop the early-morning rows first, then the late-evening rows
df = df.drop(index=after_midnight_data.index).drop(index=till_midnight_data.index)

# Step 3: Shuffle the dataset

In [None]:
# Step 3: Shuffle the dataset
# Sampling the whole frame (frac=1.0) returns every row in a new order;
# the fixed random_state makes that order reproducible between runs.
df = df.sample(frac=1.0, random_state=42)

# Step 4: Correlation matrix visualization

In [None]:
# Step 4: Correlation matrix visualization
# Analyze correlations between features
cor_matrix = df.corr()
features = cor_matrix.index  # Names of the columns present in the correlation matrix

# Register a custom font so the figure text is rendered in Arial
font_upload = files.upload()  # Upload custom font file
font_path = '/content/arial.ttf'  # Path to the uploaded font file
fm.fontManager.addfont(font_path)  # Add font to Matplotlib
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Arial']
plt.rcParams['font.size'] = 15

plt.figure(figsize=(10, 10))
# Reuse the matrix computed above; df[features].corr() would recompute the same values
sns.heatmap(cor_matrix, annot=True)  # Generate heatmap with the coefficient written in each cell
plt.show()

# Step 5: Define dependent (target) and independent (features) variables


In [None]:
# Step 5: Define dependent (target) and independent (features) variables
# The date/time columns and the target itself are left out of the feature matrix
non_feature_columns = ['year', 'month', 'day', 'hour', 'wbgt']
x = df.drop(columns=non_feature_columns).values  # Feature matrix as a NumPy array
y = df['wbgt'].values  # Target variable
print(x.shape)  # Print the shape of the independent variables
colms = x.shape[1]  # Number of features (columns), used later for adjusted R2
print(colms)  # Output the number of features

# Step 6: Train, test, and validation split

In [None]:
# Step 6: Train, test, and validation split
# Hold out 20% of the data as the test set; the remaining 80% forms the training pool
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
# Take 10% of the training pool as the validation set. test_size is relative to
# x_train, so this is roughly 8% of all rows for validation and 72% for training.
x_train_temp, x_val, y_train_temp, y_val = train_test_split(
    x_train, y_train, test_size=0.1, random_state=0
)
# Row counts are kept for the adjusted R2 computation
val_rows = len(x_val)  # Number of rows in the validation set
test_rows = len(x_test)  # Number of rows in the test set
print(x_val.shape)  # Print the shape of the validation set
print(x_test.shape)  # Print the shape of the test set
print(val_rows)
print(test_rows)

# Step 7: Model training with KNN Regressor

In [None]:
# Step 7: Model training with KNN Regressor
# Build a KNN model with 605 neighbors and fit it on the reduced training split
# (the validation rows were removed from it in Step 6); fit() returns the estimator itself.
knn_regressor = KNeighborsRegressor(n_neighbors=605).fit(x_train_temp, y_train_temp)

# Step 8: Predictions for test and validation sets

In [None]:
# Step 8: Predictions for test and validation sets
# Predict WBGT with the fitted model: first for the test set, then for the validation set
y_pred, y_val_pred = (
    knn_regressor.predict(features_split) for features_split in (x_test, x_val)
)

# Step 9: Performance and error metrics

In [None]:
# Step 9: Performance and error metrics
# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - 1 - p), with n rows and p = colms features

# Test set
r2_test = r2_score(y_test, y_pred)  # R2 score
test_dof = test_rows - 1
adjusted_r2_test = 1 - ((1 - r2_test) * test_dof / (test_dof - colms))  # Adjusted R2 score
mse_test = mean_squared_error(y_test, y_pred)  # Computed once, reused for RMSE
print("R2 score: %f" % r2_test)
print("Adjusted R2 score: %f" % adjusted_r2_test)
print("MAE: %f" % mean_absolute_error(y_test, y_pred))
print("MSE: %f" % mse_test)
print("RMSE: %f" % np.sqrt(mse_test))

# Validation set
r2_val = r2_score(y_val, y_val_pred)  # R2 score
val_dof = val_rows - 1
adjusted_r2_val = 1 - ((1 - r2_val) * val_dof / (val_dof - colms))  # Adjusted R2 score
mse_val = mean_squared_error(y_val, y_val_pred)  # Computed once, reused for RMSE
print("R2 score: %f" % r2_val)
print("Adjusted R2 score: %f" % adjusted_r2_val)
print("MAE: %f" % mean_absolute_error(y_val, y_val_pred))
print("MSE: %f" % mse_val)
print("RMSE: %f" % np.sqrt(mse_val))

# Step 10: Scatter plot of Actual vs Predicted values

In [None]:
# Step 10: Scatter plot of Actual vs Predicted values
plt.scatter(y_test, y_pred, alpha=1)  # One point per test row: x = actual, y = predicted
# Line of equality (y = x). Both axes must use the same end points; the original
# paired the actual range with the predicted range, which is not the y = x line
# whenever the two ranges differ.
line_min = min(np.min(y_test), np.min(y_pred))
line_max = max(np.max(y_test), np.max(y_pred))
plt.plot([line_min, line_max], [line_min, line_max], color='red', linewidth=2)
# Legend entries are matched to the artists in drawing order: scatter first, then the line
plt.legend(["Predicted", "Actual"], loc="upper left")
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('WBGT')
plt.show()  # Render explicitly, as the heatmap cell does

# Step 11: Inference time and memory usage

In [None]:
# Step 11: Inference time and memory usage
# NOTE: the single-row predictions made here are intentionally NOT stored in
# y_pred / y_val_pred. Overwriting those would replace the full-set predictions
# from Step 8 and break the metrics and scatter-plot cells if they are re-run.

# Test set
x_test_temp_index = random.randrange(0, len(x_test))  # Choose a random row
x_test_temp = x_test[x_test_temp_index].reshape(1, -1)  # Single row as a 2-D array for predict()
# Time only the predict() call (row selection is done above, outside the timed
# interval); perf_counter() is the stdlib clock intended for measuring short durations.
test_initial_time = time.perf_counter()
knn_regressor.predict(x_test_temp)  # Perform prediction; result discarded on purpose
test_final_time = time.perf_counter()
test_inference_time = test_final_time - test_initial_time  # Inference time in seconds
print('Test set inference time:', test_inference_time)


# Calculate average memory usage for prediction
def y_pred_func():
    """Predict for the sampled test row; this is the callable profiled by memory_usage."""
    return knn_regressor.predict(x_test_temp)


test_mem_usage = memory_usage(y_pred_func)  # Memory samples taken while the call runs
test_avg_mem_usage = np.mean(test_mem_usage)  # Average over the collected samples
print('Test set avg. memory usage:', test_avg_mem_usage)

# Validation set
x_val_temp_index = random.randrange(0, len(x_val))  # Choose a random row
x_val_temp = x_val[x_val_temp_index].reshape(1, -1)  # Single row as a 2-D array for predict()
val_initial_time = time.perf_counter()
knn_regressor.predict(x_val_temp)  # Perform prediction; result discarded on purpose
val_final_time = time.perf_counter()
val_inference_time = val_final_time - val_initial_time  # Inference time in seconds
print('Validation set inference time:', val_inference_time)


# Calculate average memory usage for prediction
def y_val_pred_func():
    """Predict for the sampled validation row; this is the callable profiled by memory_usage."""
    return knn_regressor.predict(x_val_temp)


val_mem_usage = memory_usage(y_val_pred_func)  # Memory samples taken while the call runs
val_avg_mem_usage = np.mean(val_mem_usage)  # Average over the collected samples
print('Validation set avg memory usage:', val_avg_mem_usage)

# Step 12: K-Fold cross-validation (K=10)

In [None]:
# Step 12: K-Fold cross-validation (K=10)
# Ten folds over the 80% training pool from Step 6 (validation rows included);
# a fresh, unfitted regressor with the same neighbor count is evaluated on each fold.
kf = KFold(n_splits=10)  # Create K-Fold cross-validator
cv_estimator = KNeighborsRegressor(n_neighbors=605)
cv_score = cross_val_score(cv_estimator, x_train, y_train, cv=kf)  # One score per fold
print("Cross-validation scores:", cv_score)  # Print cross-validation scores