In [1]:
import pandas as pd

file_path = r"C:\Users\Deshan\Documents\IIT LECS\Year 2 Sem 1\DSGP\dataset creation\final_updated_dataset_with_cor_analysis.csv"
df = pd.read_csv(file_path)

print(df.head())



   LATITUDE  LONGITUDE  ScaledCountPerMinute  ScaledCountPerKM  \
0  6.079940  80.932975             -0.405618         -0.084023   
1  6.076015  80.932945              0.096322          0.089396   
2  6.083702  80.932344              0.456689          0.029366   
3  6.096865  80.924300              1.309344         -0.128135   
4  6.096865  80.924300              1.309344         -0.128135   

   Season_Northeast Monsoon  Season_Southwest Monsoon  TimeOfDay_Morning  \
0                      True                     False              False   
1                      True                     False               True   
2                      True                     False              False   
3                      True                     False              False   
4                      True                     False              False   

   Month  DayOfWeek  OBSERVATION COUNT  
0      1          6                4.0  
1      1          0                4.0  
2      2          4    

In [2]:
features = [
    'LATITUDE',
    'LONGITUDE',
    'ScaledCountPerMinute',
    'ScaledCountPerKM',
    'Season_Northeast Monsoon',
    'Season_Southwest Monsoon',
    'TimeOfDay_Morning',
    'Month',
    'DayOfWeek'
]
target = 'OBSERVATION COUNT'

df = df.dropna(subset=['OBSERVATION COUNT'])

X = df[features]
y = df[target]

print(features)
print("\nTarget variable:", target)


['LATITUDE', 'LONGITUDE', 'ScaledCountPerMinute', 'ScaledCountPerKM', 'Season_Northeast Monsoon', 'Season_Southwest Monsoon', 'TimeOfDay_Morning', 'Month', 'DayOfWeek']

Target variable: OBSERVATION COUNT


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


X_train shape: (255, 9), y_train shape: (255,)
X_test shape: (64, 9), y_test shape: (64,)


In [4]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

#hyperparameter grid
param_grid = {
    'n_neighbors': [3, 5, 10, 20],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]  # Manhattan and Euclidean distance
}

knn_model = KNeighborsRegressor()

#GridSearchCV
grid_search = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='r2',
    n_jobs=-1,  # Use all available processors
    verbose=1
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

print("Best Hyperparameters:", grid_search.best_params_)
print("Best Model:", best_model)


Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best Hyperparameters: {'algorithm': 'auto', 'n_neighbors': 20, 'p': 1, 'weights': 'distance'}
Best Model: KNeighborsRegressor(n_neighbors=20, p=1, weights='distance')


In [5]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Performance Metrics:")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R² Score: {r2:.4f}")



Model Performance Metrics:
Mean Squared Error (MSE): 93.9711
R² Score: 0.3070
