### Group 1: Year-on-Year Comparison (2023 vs 2024)
Focus: Perform a year-on-year comparison between the 2023 and 2024 food drives, analyzing donations, and route completion times.
ML Task: Comparative Analysis
Objective: Build a model that predicts how donation predictions from 2023 align with 2024.

In [313]:
#importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import joblib
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error,root_mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor

In [314]:
#load the 2024 dataset as the test dataset
data_2024 = pd.read_csv('cleaned_data_2024.csv', encoding='latin1')

In [315]:
#load the 2023 dataset as the test dataset
data_2023 = pd.read_csv('cleaned_data_2023.csv', encoding='latin1')

In [316]:
data_2024.head()

Unnamed: 0,date,drop_off_location,stake,route_number/name,time_spent_collecting_donations,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,#_of_doors_in_route,#_of_donation_bags_collected,did_you_complete_more_than_1_route?,Number of routes completed,ward,Form Completion Time,Total Volunteers
0,2024-09-28 10:01,Londonberry Chapel,Bonnie Doon Stake,Unassigned,0 - 30 Minutes,1,0,1,1,No,1,Clareview Ward,3 minutes 43 seconds,1
1,2024-09-28 10:31,Gateway Stake Centre,Gateway Stake,50,0 - 30 Minutes,2,2,20,20,No,1,Lee Ridge Ward,2 minutes 41 seconds,4
2,2024-09-28 10:33,Bonnie Doon Stake Centre,Bonnie Doon Stake,98,0 - 30 Minutes,2,2,20,15,No,1,Forest Heights Ward,3 minutes 27 seconds,4
3,2024-09-28 10:41,Bearspaw Chapel,Gateway Stake,6,30 - 60 Minutes,2,3,144,25,Yes,2,Lee Ridge Ward,2 minutes 3 seconds,5
4,2024-09-28 10:36,Gateway Stake Centre,Gateway Stake,19,30 - 60 Minutes,1,0,230,21,No,1,Silver Berry Ward,8 minutes 58 seconds,1


In [317]:
data_2023.head()

Unnamed: 0,Date,Location,Stake,# of Adult Volunteers,# of Youth Volunteers,Donation Bags Collected,Time to Complete (min),Completed More Than One Route,Ward/Branch,Routes Completed,Doors in Route,Time Spent
0,2023-09-23 10:36:10,Londonderry Chapel,Bonnie Doon Stake,1,3,14,25.0,0,Clareview Ward,2,14,25.0
1,2023-09-23 10:36:52,Gateway Stake Centre,Gateway Stake,3,1,18,25.0,1,Crawford Plains Ward,2,144,30.0
2,2023-09-23 10:40:22,Gateway Stake Centre,Gateway Stake,2,0,20,30.0,0,Silver Berry Ward,2,186,30.0
3,2023-09-23 10:41:08,Gateway Stake Centre,Gateway Stake,2,0,20,25.0,1,Crawford Plains Ward,1,194,30.0
4,2023-09-23 10:44:10,Londonderry Chapel,Bonnie Doon Stake,1,0,2,5.0,0,Londonderry Ward,2,1,-30.269048



Feature Engineering

In [318]:
data_2024.columns

Index(['date', 'drop_off_location', 'stake', 'route_number/name',
       'time_spent_collecting_donations',
       '#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route',
       '#_of_doors_in_route', '#_of_donation_bags_collected',
       'did_you_complete_more_than_1_route?', 'Number of routes completed',
       'ward', 'Form Completion Time', 'Total Volunteers'],
      dtype='object')

In [319]:
data_2023.columns

Index(['Date', 'Location', 'Stake', '# of Adult Volunteers',
       '# of Youth Volunteers', 'Donation Bags Collected',
       'Time to Complete (min)', 'Completed More Than One Route',
       'Ward/Branch', 'Routes Completed', 'Doors in Route', 'Time Spent'],
      dtype='object')

In [320]:
#drop the time spent in 2023 data

data_2023 = data_2023.drop(columns = ['Time Spent', 'Date'])

In [321]:
data_2024 = data_2024.drop(columns = ['Form Completion Time', 'date', 'route_number/name', 'Total Volunteers'])

In [322]:
# Renaming columns in data_2023 to match data_2024
data_2023 = data_2023.rename(columns={

    'Location': 'drop_off_location',
    'Stake': 'stake',
    'Time to Complete (min)': 'time_spent_collecting_donations',
    'Number of routes completed': 'Number of routes completed',
    'Doors in Route': '#_of_doors_in_route',
    'Donation Bags Collected': '#_of_donation_bags_collected',
    'Routes Completed': 'Number of routes completed',
    'Ward/Branch': 'ward',
    '# of Adult Volunteers': '#_of_adult_volunteers_who_participated_in_this_route',
    '# of Youth Volunteers': '#_of_youth_volunteers_who_participated_in_this_route',
    'Completed More Than One Route': 'did_you_complete_more_than_1_route?'


})



In [323]:
data_2023.columns

Index(['drop_off_location', 'stake',
       '#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route',
       '#_of_donation_bags_collected', 'time_spent_collecting_donations',
       'did_you_complete_more_than_1_route?', 'ward',
       'Number of routes completed', '#_of_doors_in_route'],
      dtype='object')

In [324]:
data_2024.columns

Index(['drop_off_location', 'stake', 'time_spent_collecting_donations',
       '#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route',
       '#_of_doors_in_route', '#_of_donation_bags_collected',
       'did_you_complete_more_than_1_route?', 'Number of routes completed',
       'ward'],
      dtype='object')

In [325]:
data_2024['time_spent_collecting_donations'].unique()

array(['0 - 30 Minutes', '30 - 60 Minutes', '60 - 90 Minutes',
       '90 - 120 Minutes'], dtype=object)

In [326]:
data_2023['time_spent_collecting_donations'].unique()

array([ 25. ,  30. ,   5. ,  20. ,  45. ,  60. ,  40. ,  55. ,  63. ,
       180. ,  32. ,  50. ,  35. , 100. ,  17. ,  70. ,  58. , 120. ,
        75. ,  15. ,  90. ,  47. ,  10. ,   1. , 105. ,   3. ,  52. ,
        44. , 110. , 150. , 245. ,  49. ,  51. ,  28. ,  24. ,  80. ,
        57. ,  18.5,  34. ,  37. ,  23. ,  27. ,  65. , 240. , 115. ,
        62. ,  69. ,  26. ,  38. , 125. , 140. , 170. , 165. , 135. ,
       112. ,  72. ,   2. ])

Transform the 'time spent collecting donations' column in 2024 into single numbers

In [327]:
data_2024['time_spent_collecting_donations'] = data_2024['time_spent_collecting_donations'].str.replace('0 - 30 Minutes', '15') \
                                                                                               .str.replace('30 - 60 Minutes', '45') \
                                                                                               .str.replace('60 - 90 Minutes', '75') \
                                                                                               .str.replace('90 - 120 Minutes', '105')


Filter 2023 and 2024 datasets to contain the same wards information

In [328]:
# Get unique wards in both datasets
data_2023_wards = set(data_2023['ward'].unique())
data_2024_wards = set(data_2024['ward'].unique())

# Find wards that appear in both datasets
common_wards = data_2023_wards.intersection(data_2024_wards)

# Filter data_2023_copy and data_2024_copy to keep only rows with common wards
data_2023 = data_2023[data_2023['ward'].isin(common_wards)].reset_index(drop=True)
data_2024 = data_2024[data_2024['ward'].isin(common_wards)].reset_index(drop=True)


## **Predictions for number of donation bags collected per route**

Train-test split

In [329]:
# Use only 2023 as the train model
X_2023 = data_2023[['#_of_adult_volunteers_who_participated_in_this_route', '#_of_youth_volunteers_who_participated_in_this_route', 'ward', 'time_spent_collecting_donations',  'Number of routes completed', '#_of_doors_in_route']]
y_2023 = data_2023['#_of_donation_bags_collected']

# Use only 2024 for testing
X_2024 = data_2024[['#_of_adult_volunteers_who_participated_in_this_route', '#_of_youth_volunteers_who_participated_in_this_route', 'time_spent_collecting_donations', 'ward', 'Number of routes completed', '#_of_doors_in_route']]
y_2024 = data_2024['#_of_donation_bags_collected']

X_train = X_2023
y_train = y_2023
X_test = X_2024
y_test = y_2024

Encoding categorical variables

In [330]:

# Initialize OneHotEncoder with handle_unknown='ignore' to avoid errors for missing categories in test data
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform on the training data, then transform on the test data
encoded_ward_train = ohe.fit_transform(X_train[['ward']])
encoded_ward_test = ohe.transform(X_test[['ward']])

# Convert encoded arrays to DataFrames with column names
encoded_ward_train_df = pd.DataFrame(encoded_ward_train, columns=ohe.get_feature_names_out(['ward']), dtype=int)
encoded_ward_test_df = pd.DataFrame(encoded_ward_test, columns=ohe.get_feature_names_out(['ward']), dtype=int)

# Align columns in both train and test sets to ensure they match
encoded_ward_train_df, encoded_ward_test_df = encoded_ward_train_df.align(encoded_ward_test_df, fill_value=0, axis=1)

# Drop the original 'ward' column from X_train and X_test
X_train = X_train.drop(columns=['ward']).reset_index(drop=True)
X_test = X_test.drop(columns=['ward']).reset_index(drop=True)

# Concatenate the original DataFrames with the encoded columns
X_train = pd.concat([X_train, encoded_ward_train_df], axis=1)
X_test = pd.concat([X_test, encoded_ward_test_df], axis=1)



In [331]:
# Standardize features (required for KNN)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [332]:
# Fit a regression model to predict the number of donation bags
model = LinearRegression()
model.fit(X_train, y_train)

# Continue with model training and prediction
y_pred = model.predict(X_test)


# Evaluate the model using root mean squared error
rmse = root_mean_squared_error(y_test, y_pred)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 36.118284728985664


Implementing other models

In [333]:

# Define models and their parameters
models = {
    "Decision Tree": DecisionTreeRegressor(max_depth=5),
    "K-Nearest Neighbors (KNN)": KNeighborsRegressor(n_neighbors=5),
    "Random Forest": RandomForestRegressor(n_estimators=100),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
}

# Train, predict, and evaluate models
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    # Print evaluation results
    print(f"\n{model_name} Performance:")
    print(f"R-squared: {r2:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")



Decision Tree Performance:
R-squared: -0.4321
Mean Absolute Error (MAE): 15.9154
Mean Squared Error (MSE): 1654.4077
Root Mean Squared Error (RMSE): 40.6744

K-Nearest Neighbors (KNN) Performance:
R-squared: 0.0419
Mean Absolute Error (MAE): 13.9388
Mean Squared Error (MSE): 1106.8264
Root Mean Squared Error (RMSE): 33.2690

Random Forest Performance:
R-squared: 0.1277
Mean Absolute Error (MAE): 13.5533
Mean Squared Error (MSE): 1007.7201
Root Mean Squared Error (RMSE): 31.7446

Gradient Boosting Performance:
R-squared: 0.1110
Mean Absolute Error (MAE): 13.7849
Mean Squared Error (MSE): 1026.9826
Root Mean Squared Error (RMSE): 32.0466


### **Cross Validation**




In [334]:
import joblib  # For saving the best model

# Define the models to evaluate
models = [
    ("Decision Tree", DecisionTreeRegressor()),
    ("K-Nearest Neighbors (KNN)", KNeighborsRegressor()),
    ("Random Forest", RandomForestRegressor()),
    ("Gradient Boosting", GradientBoostingRegressor())
]

# Variables to track the best model and its metrics
best_model_name = None
best_model = None
best_rmse = float("inf")  # Start with a high RMSE value

# K-fold cross-validation and evaluation
for name, model in models:
    # Cross-validation for multiple metrics
    cv_results_rmse = np.abs(
        cross_val_score(model, X_train, y_train, cv=KFold(n_splits=5, shuffle=True, random_state=42), scoring='neg_root_mean_squared_error')
    )
    cv_results_mae = cross_val_score(
        model, X_train, y_train, cv=KFold(n_splits=5, shuffle=True, random_state=42), scoring='neg_mean_absolute_error'
    )
    cv_results_r2 = cross_val_score(
        model, X_train, y_train, cv=KFold(n_splits=5, shuffle=True, random_state=42), scoring='r2'
    )
    cv_results_mse = np.abs(
        cross_val_score(model, X_train, y_train, cv=KFold(n_splits=5, shuffle=True, random_state=42), scoring='neg_mean_squared_error')
    )

    # Calculate metrics
    rmse = cv_results_rmse.mean()
    mae = -cv_results_mae.mean()
    r2 = cv_results_r2.mean()
    mse = -cv_results_mse.mean()

    # Print the results
    print(f"{name} Performance:")
    print(f"R-squared: {r2:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print("-" * 50)  # Separator for clarity

    # Update the best model if the current RMSE is lower
    if rmse < best_rmse:
        best_rmse = rmse
        best_model_name = name
        best_model = model

# Fit the best model on the entire training dataset
best_model.fit(X_train, y_train)

# Save the best model to a file
donation_bags_prediction_model = f"best_model_{best_model_name.replace(' ', '_')}.pkl"
joblib.dump(best_model, donation_bags_prediction_model)

# Print the best model's performance
print(f"\nBest Model: {best_model_name}")
print(f"Best RMSE: {best_rmse:.4f}")
print(f"The best model has been saved to {donation_bags_prediction_model}")


Decision Tree Performance:
R-squared: -0.6378
Mean Absolute Error (MAE): 15.3797
Mean Squared Error (MSE): -1665.0292
Root Mean Squared Error (RMSE): 37.3251
--------------------------------------------------
K-Nearest Neighbors (KNN) Performance:
R-squared: 0.4079
Mean Absolute Error (MAE): 13.3231
Mean Squared Error (MSE): -969.8987
Root Mean Squared Error (RMSE): 26.3879
--------------------------------------------------
Random Forest Performance:
R-squared: 0.3638
Mean Absolute Error (MAE): 12.2463
Mean Squared Error (MSE): -937.4018
Root Mean Squared Error (RMSE): 25.8353
--------------------------------------------------
Gradient Boosting Performance:
R-squared: 0.2418
Mean Absolute Error (MAE): 12.2407
Mean Squared Error (MSE): -1054.7820
Root Mean Squared Error (RMSE): 27.5158
--------------------------------------------------

Best Model: Random Forest
Best RMSE: 25.8353
The best model has been saved to best_model_Random_Forest.pkl


Hyperparameter Tuning of the models

In [337]:


# Define models and their parameter grids
models = {
    'KNN': {
        'model': KNeighborsRegressor(),
        'params': {
            'knn__n_neighbors': [3, 5, 7, 9, 11],
            'knn__weights': ['uniform', 'distance'],
            'knn__metric': ['euclidean', 'manhattan']
        }
    },
    'RandomForest': {
        'model': RandomForestRegressor(),
        'params': {
            'randomforest__n_estimators': [100, 150, 200],
            'randomforest__max_depth': [None, 10, 20],
            'randomforest__min_samples_split': [2, 5],
            'randomforest__min_samples_leaf': [1, 2],
            'randomforest__max_features': ['sqrt', 'log2'],
            'randomforest__bootstrap': [True, False]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingRegressor(),
        'params': {
            'gradientboosting__n_estimators': [100, 150, 200],
            'gradientboosting__learning_rate': [0.1, 0.05, 0.01],
            'gradientboosting__max_depth': [3, 5, 7],
            'gradientboosting__min_samples_split': [2, 5],
            'gradientboosting__min_samples_leaf': [1, 2],
            'gradientboosting__max_features': ['sqrt', 'log2']
        }
    },
    'DecisionTree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'decisiontree__max_depth': [None, 10, 20, 30],
            'decisiontree__splitter': ['best', 'random']
        }
    }
}

# Store results
best_models = {}
results = {}

# Iterate over models and perform GridSearchCV
for name, config in models.items():
    print(f"Training model: {name}...")
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        (name.lower(), config['model'])
    ])

    grid_search = GridSearchCV(
        pipeline,
        config['params'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=2,
        error_score=np.nan  # Skip invalid configurations
    )

    try:
        grid_search.fit(X_train, y_train)
        # Store the best model and metrics if successful
        best_model = grid_search.best_estimator_
        best_rmse = np.sqrt(-grid_search.best_score_)  # RMSE from neg MSE
        y_pred = best_model.predict(X_test)

        # Calculate other metrics
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Store results
        best_models[name] = best_model
        results[name] = {
            'best_params': grid_search.best_params_,
            'best_rmse': best_rmse,
            'mae': mae,
            'mse': mse,
            'r2': r2
        }
    except ValueError as e:
        print(f"GridSearchCV error for {name}: {e}")

# Define a custom scoring function (lower score is better)
def custom_score(result):
    return result['best_rmse'] + (1 - result['r2'])

# Select the best model based on the custom scoring function
best_model_name = min(results, key=lambda k: custom_score(results[k]))

# Make predictions with the best model
y_pred_best = best_models[best_model_name].predict(X_test)

# Print results for all models
for name, result in results.items():
    print(f"{name} - RMSE: {result['best_rmse']:.4f}, MAE: {result['mae']:.4f}, MSE: {result['mse']:.4f}, R²: {result['r2']:.4f}")

# Print the best model's name and performance
print(f"\nBest model based on custom scoring: {best_model_name}")
print(f"Best RMSE: {results[best_model_name]['best_rmse']:.4f}, Best R²: {results[best_model_name]['r2']:.4f}")


Training model: KNN...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Training model: RandomForest...
Fitting 5 folds for each of 144 candidates, totalling 720 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Training model: GradientBoosting...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Training model: DecisionTree...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
KNN - RMSE: 30.8188, MAE: 14.5556, MSE: 1151.8234, R²: 0.0030
RandomForest - RMSE: 29.2327, MAE: 13.1913, MSE: 984.2906, R²: 0.1480
GradientBoosting - RMSE: 28.2119, MAE: 14.7541, MSE: 1014.0372, R²: 0.1222
DecisionTree - RMSE: 39.6409, MAE: 15.6260, MSE: 1655.4045, R²: -0.4329

Best model based on custom scoring: GradientBoosting
Best RMSE: 28.2119, Best R²: 0.1222
