In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Reading the dataset file:
df = pd.read_csv('data_2024_dontation.csv')
# Setting the option to display 100 rows:
pd.set_option('display.max_rows', 100)

In [3]:
df.shape

(448, 9)

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 448 entries, 0 to 447
Data columns (total 9 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   Drop Off Location                                      448 non-null    object
 1   Stake                                                  448 non-null    object
 2   Route Number/Name                                      448 non-null    object
 3   Time Spent Collecting Donations                        448 non-null    object
 4   # of Adult Volunteers who participated in this route   448 non-null    int64 
 5   # of Youth Volunteers who participated in this route
  448 non-null    int64 
 6   # of Doors in Route                                    448 non-null    int64 
 7   # of Donation Bags Collected                           448 non-null    int64 
 8   total number of volunteers                             448 no

In [5]:
df.duplicated().sum()

2

In [6]:
df.isnull().sum()

Unnamed: 0,0
Drop Off Location,0
Stake,0
Route Number/Name,0
Time Spent Collecting Donations,0
# of Adult Volunteers who participated in this route,0
# of Youth Volunteers who participated in this route\n,0
# of Doors in Route,0
# of Donation Bags Collected,0
total number of volunteers,0


In [7]:
df['Route Number/Name'].unique()


array(['676', '0', 'Unassigned', '50', '98', '6', '19', '7',
       'Meyokumin 25 ', '32', 'Route 1', '2', '1', 'Strathearn North',
       '64', '21', '10', '28/konietz', '58', '18', 'Caernarvon A', '30',
       'Ritchie 4', 'Mill Creek 2', 'Fulton Place #1B', 'Sloan', '37',
       '13', '169', '9', '3-3-1a', '3', '46', '49th street',
       'Laurel Park', 'Terrace Heights #3 (Poll 66)', '25', 'Ritchie 2',
       '16', '33', '8', '42', '5 2.2', '182', '54', 'Goldbar 5', '11',
       '4-3.2', '5', '4-1.1 ', 'Calder C', '63', '36', '73', '134',
       'Ottwell #3 (poll 48)', '22', '74',
       'Royal Gardens # / Gavin Speidwl', '113A, 133 street 131 ave',
       '52 and 52', '41', 'Poll 47 ', 'AG1', '56', '86/laurel', '66',
       'Forest height #4 (poll 68)', 'Hudson c', '23', '197/210', '24',
       '125', '123', '12', 'Howard Biggs', '3-2.4b', '15', 'Strathcona',
       'Gurney', '19,23', 'Cumberland E', '162 , 170', '62, 48, 61', '17',
       '52', '2-1.1', 'Not specified ', 'Strathc

In [8]:
df['Stake'].unique()

array(['Riverbend Stake', 'Gateway Stake', 'Bonnie Doon Stake',
       'Edmonton North Stake', 'YSA Stake'], dtype=object)

In [9]:
def generate_route_names(df):
    """Generates new route names based on stakes."""
    route_counters = {}  # Store counters for each stake
    new_route_names = []

    for index, row in df.iterrows():
        stake = row['Stake']
        if stake not in route_counters:
            route_counters[stake] = 1  # Initialize counter for new stake
        else:
            route_counters[stake] += 1  # Increment counter for existing stake

        new_route_names.append(f"{stake} {route_counters[stake]}")  # Create new route name

    df['New Route Number/Name'] = new_route_names  # Add new column to DataFrame
    return df

# Apply the function to generate new route names
df = generate_route_names(df)

# Print the updated DataFrame (optional)
print(df)

            Drop Off Location              Stake   Route Number/Name  \
0             Bearspaw Chapel    Riverbend Stake                 676   
1             Bearspaw Chapel      Gateway Stake                   0   
2          Londonberry Chapel  Bonnie Doon Stake          Unassigned   
3       Gateway Stake Centre       Gateway Stake                  50   
4    Bonnie Doon Stake Centre  Bonnie Doon Stake                  98   
..                        ...                ...                 ...   
443    Riverbend Stake Centre    Riverbend Stake        Greenfield 7   
444    Riverbend Stake Centre    Riverbend Stake                  70   
445  Bonnie Doon Stake Centre  Bonnie Doon Stake  King Edward Park 4   
446    Riverbend Stake Centre    Riverbend Stake        Greenfield 3   
447    Riverbend Stake Centre    Riverbend Stake                  56   

    Time Spent Collecting Donations  \
0                    0 - 30 Minutes   
1                    0 - 30 Minutes   
2                 

In [11]:
# Group by 'Stake' and get the size of each group (number of routes)
route_counts = df.groupby('Stake')['Route Number/Name'].nunique()

# Create a new column 'TotalRoutes' and assign the route counts using map
df['TotalRoutes'] = df['Stake'].map(route_counts)

# Print the updated DataFrame (optional) to see the new column
print(df[['Stake', 'TotalRoutes']].head())  # Display the first few rows with stake and total routes

               Stake  TotalRoutes
0    Riverbend Stake           94
1      Gateway Stake           82
2  Bonnie Doon Stake          104
3      Gateway Stake           82
4  Bonnie Doon Stake          104


In [12]:
# Group by 'Route Number/Name' and calculate the average '# of Doors in Route' for each group
average_doors_per_route = df.groupby('New Route Number/Name')['# of Doors in Route'].mean()

# Create a new column 'AvgDoorsPerRoute' and assign the average doors for each route using map
df['AvgDoorsPerRoute'] = df['New Route Number/Name'].map(average_doors_per_route)

# Print the updated DataFrame to see the new column
print(df[['New Route Number/Name', 'AvgDoorsPerRoute']].head())  # Display the first few rows with route and average doors

  New Route Number/Name  AvgDoorsPerRoute
0     Riverbend Stake 1              78.0
1       Gateway Stake 1               0.0
2   Bonnie Doon Stake 1               1.0
3       Gateway Stake 2              20.0
4   Bonnie Doon Stake 2              20.0


In [12]:
# # Define a dictionary to map time ranges to their average values in minutes
# time_range_mapping = {
#     '0-30': 15,  # Average of 0 and 30
#     '30-60': 45,  # Average of 30 and 60
# }

# # Replace time ranges with their average values in the 'Time Spent Collecting Donations' column
# df['Time Spent Collecting Donations'] = df['Time Spent Collecting Donations'].map(time_range_mapping)

In [15]:
def convert_time_to_minutes(time_str):
    """Converts time string to minutes.

    Args:
        time_str: The time string to convert, e.g., '0 - 30 Minutes', '1 Hour', etc.

    Returns:
        The time in minutes as an integer.
    """
    if '-' in time_str:  # Handle ranges like '0 - 30 Minutes'
        lower, upper = time_str.split('-')
        lower = int(lower.strip().split()[0])  # Extract lower bound
        # Extract upper bound, handling potential floats
        upper_str = upper.strip().split()[0]
        upper = int(float(upper_str)) if '.' in upper_str else int(upper_str)
        # Returning the average of the range
        return (lower + upper) // 2
    elif 'Hour' in time_str:  # Handle '1 Hour', '2 Hours', etc.
        # Extract only digits from the beginning of the string
        hours_str = ''.join(filter(str.isdigit, time_str.strip().split()[0]))
        hours = int(hours_str) if hours_str else 0  # Handle empty string case
        return hours * 60
    elif 'Minutes' in time_str: # Handle '30 Minutes', '45 Minutes', etc.
        minutes = int(time_str.strip().split()[0])
        return minutes
    else:
        # Handle unexpected formats (you can customize this)
        return None  # Or raise an exception, etc.

# Apply the function to the 'Time Spent Collecting Donations' column
df['Time Spent Collecting Donations'] = df['Time Spent Collecting Donations'].apply(convert_time_to_minutes)

# Convert the column to int64 after applying the function
df['Time Spent Collecting Donations'] = df['Time Spent Collecting Donations'].astype('Int64') # Int64 can handle NaN values

In [16]:
print(df['Time Spent Collecting Donations'].dtype)

Int64


In [18]:
average_time_per_door = df['Time Spent Collecting Donations'].sum() / df['# of Doors in Route'].sum()

print(f"Overall average time per door: {average_time_per_door:.2f} minutes")

Overall average time per door: 0.18 minutes


In [23]:
# Group by 'New Route Number/Name' and calculate the total donation volume for each route
route_donation_volume = df.groupby('New Route Number/Name')['# of Donation Bags Collected'].sum()

# Create a new column 'TotalDonationVolume' and assign the total donation volume for each route using map
df['TotalDonationVolume'] = df['New Route Number/Name'].map(route_donation_volume)

In [25]:
# Group by 'New Route Number/Name' and calculate the total number of volunteers for each route
route_volunteer_count = df.groupby('New Route Number/Name')['total number of volunteers '].sum()

# Create a new column 'TotalVolunteers' and assign the total volunteers for each route using map
df['TotalVolunteers'] = df['New Route Number/Name'].map(route_volunteer_count)

In [27]:
df.to_csv('data_2024_donation_processed_2.csv', index=False)

In [19]:
df['New Route Number/Name'].head(50)


Unnamed: 0,New Route Number/Name
0,Riverbend Stake 1
1,Gateway Stake 1
2,Bonnie Doon Stake 1
3,Gateway Stake 2
4,Bonnie Doon Stake 2
5,Gateway Stake 3
6,Gateway Stake 4
7,Bonnie Doon Stake 3
8,Gateway Stake 5
9,Bonnie Doon Stake 4


In [15]:
# Check if there are any rows with 'YSA' in the 'Stake' column:
ysa_count = df[df['Stake'] == 'YSA Stake'].shape[0]
print(f"Number of rows with 'YSA' stake: {ysa_count}")

# If ysa_count is 0, then there are no rows with 'YSA' in the 'Stake' column.
if ysa_count == 0:
    print("There are no rows with 'YSA' in the 'Stake' column. "
          "Please check the values in your DataFrame.")
else:
    # Filter the DataFrame to include only rows where the stake is "YSA"
    ysa_df = df[df['Stake'] == 'YSA']

    # Get all route numbers for the YSA stake
    ysa_route_numbers = ysa_df['New Route Number/Name']

    # Print the route numbers
    print(ysa_route_numbers)

Number of rows with 'YSA' stake: 4
Series([], Name: New Route Number/Name, dtype: object)


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 448 entries, 0 to 447
Data columns (total 12 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Drop Off Location                                      448 non-null    object 
 1   Stake                                                  448 non-null    object 
 2   Route Number/Name                                      448 non-null    object 
 3   Time Spent Collecting Donations                        0 non-null      float64
 4   # of Adult Volunteers who participated in this route   448 non-null    int64  
 5   # of Youth Volunteers who participated in this route
  448 non-null    int64  
 6   # of Doors in Route                                    448 non-null    int64  
 7   # of Donation Bags Collected                           448 non-null    int64  
 8   total number of volunteers                        

In [13]:
# Group by 'Route Number/Name' and calculate the average '# of Doors in Route' for each group
average_doors_per_route = df.groupby('New Route Number/Name')['# of Doors in Route'].mean()

# Create a new column 'AvgDoorsPerRoute' and assign the average doors for each route using map
df['AvgDoorsPerRoute'] = df['New Route Number/Name'].map(average_doors_per_route)

# Print the updated DataFrame to see the new column
print(df[['New Route Number/Name', 'AvgDoorsPerRoute']].head())  # Display the first few rows with route and average doors

  New Route Number/Name  AvgDoorsPerRoute
0     Riverbend Stake 1              78.0
1       Gateway Stake 1               0.0
2   Bonnie Doon Stake 1               1.0
3       Gateway Stake 2              20.0
4   Bonnie Doon Stake 2              20.0


In [15]:
# import pandas as pd

# # Convert the columns to numeric, handling potential errors
# # df['Time Spent Collecting Donations'] = pd.to_numeric(df['Time Spent Collecting Donations'], errors='coerce')
# df['# of Doors in Route'] = pd.to_numeric(df['# of Doors in Route'], errors='coerce')

# # Clean the 'Time Spent Collecting Donations' column before converting to timedelta
# # This will remove any trailing or embedded negative signs
# df['Time Spent Collecting Donations'] = df['Time Spent Collecting Donations'].str.replace(r'(?!^)-', '', regex=True)

# # Now convert to timedelta
# df['Time Spent Collecting Donations'] = pd.to_timedelta(df['Time Spent Collecting Donations'], errors='coerce')
# df['Time Spent Collecting Donations'] = (df['Time Spent Collecting Donations'].dt.total_seconds() / 60).astype(int)


# # Remove rows with missing values in the relevant columns
# # df = df.dropna(subset=['Time Spent Collecting Donations', '# of Doors in Route'])

# # Calculate the overall average time per door
# average_time_per_door = df['Time Spent Collecting Donations'].sum() / df['# of Doors in Route'].sum()

# # Create a new column 'AvgTimePerRouteBasedOnDoors'
# df['AvgTimePerRouteBasedOnDoors'] = df['AvgDoorsPerRoute'] * average_time_per_door

In [14]:
df.to_csv('data_2024_donation_processed_1.csv', index=False)