In [87]:
import pandas as pd
import numpy as np

In [223]:
# dtype for every column loaded from the activities CSV.
# Each column appears exactly once: the previous version listed 'ProgrammeID'
# twice, and a repeated key in a dict literal silently overwrites the earlier one.
# The three date/time columns are declared as 'str' here and are also passed to
# `parse_dates` in the read_csv call below.
col_types = {
    'HubRandomID': 'int64',
    'Hub_ActivityID': 'int64',
    'ActivityDate': 'str',
    'ProgrammeID': 'int64',
    'ChildParticipants': 'int64',
    'AdultParticipants': 'int64',
    'ReferralParticipants': 'int64',
    'EngagedCount': 'int64',
    'StartTime': 'str',
    'EndTime': 'str',
    'HubVolunteers': 'int64',
    'SchoolVolunteers': 'int64',
    'ExternalVolunteers': 'int64',
    'ProgrammeID.1': 'int64',
    'ServiceTypeID': 'int64',
    'ProgrammeCategoryID': 'int64',
    'ProgrammeCategoryID.1': 'int64',
    'ShortName.1': 'str',
    'CategoryName': 'str',
}

# Derived from the dtype map so the column list and the dtypes cannot drift apart.
cols_to_use = list(col_types)

# Columns handed to `parse_dates`.
datetime_cols = ['ActivityDate', 'StartTime', 'EndTime']

# Load only the selected columns and order the rows chronologically by activity date.
df = pd.read_csv(
    'Datathon - CHA activities 2016-2020.csv',
    usecols=cols_to_use,
    dtype=col_types,
    parse_dates=datetime_cols,
).sort_values(by="ActivityDate")

In [224]:
# Reduce each parsed timestamp column to a single component: columns whose name
# contains 'Date' keep only the calendar date, the others keep only the time of day.
for col_name in datetime_cols:
    component = 'date' if 'Date' in col_name else 'time'
    df[col_name] = getattr(df[col_name].dt, component)

In [231]:
# Preview the first five rows of the loaded frame.
# NOTE(review): the saved output of this cell shows a TotalParticipants column,
# which is only created in a later cell — the cell was last executed out of
# order (In [231]); re-run the notebook top to bottom to refresh it.
df.head(n=5)

Unnamed: 0,HubRandomID,Hub_ActivityID,ActivityDate,ProgrammeID,ChildParticipants,AdultParticipants,ReferralParticipants,EngagedCount,StartTime,EndTime,ExternalVolunteers,HubVolunteers,SchoolVolunteers,ProgrammeID.1,ProgrammeCategoryID,ServiceTypeID,ProgrammeCategoryID.1,CategoryName,ShortName.1,TotalParticipants
29936,14819,523499,2016-01-02,47,0,0,0,0,NaT,NaT,0,0,0,47,1,145,1,Programmed Activities,PROG,0
29870,18315,523433,2016-01-02,59,0,0,2,0,NaT,NaT,0,0,0,59,3,2,3,Service Referrals,SERV,0
29869,16407,523432,2016-01-02,59,0,0,0,0,NaT,NaT,0,0,0,59,3,2,3,Service Referrals,SERV,0
29868,17534,523431,2016-01-02,47,2,12,0,0,NaT,NaT,0,0,0,47,1,145,1,Programmed Activities,PROG,14
29867,16407,523430,2016-01-02,48,0,0,0,0,NaT,NaT,0,0,0,48,1,10,1,Programmed Activities,PROG,0


#### Descriptive Stats

In [225]:
# HubRandomID
# Number of distinct hubs that appear in the activity data.
hub_ids = df['HubRandomID']
hub_ids.nunique()

80

In [226]:
# How many activities does each hub have on record?
# One row per HubRandomID; the `count` column is the number of non-null
# Hub_ActivityID values for that hub.
how_many_acts = (
    df.groupby('HubRandomID')['Hub_ActivityID']
    .agg(['count'])
)

# idxmax / idxmin return the hub ID (the group key) of the first extreme row.
activity_counts = how_many_acts['count']
print(f"Hub with Max ({activity_counts.max()}) Activities: "
      f"{activity_counts.idxmax()} \n")
print(f"Hub with Min ({activity_counts.min()}) Activities: "
      f"{activity_counts.idxmin()}")

Hub with Max (5280) Activites: 19277 

Hub with Min (37) Activities: 10932


In [227]:
# Add a per-activity head count column: children plus adults, summed row-wise.
# ReferralParticipants is not part of this total.
particpant_cols = ['ChildParticipants', 'AdultParticipants']
df['TotalParticipants'] = df.loc[:, particpant_cols].sum(axis='columns')

In [228]:
# Average attendance per session for each hub.
# One row per HubRandomID with the total and the mean of TotalParticipants
# over that hub's activity rows.
attendance = (
    df.groupby('HubRandomID')['TotalParticipants']
    .agg(['sum', 'mean'])
)

mean_attendance = attendance['mean']
busiest_hub, quietest_hub = mean_attendance.idxmax(), mean_attendance.idxmin()
print(f"Hub with the highest average attendance ({round(mean_attendance.max(), 2)}): "
      f"{busiest_hub} \n")
print(f"Hub with the lowest average attendance ({round(mean_attendance.min(), 2)}): "
      f"{quietest_hub}")

Hub with the highest average attendance (36.39): 18922 

Hub with the lowest average attendance (1.43): 10932


In [229]:
# Who attends more: adults or children?
# Each figure is a participant head count summed over every activity row, so the
# labels report participants rather than a number of sessions.
total_particpants = df['TotalParticipants'].sum()
adult_participants = df['AdultParticipants'].sum()
child_participants = df['ChildParticipants'].sum()

# Share of the combined (child + adult) head count, as a percentage.
adult_share = round(adult_participants / total_particpants * 100, 2)
child_share = round(child_participants / total_particpants * 100, 2)

print(f"Total number of participants: {total_particpants:,.0f}\n")
print(f"Adult participants: {adult_participants:,.0f}; {adult_share}% of the total participants\n")
print(f"Child participants: {child_participants:,.0f}; {child_share}% of the total participants")

Total number of sessions: 1,594,194

Adults attend 617,140 sessions; 38.71% of the total participants

Children attend 977,054 sessions; 61.29% of the total participants


In [230]:
# What percent of people come by referral?
# Referral head count relative to the combined child + adult total computed in
# the previous cell (`total_particpants`).
referred_participants = df['ReferralParticipants'].sum()
referral_share = round(referred_participants / total_particpants * 100, 2)
print(f"Total number of referred participants {referred_participants:,.0f}; {referral_share}% of the total participants")

Total number of referred participants 164,159; 10.3% of the total participants


In [None]:
# how many hubs opened in the last year 
# LOGIC: is their first activity date within the last year 


In [None]:
# how long does a hub session typically go for 

In [None]:
# how many volunteers? and what types of volunteers?

#### Predictive Stats

In [None]:
# is there a relationship between number of volunteers & participant rate 