# CSU EDA v0.0
- 2022 to present
- split 3 ways:
    - awareness_markets: Orlando, Jacksonville, Greenville, SC and Birmingham
    - Mobile (DMA)
    - all other DMA
    
- not lead attributed media so it can't be split by the above

In [1]:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib as mpl

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.precision', 2)
pd.set_option('display.float_format',  '{:,.2f}'.format)

In [2]:
kpi_raw = pd.read_csv('../data/CSU_KPI_2022-01-01_2024-12-26.csv')
spend_raw = pd.read_csv('../data/CSU_spend_2022-01-01_2024-12-26.csv')

In [3]:
dma_unique = list(kpi_raw['dmaname'].dropna().unique())
dma_unique.sort()

In [4]:
list_awareness = ['GREENVLL-SPART-ASHEVLL-AND', 'GREENWOOD-GREENVILLE', 'JACKSONVILLE', 'ORLANDO-DAYTONA BCH-MELBRN', 'BIRMINGHAM (ANN AND TUSC)']

In [5]:
df_kpi = kpi_raw.copy()
df_kpi['leaddate'] = pd.to_datetime(df_kpi['leaddate']).dt.normalize()
df_kpi['appdate'] = pd.to_datetime(df_kpi['appdate']).dt.normalize()
df_kpi['enrolldate'] = pd.to_datetime(df_kpi['enrolldate']).dt.normalize()
df_kpi['split'] = None
df_kpi.loc[df_kpi['dmaname'].str.contains('|'.join(list_awareness), na=False), 'split'] = 'awareness'
df_kpi.loc[df_kpi['dmaname'].str.contains('MOBILE-PENSACOLA', na=False), 'split'] = 'mobile'
df_kpi.loc[df_kpi['split'].isna(), 'split'] = 'other'
df_kpi = df_kpi.loc[df_kpi['leaddate'] < '2024-12-26'].drop(['dmaname'], axis=1) 

### Making cohort KPI data frame where all KPI are by the lead date
df_kpi_cohort = df_kpi.groupby(['leaddate'])[['n_lead', 'n_app', 'n_enroll']].agg('sum').reset_index()

### Making in-period KPI data frame where lead, app, enroll are all based on their own corresponding date
df_lead = df_kpi.groupby(['leaddate'])['n_lead'].agg('sum').reset_index()
df_app = df_kpi.groupby(['appdate'])['n_app'].agg('sum').reset_index()
df_enroll = df_kpi.groupby(['enrolldate'])['n_enroll'].agg('sum').reset_index()
df_kpi_inperiod = df_lead.merge(df_app, left_on=['leaddate'], right_on=['appdate'], how='left')
df_kpi_inperiod = df_kpi_inperiod.merge(df_enroll, left_on=['leaddate'], right_on=['enrolldate'], how='left')
df_kpi_inperiod = df_kpi_inperiod.drop(['appdate', 'enrolldate'], axis=1).fillna(0)

### combining cohort and inperiod for comparisons
df_kpi_combined = pd.DataFrame({'date': pd.date_range(start="2022-01-01", end=pd.Timestamp.now().date(), freq='D')})
df_kpi_combined = df_kpi_combined.merge(df_kpi_inperiod, left_on='date', right_on='leaddate', how='left').drop('leaddate', axis=1)
df_kpi_combined = df_kpi_combined.merge(df_kpi_cohort, left_on='date', right_on='leaddate', how='outer').drop('leaddate', axis=1)
df_kpi_combined = df_kpi_combined.rename(columns={'n_lead_x':'n_lead', 'n_app_x':'n_app', 'n_enroll_x':'n_enroll',
                                                 'n_lead_y':'n_lead_cohort', 'n_app_y':'n_app_cohort', 'n_enroll_y':'n_enroll_cohort'})
df_kpi_combined.to_csv('../data/csu_kpi_cleaned_2024-12-26.csv', index=False)

### Rolling monthly
df_comb_monthly = df_kpi_combined.set_index('date').copy()
df_comb_monthly = df_comb_monthly.resample('MS').sum()

  df_kpi.loc[df_kpi['dmaname'].str.contains('|'.join(list_awareness), na=False), 'split'] = 'awareness'


In [11]:
df_kpi

Unnamed: 0,leaddate,appdate,enrolldate,n_lead,n_app,n_enroll,split
0,2022-01-01,NaT,NaT,1,0,0,other
1,2022-01-01,2023-05-08,2023-05-25,1,1,1,other
2,2022-01-01,NaT,NaT,1,0,0,other
3,2022-01-01,NaT,NaT,1,0,0,other
4,2022-01-01,NaT,NaT,0,0,0,other
...,...,...,...,...,...,...,...
270049,2024-12-25,NaT,NaT,1,0,0,other
270050,2024-12-25,NaT,NaT,0,0,0,other
270051,2024-12-25,NaT,NaT,1,0,0,other
270052,2024-12-25,NaT,NaT,1,0,0,other


In [6]:
df_kpi_combined

Unnamed: 0,date,n_lead,n_app,n_enroll,n_lead_cohort,n_app_cohort,n_enroll_cohort
0,2022-01-01,106.00,0.00,0.00,106.00,19.00,6.00
1,2022-01-02,167.00,9.00,0.00,167.00,25.00,9.00
2,2022-01-03,235.00,28.00,0.00,235.00,46.00,16.00
3,2022-01-04,240.00,39.00,1.00,240.00,46.00,10.00
4,2022-01-05,208.00,23.00,1.00,208.00,28.00,10.00
...,...,...,...,...,...,...,...
1093,2024-12-29,,,,,,
1094,2024-12-30,,,,,,
1095,2024-12-31,,,,,,
1096,2025-01-01,,,,,,


In [7]:
df_kpi_inperiod

Unnamed: 0,leaddate,n_lead,n_app,n_enroll
0,2022-01-01,106,0.00,0.00
1,2022-01-02,167,9.00,0.00
2,2022-01-03,235,28.00,0.00
3,2022-01-04,240,39.00,1.00
4,2022-01-05,208,23.00,1.00
...,...,...,...,...
1085,2024-12-21,159,14.00,0.00
1086,2024-12-22,187,8.00,0.00
1087,2024-12-23,211,24.00,0.00
1088,2024-12-24,167,2.00,0.00


In [8]:
df_kpi_cohort

Unnamed: 0,leaddate,n_lead,n_app,n_enroll
0,2022-01-01,106,19,6
1,2022-01-02,167,25,9
2,2022-01-03,235,46,16
3,2022-01-04,240,46,10
4,2022-01-05,208,28,10
...,...,...,...,...
1085,2024-12-21,159,6,0
1086,2024-12-22,187,7,0
1087,2024-12-23,211,5,1
1088,2024-12-24,167,1,0


In [19]:
df_spend = spend_raw.copy()
df_spend['date'] = pd.to_datetime(df_spend['date'])
df_spend['split'] = None
df_spend.loc[df_spend['location'].str.contains('Orlando' or 'Jacksonville' or 'Greenville' or 'Birmingham'), 'split'] = 'awareness'

In [20]:
df_spend['location'].unique()

array(['any', 'Nurture', 'Orlando', 'Missing', 'Jacksonville',
       'Charlotte', 'Birmingham', 'Greenville'], dtype=object)

In [24]:
df_spend.vendor.unique()

array(['BingNonBrand', 'Display', 'Facebook', 'GoogleNonBrand',
       'Discovery', 'YouTube', 'BingBrand', 'GoogleBrand', 'Archer',
       'TikTok', 'DO NOT BUDGET', 'TradeDesk CTV/OTT', 'YouTubeTrad',
       'TradeDesk Radio', 'TradeDesk DOOH', 'Missing', 'LinkedIn',
       'ArcherOrg'], dtype=object)

In [23]:
df_spend

Unnamed: 0,date,ad_source,vendor,campaign_name,spend,location,split
0,2022-01-01,Bing,BingNonBrand,BM_CSU_All_FireScience,54.14,any,
1,2022-01-01,Google,Display,BM_CSU_All_Display - Lead2App Nurture,16.78,Nurture,
2,2022-01-01,Facebook,Facebook,zzzJV_CSU_HR_Nurture_Exception_Campaign,3.31,any,
3,2022-01-01,Google,Display,BM_CSU_All_Display - Remarketing,97.20,any,
4,2022-01-01,Google,GoogleNonBrand,BM_CSU_All_FireScience,1524.27,any,
...,...,...,...,...,...,...,...
65342,2024-12-26,Google,Discovery,BM_CSU_All_DemandGen - Psychology,53.49,any,
65343,2024-12-26,LinkedIn,LinkedIn,Personas(CP)_Mail,24.16,any,
65344,2024-12-26,Google,Discovery,BM_CSU_All_DemandGen - Education,17.75,any,
65345,2024-12-26,Google,Discovery,BM_CSU_All_DemandGen_Persona - Returning Student,39.45,any,


# EDA

In [25]:
df_kpi.groupby('split')[['leaddate', 'appdate', 'enrolldate']].agg('count')

Unnamed: 0_level_0,leaddate,appdate,enrolldate
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
awareness,8161,1616,585
mobile,5099,1395,567
other,256885,42332,14895


In [26]:
df_kpi[['n_lead', 'n_app', 'n_enroll']].sum()

n_lead      221928
n_app        45343
n_enroll     16047
dtype: int64