# EDA Data Investigation

In [1]:
#data stuff
import time
import pandas as pd
import numpy as np
import datetime as dt

#regression stuff
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

#graph stuff
import seaborn as sns
from plotly.subplots import make_subplots
from plotly import graph_objects as go
import plotly
import matplotlib.pyplot as plt

%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.options.mode.chained_assignment = None  # default='warn'


#import functions from other file that we want
from utilities import *

# 0.1 Read in data

In [2]:
#read in clickthru data from vw dataset
q = '''SELECT 
     account
    , attr_window
    , campaign
    , _match
    , campaign_objective
    , Year
    , Quarter
    , Month
    , week
    , week_start
    , date
    , marketing_initiative
    , marketing_segment
    , marketing_subinitiative
    , channel
    , platform
    , site
    , sum(case when sor_prod = 'All (Core All, NPV)' then spend else 0 end) spend
    , sum(case when sor_prod = 'All (Core All, NPV)' then clicks else 0 end) clicks
    , sum(case when sor_prod = 'All (Core All, NPV)' then impressions else 0 end) impressions
    , sum(case
            when sor_prod = 'Core' then conversions else null end) digi_ada_conversions
    , sum(case
            when sor_prod = 'Crosswords' then conversions else null end) games_conversions
    , sum(case
            when sor_prod = 'EDU' then conversions else null end) edu_conversions
    , sum(case
            when sor_prod = 'Home Delivery' then conversions else null end) hd_conversions
    , sum(case
            when sor_prod = 'CK' then conversions else null end) ck_conversions     
    , sum(case
            when sor_prod = 'Core All (Core, HD, EDU)' then conversions else null end) core_conversions
    , sum(case
            when sor_prod = 'All (Core All, NPV)' then conversions else null end) all_conversions
FROM `nyt-mkt-prd.paid_media_data.placement_daily_vw`
WHERE date >= '2020-01-01' and attr_window in('Media Reported - Click Thru')
group by 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,12,13,14,15,16,17
order by date, campaign, _match
    '''
start_time = time.time() 

clickthru_df = pd.read_gbq(q,
                 project_id ='nyt-bigquery-beta-workspace',
                 dialect='standard')

print(f'time took: {str(round(time.time() - start_time, 2))}')

time took: 213.99


In [3]:
clickthru_df.head()

Unnamed: 0,account,attr_window,campaign,_match,campaign_objective,Year,Quarter,Month,week,week_start,date,marketing_initiative,marketing_segment,marketing_subinitiative,channel,platform,site,spend,clicks,impressions,digi_ada_conversions,games_conversions,edu_conversions,hd_conversions,ck_conversions,core_conversions,all_conversions
0,Audience Engagement 1,Media Reported - Click Thru,Book Review Live 2020,BRAND ASSET - B&W,X-UNKNOWN-X,2020,1,1,0,2019-12-29,2020-01-01 00:00:00+00:00,X-UNKNOWN-X,X-UNKNOWN-X,X-UNKNOWN-X,Social,Facebook,Facebook,94.010001,142.0,26617.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Audience Engagement 1,Media Reported - Click Thru,Book Review Live 2020,QUOTE GRAPHIC,X-UNKNOWN-X,2020,1,1,0,2019-12-29,2020-01-01 00:00:00+00:00,X-UNKNOWN-X,X-UNKNOWN-X,X-UNKNOWN-X,Social,Facebook,Facebook,27.900001,56.0,18624.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Audience Engagement 1,Media Reported - Click Thru,Book Review Live 2020,SINGLE SPEAKER ASSET - NICHOLAS,X-UNKNOWN-X,2020,1,1,0,2019-12-29,2020-01-01 00:00:00+00:00,X-UNKNOWN-X,X-UNKNOWN-X,X-UNKNOWN-X,Social,Facebook,Facebook,2.95,8.0,3747.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,UAC_Crosswords,Media Reported - Click Thru,Crosswords UAC - US - Android,CROSSWORDS UAC - US - ANDROID,X-UNKNOWN-X,2020,1,1,0,2019-12-29,2020-01-01 00:00:00+00:00,App-Install,X-UNKNOWN-X,Games,Display,Google Ads,UAC,888.334961,1117.0,110080.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,UAC_Crosswords,Media Reported - Click Thru,Crosswords UAC - US - iOS,CROSSWORDS UAC - US - IOS,X-UNKNOWN-X,2020,1,1,0,2019-12-29,2020-01-01 00:00:00+00:00,App-Install,X-UNKNOWN-X,Games,Display,Google Ads,UAC,65.69326,63.0,1903.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
#read in viewthru data from vw dataset
q = '''SELECT 
     account
    , attr_window
    , campaign
    , _match
    , campaign_objective
    , Year
    , Quarter
    , Month
    , week
    , week_start
    , date
    , marketing_initiative
    , marketing_segment
    , marketing_subinitiative
    , channel
    , platform
    , site
    , sum(case when sor_prod = 'All (Core All, NPV)' then spend else 0 end) spend
    , sum(case when sor_prod = 'All (Core All, NPV)' then clicks else 0 end) clicks
    , sum(case when sor_prod = 'All (Core All, NPV)' then impressions else 0 end) impressions
    , sum(case
            when sor_prod = 'Core' then conversions else null end) digi_ada_conversions
    , sum(case
            when sor_prod = 'Crosswords' then conversions else null end) games_conversions
    , sum(case
            when sor_prod = 'EDU' then conversions else null end) edu_conversions
    , sum(case
            when sor_prod = 'Home Delivery' then conversions else null end) hd_conversions
    , sum(case
            when sor_prod = 'CK' then conversions else null end) ck_conversions     
    , sum(case
            when sor_prod = 'Core All (Core, HD, EDU)' then conversions else null end) core_conversions
    , sum(case
            when sor_prod = 'All (Core All, NPV)' then conversions else null end) all_conversions
FROM `nyt-mkt-prd.paid_media_data.placement_daily_vw`
WHERE date >= '2020-01-01' and attr_window in('Media Reported - View Thru')
group by 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,12,13,14,15,16,17
order by date, campaign, _match
    '''
start_time = time.time() 

viewthru_df = pd.read_gbq(q,
                 project_id ='nyt-bigquery-beta-workspace',
                 dialect='standard')

print(f'time took: {str(round(time.time() - start_time, 2))}')

time took: 196.08


In [5]:
#rename columns in the separate dataframes to identify clickthru and viewthru
#first clickthru
for col in clickthru_df.columns:
    if 'conversions' in col: 
        clickthru_df[col+'_ct'] = clickthru_df[col]
        clickthru_df.drop(col, axis=1, inplace=True)
        
#now viewthru
for col in viewthru_df.columns:
    if 'conversions' in col: 
        viewthru_df[col+'_vt'] = viewthru_df[col]
        viewthru_df.drop(col, axis=1, inplace=True)

In [6]:
#now merge the dataframes 
#create a truncated version of viewthru DF to make the merge more efficient
vt_merge = viewthru_df[['account','campaign','_match','date','marketing_initiative','marketing_segment'] + [col for col in viewthru_df.columns if 'conversions' in col]]

#merge with clickthru df
df = clickthru_df.merge(vt_merge, how='left', on=[col for col in vt_merge.columns if 'conversions' not in col])

In [7]:
df.shape

(881041, 34)

In [8]:
df[df.duplicated()]

Unnamed: 0,account,attr_window,campaign,_match,campaign_objective,Year,Quarter,Month,week,week_start,date,marketing_initiative,marketing_segment,marketing_subinitiative,channel,platform,site,spend,clicks,impressions,digi_ada_conversions_ct,games_conversions_ct,edu_conversions_ct,hd_conversions_ct,ck_conversions_ct,core_conversions_ct,all_conversions_ct,digi_ada_conversions_vt,games_conversions_vt,edu_conversions_vt,hd_conversions_vt,ck_conversions_vt,core_conversions_vt,all_conversions_vt


## 0.1 Investigate Spend
where spend = 0 but clicks, impressions, or conversions != 0

In [9]:
#pick a channel (Display) -> campaign names where there is no spend but there are impressions, etc.
#look to see if there are a lot of situations (acquisition focused campaign) -> but if it's not acuqistion 
spend_df = df.loc[(df['spend'] == 0)]
spend_df.shape

(256608, 34)

In [10]:
spend_df.value_counts('channel')

channel
Paid Search        141518
X-UNKNOWN-X         67072
Display             28239
Native              11006
Social               6125
Other                1118
Video                 867
Connected Video       444
Youtube               178
Audio                  38
XX                      3
dtype: int64

In [11]:
spend_df['impressions'].describe()

count    2.566080e+05
mean     2.470748e+03
std      8.985099e+04
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      2.337060e+07
Name: impressions, dtype: float64

In [12]:
spend_or_df = df.loc[(df['spend'] == 0) & 
                    (
                        (df['impressions'] != 0) |
                        (df['clicks'] != 0) |
                        (df['all_conversions_vt'] != 0) | 
                        (df['all_conversions_ct'] != 0)
                        
                    )]
spend_or_df.shape

(52868, 34)

In [13]:
excluded_spend_df = pd.concat([df,spend_or_df]).drop_duplicates(keep=False)
excluded_spend_df.shape

(828173, 34)

In [14]:
excluded_spend_df[excluded_spend_df['spend']==0].sum(axis=0)

account                    NYT Crossword - BambooNYT Crossword - BambooNY...
attr_window                Media Reported - Click ThruMedia Reported - Cl...
campaign                   NYTI_APP_CROSSWD_COUSA_TW_SO_AINS_AndUACmbC7NY...
_match                     CROSSWORD_UA_ANDROID_SATIREACCOUNTS_HANDLES_CP...
campaign_objective         X-UNKNOWN-XX-UNKNOWN-XX-UNKNOWN-XX-UNKNOWN-XX-...
Year                                                               411555617
Quarter                                                               342480
Month                                                                 831160
week                                                                 3186381
marketing_initiative       App-InstallApp-InstallApp-InstallApp-InstallAp...
marketing_segment          X-UNKNOWN-XX-UNKNOWN-XX-UNKNOWN-XX-UNKNOWN-XX-...
marketing_subinitiative    GamesGamesGamesGamesGamesGamesGamesGamesGamesG...
channel                    SocialSocialSocialSocialSocialSocialSocialSoci...

In [15]:
df.shape

(881041, 34)

In [16]:
spend_or_df['impressions'].describe()

count    5.286800e+04
mean     1.199239e+04
std      1.976656e+05
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+00
75%      1.500000e+01
max      2.337060e+07
Name: impressions, dtype: float64

In [17]:
all_spend_df = spend_conv_total_df = df.loc[(df['spend'] == 0) &(df['impressions'] == 0) & (df['clicks'] == 0) 
                               & (df['total_conversions'] == 0)]
spend_conv_total_df.shape

KeyError: 'total_conversions'

In [None]:
spend_imp_df = df.loc[(df['spend'] == 0) &(df['impressions'] != 0)]
spend_imp_df.shape

In [None]:
spend_imp_df.value_counts('channel')

In [None]:
spend_clicks_df = df.loc[(df['spend'] == 0) &(df['impressions'] != 0) & (df['clicks'] != 0)]
spend_clicks_df.shape

In [None]:
spend_conv_ct_df = df.loc[(df['spend'] == 0) &(df['impressions'] != 0) & (df['clicks'] != 0) 
                               & (df['all_conversions_ct'] != 0)]
spend_conv_ct_df.shape

In [None]:
spend_conv_vt_df = df.loc[(df['spend'] == 0) &(df['impressions'] != 0) & (df['clicks'] != 0) 
                               & (df['all_conversions_vt'] != 0)]
spend_conv_vt_df.shape

In [None]:
spend_conv_vt_df.value_counts('channel')

In [None]:
spend_conv_total_df = df.loc[(df['spend'] == 0) &(df['impressions'] != 0) & (df['clicks'] != 0) 
                               & (df['total_conversions'] != 0)]
spend_conv_total_df.shape

In [None]:
spend_conv_total_df[['all_conversions_ct', 'all_conversions_vt','total_conversions']].sum()

In [None]:
spend_conv_total_df.groupby('channel')[['all_conversions_ct', 'all_conversions_vt','total_conversions']].sum()

In [None]:
spend_both_conv_df = df.loc[(df['spend'] == 0) &(df['impressions'] != 0) & (df['clicks'] != 0) 
                               & (df['all_conversions_vt'] != 0) & (df['all_conversions_ct'] != 0)]
spend_both_conv_df.shape

In [None]:
#look at date before row (21357204)
spend_both_conv_df.sort_values(by='date')

In [None]:
#check dates before
spend_check_1_df = df[(df['date'] == '2020-02-14') & (df['campaign'] == 'X-UNKNOWN-X')]
spend_check_1_df

In [None]:
spend_check_4_df = df[(df['date'] == '2020-02-16') & (df['campaign'] == 'X-UNKNOWN-X')]
spend_check_4_df

In [None]:
spend_check_5_df = df[(df['campaign'] == 'X-UNKNOWN-X') & (df['date'] >= '2020-02-14') & (df['date'] < '2020-03-01')]
spend_check_5_df

In [None]:
spend_check_2_df = spend_check_1_df[(spend_check_1_df['spend'] == 0) & 
                 (spend_check_1_df['impressions'] != 0)]
spend_check_2_df.shape

In [None]:
spend_check_3_df = df[(df['date'] == '2020-02-15') & (df['campaign'] == 'NYTI_AUC_COREXXX_COUSA_TW_SO_WTCP_TAFIFOODXX')]
spend_check_3_df

In [None]:
spend_check_7_df = df[(df['date'] <= '2020-11-10') & (df['campaign'] == '2020_HS_CORE_BRA_SWAY_NA_NA_Reddit')]
spend_check_7_df

In [None]:
spend_df['impressions'].describe(percentiles = [0.1,0.2,0.9])

In [None]:
spend_df['total_conversions'].describe(percentiles = [0.1,0.2,0.95])

In [None]:
spend_df[spend_df['impressions'] > 10000].shape

In [None]:
spend_df[spend_df['impressions'] > 100000].shape

In [None]:
mill_df = spend_df[spend_df['impressions'] > 1000000]
mill_df

In [None]:
spend_both_conv_df.sort_values(by='date').head()

In [None]:
spend_df.value_counts(['marketing_subinitiative'])

In [None]:
spend_conv_total_df.value_counts(['channel'])

# 0.4 Column by Column 'X-UNKNOWN-X' Exploration 

# put this in separate notebook 

## Account

In [None]:
df['account'].value_counts()

In [None]:
df['account'].value_counts(normalize = True)

In [None]:
df[df['account'] == 'X-UNKNOWN-X']['total_conversions'].sum()

In [None]:
df[df['account'] == 'X-UNKNOWN-X']['all_conversions_vt'].sum()

In [None]:
df[df['account'] == 'X-UNKNOWN-X'].head()

## Campaign

In [None]:
print(df['account'].unique())

In [None]:
percent_campaign_unknown_df = df['campaign'].value_counts(normalize = True)
percent_campaign_unknown_df['X-UNKNOWN-X']

In [None]:
campaign_unknown = df[df['campaign'] == 'X-UNKNOWN-X']
campaign_unknown.shape

In [None]:
campaign_unknown.head(10)

## Match

In [None]:
percent_match_unknown_df = df['_match'].value_counts(normalize = True)
percent_match_unknown_df['X-UNKNOWN-X']

In [None]:
match_unknown_df = df[df['_match'] == 'X-UNKNOWN-X']
match_unknown_df.shape

**Seems like these 34 rows all have to do with Twitter and the unknown variables are throughout metadata/cols**

In [None]:
match_unknown_df.sort_values(by=['date'])

## Campaign Objective

In [None]:
campaign_obj_df = df[df['campaign_objective'] == 'X-UNKNOWN-X']
campaign_obj_df.shape

In [None]:
df.value_counts('campaign_objective')

In [None]:
df['campaign_objective'].value_counts(normalize = True)

In [None]:
df[df['campaign_objective'] == 'X-UNKNOWN-X']['total_conversions'].sum()

In [None]:
df[df['campaign_objective'] == 'X-UNKNOWN-X']['all_conversions_vt'].sum()

In [None]:
df[df['campaign_objective'] == 'X-UNKNOWN-X']['all_conversions_ct'].sum()

In [None]:
#seems to have unknowns for entire date range
campaign_obj_df.sort_values(by=['date'])

In [None]:
campaign_obj_df.value_counts('marketing_subinitiative')

In [None]:
campaign_obj_df.value_counts('channel')

In [None]:
campaign_obj_df[['marketing_subinitiative', 'channel']].loc[(df['marketing_subinitiative'] == 'X-UNKNOWN-X') & 
                                                            (df['channel'] == 'X-UNKNOWN-X')]

## Year, Quarter, Month, week, week_start, Date

In [None]:
df[df['Year'] == 'X-UNKNOWN-X'].shape

In [None]:
df[df['Quarter'] == 'X-UNKNOWN-X'].shape

In [None]:
df[df['Month'] == 'X-UNKNOWN-X'].shape

In [None]:
df[df['week'] == 'X-UNKNOWN-X'].shape

In [None]:
df[df['week_start'] == 'X-UNKNOWN-X'].shape

In [None]:
df[df['date'] == 'X-UNKNOWN-X'].shape

## Marketing Initiative (Tactic)

In [None]:
df[df['marketing_initiative'] == 'X-UNKNOWN-X'].shape

In [None]:
#what do normal values look like
df.value_counts('marketing_initiative')

In [None]:
df['marketing_initiative'].value_counts(normalize = True)

In [None]:
df[df['marketing_initiative']== 'X-UNKNOWN-X'].shape

In [None]:
df[df['marketing_initiative'] == 'X-UNKNOWN-X']['total_conversions'].sum()

In [None]:
df[df['marketing_initiative'] == 'X-UNKNOWN-X']['all_conversions_vt'].sum()

In [None]:
df[df['marketing_initiative'] == 'X-UNKNOWN-X']['all_conversions_ct'].sum()

In [None]:
#didn't change anything df[df['marketing_initiative'] == 'X-UNKNOWN-X']['total_conversions'].sum()

#any campaigns with 'BAU' -> Business As Usual
df.loc[(df['marketing_initiative'] == 'X-UNKNOWN-X') & 
       (df['campaign'].str.contains('BAU')), 'marketing_initiative'] = 'Business As Usual'
df[df['marketing_initiative']== 'X-UNKNOWN-X'].shape

In [None]:
#didn't change anything


# #One Day Sale
# df.loc[(df['marketing_initiative'] == 'X-UNKNOWN-X') & 
#        (df['campaign'].str.contains('ODS')), 'marketing_initiative'] = 'One Day Sale'

# #Gifting
# #didn't change anything
# df.loc[(df['marketing_initiative'] == 'X-UNKNOWN-X') & 
#        (df['campaign'].str.contains('GFT')), 'marketing_initiative'] = 'Gifting'
       
       
# #Testing
# df.loc[(df['marketing_initiative'] == 'X-UNKNOWN-X') & 
#        (df['campaign'].str.contains('TST')), 'marketing_initiative'] = 'Testing'    

In [None]:
df[df['marketing_initiative']== 'X-UNKNOWN-X'].value_counts('channel')

In [None]:
paid_search_mkt_init_df = df.loc[(df['marketing_initiative'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Paid Search')]
paid_search_mkt_init_df.value_counts('campaign')

In [None]:
unknown_channel_mkt_init_df = df.loc[(df['marketing_initiative'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'X-UNKNOWN-X')]
unknown_channel_mkt_init_df.value_counts('campaign')

In [None]:
social_mkt_init_df = df.loc[(df['marketing_initiative'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Social')]
social_mkt_init_df.value_counts('campaign')

In [None]:
youtube_mkt_init_df = df.loc[(df['marketing_initiative'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Youtube')]
youtube_mkt_init_df.value_counts('campaign')

In [None]:
display_mkt_init_df = df.loc[(df['marketing_initiative'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Display')]
display_mkt_init_df.value_counts('campaign')

## Marketing Segment

In [None]:
df[df['marketing_segment']== 'X-UNKNOWN-X'].shape

In [None]:
df['marketing_segment'].value_counts(normalize = True)

In [None]:
df[df['marketing_segment'] == 'X-UNKNOWN-X']['total_conversions'].sum()

In [None]:
df[df['marketing_segment'] == 'X-UNKNOWN-X']['all_conversions_vt'].sum()

In [None]:
#what do normal values look like
df.value_counts('marketing_segment')

In [None]:
df[df['marketing_segment']== 'X-UNKNOWN-X'].value_counts('channel')

In [None]:
unknown_mkt_segment_df = df.loc[(df['marketing_segment'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'X-UNKNOWN-X')]
unknown_mkt_segment_df.value_counts('campaign')

In [None]:
paid_search_mkt_segment_df = df.loc[(df['marketing_segment'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Paid Search')]
paid_search_mkt_segment_df.value_counts('campaign')

In [None]:
social_mkt_segment_df = df.loc[(df['marketing_segment'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Social')]
social_mkt_segment_df.value_counts('campaign')

In [None]:
display_mkt_segment_df = df.loc[(df['marketing_segment'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Display')]
display_mkt_segment_df.value_counts('campaign')

In [None]:
other_mkt_segment_df = df.loc[(df['marketing_segment'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Other')]
other_mkt_segment_df.value_counts('campaign')

In [None]:
youtube_mkt_segment_df = df.loc[(df['marketing_segment'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Youtube')]
youtube_mkt_segment_df.value_counts('campaign')

In [None]:
video_mkt_segment_df = df.loc[(df['marketing_segment'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Video')]
video_mkt_segment_df.value_counts('campaign')

## Marketing Subinitiative (Product)

In [None]:
df[df['marketing_subinitiative']== 'X-UNKNOWN-X'].shape

In [None]:
#what do normal values look like
df.value_counts('marketing_subinitiative')

In [None]:
df['marketing_subinitiative'].value_counts(normalize = True)

In [None]:
df[df['marketing_subinitiative']== 'X-UNKNOWN-X'].value_counts('channel')

In [None]:
unknown_mkt_subinit_df = df.loc[(df['marketing_subinitiative'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'X-UNKNOWN-X')]
unknown_mkt_subinit_df.value_counts('campaign')

In [None]:
paid_search_mkt_subinit_df = df.loc[(df['marketing_subinitiative'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Paid Search')]
paid_search_mkt_subinit_df.value_counts('campaign')

In [None]:
social_mkt_subinit_df = df.loc[(df['marketing_subinitiative'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Social')]
social_mkt_subinit_df.value_counts('campaign')

In [None]:
display_mkt_subinit_df = df.loc[(df['marketing_subinitiative'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Display')]
display_mkt_subinit_df.value_counts('campaign')

In [None]:
native_mkt_subinit_df = df.loc[(df['marketing_subinitiative'] == 'X-UNKNOWN-X') & 
       (df['channel'] == 'Native')]
native_mkt_subinit_df.value_counts('campaign')

## Channel

**Look at unknowns -> meta data?**

In [None]:
df['channel'].value_counts()

In [None]:
df['channel'].value_counts(normalize = True)

In [None]:
df[df['channel']== 'X-UNKNOWN-X'].shape

In [None]:
#marketing init, subinit, OR segment are unknown 
unknown_or_df = df.loc[(df['channel'] == 'X-UNKNOWN-X') & 
       (
          (df['marketing_initiative'] == 'X-UNKNOWN-X') | 
          (df['marketing_subinitiative'] == 'X-UNKNOWN-X') | 
          (df['marketing_segment'] == 'X-UNKNOWN-X')
       )]
unknown_or_df.shape

In [None]:
#all are unknown 
unknown_and_df = df.loc[(df['channel'] == 'X-UNKNOWN-X') & 
          (df['marketing_initiative'] == 'X-UNKNOWN-X') & 
          (df['marketing_subinitiative'] == 'X-UNKNOWN-X') &
          (df['marketing_segment'] == 'X-UNKNOWN-X')]
unknown_and_df.shape

In [None]:
#only mkt segment unknown
unknown_channel_mkt_segment_df = df.loc[(df['channel'] == 'X-UNKNOWN-X') & 
       (df['marketing_segment'] == 'X-UNKNOWN-X')]
unknown_channel_mkt_segment_df.shape

In [None]:
#only mkt initiative unknown
unknown_channel_mkt_init_df = df.loc[(df['channel'] == 'X-UNKNOWN-X') & 
       (df['marketing_initiative'] == 'X-UNKNOWN-X')]
unknown_channel_mkt_init_df.shape

In [None]:
#only mkt subinitiative unknown
unknown_channel_mkt_subinit_df = df.loc[(df['channel'] == 'X-UNKNOWN-X') & 
       (df['marketing_subinitiative'] == 'X-UNKNOWN-X')]
unknown_channel_mkt_subinit_df.shape

In [None]:
unknown_mkt_init_and_subinit_df = df.loc[(df['channel'] == 'X-UNKNOWN-X') & 
          (df['marketing_initiative'] == 'X-UNKNOWN-X') & 
          (df['marketing_subinitiative'] == 'X-UNKNOWN-X')]
unknown_mkt_init_and_subinit_df.shape

In [None]:
unknown_mkt_segment_and_init_df = df.loc[(df['channel'] == 'X-UNKNOWN-X') & 
          (df['marketing_segment'] == 'X-UNKNOWN-X') & 
          (df['marketing_initiative'] == 'X-UNKNOWN-X')]
unknown_mkt_segment_and_init_df.shape

In [None]:
#only clickthrough conversions recorded 
unknown_channel_df = df[df['channel']== 'X-UNKNOWN-X']
unknown_channel_df[['channel','all_conversions_ct','all_conversions_vt','total_conversions']].sum()

In [None]:
#show the clickthrough conversions
#could relabel these as Paid Search???
unknown_channel_df[unknown_channel_df['all_conversions_ct'] > 0]

## Platform

In [None]:
df['platform'].value_counts()

In [None]:
df['platform'].value_counts(normalize = True)

In [None]:
df[df['platform'] == 'X-UNKNOWN-X']

## Site

In [None]:
df['site'].value_counts()

In [None]:
df[df['site'] == 'X-UNKNOWN-X']

# 0.5 Miscellaneous

In [None]:
# no longer using
unknown_data = {
    'account': [34],
    'campaign': [34],
    '_match': [34],
    'campaign_objective': [113623],
    'date': [0],
    'marketing_initiative': [47690],
    'marketing_segment': [146257],
    'marketing_subinitiative': [54373],
    'channel': [67093],
    'platform': [100]
}

In [None]:
# no longer using 
zero_data = {
    'spend': [287886],
    'clicks': [455041],
    'impressions': [257788],
    'digi_ada_conversions_ct': [848358],
    'games_conversions_ct': [866122],
    'edu_conversions_ct': [869552],
    'hd_conversions_ct': [863469], 
    'ck_conversions_ct': [866666],
    'core_conversions_ct': [839002],
    'all_conversions_ct': [837180],
    'digi_ada_conversions_vt': [743328],
    'games_conversions_vt': [850975],
    'edu_conversions_vt': [858446],
    'hd_conversions_vt': [853454],
    'ck_conversions_vt': [837689],
    'core_conversions_vt': [711931],
    'all_conversions_vt': [699470]   
}

In [None]:
#Site traffic
df['site'].value_counts()

In [None]:
df[df['site'] == 'DBM_US'].value_counts('channel')

In [None]:
# low volume vs high volume channels
df[df['channel'] == 'Video'][['all_conversions_vt', 'all_conversions_ct','total_conversions']].sum()

In [None]:
df[df['channel'] == 'Video'].head(15)

# 0.6 Investigate why cooking has no paid search

## Switch to _vw table fixed it

In [None]:
ck_df = df[df['marketing_subinitiative']== 'Cooking']

In [None]:
ck_df.head()

In [None]:
ck_df['channel'].value_counts()

In [None]:
paid_search_ck_df = ck_df[ck_df['channel'] == 'Paid Search']

In [None]:
paid_search_ck_df.shape

In [None]:
paid_search_ck_df.head()

In [None]:
# #drop cols you don't want in solution attempt
# df_copy = df_copy.drop(['account','campaign','_match','campaign_objective','Month','Quarter','week','week_start','date','marketing_initiative',
#               'marketing_segment','platform',
#              'digi_ada_conversions_ct','games_conversions_ct','edu_conversions_ct','hd_conversions_ct','ck_conversions_ct','core_conversions_ct',
#              'digi_ada_conversions_vt','games_conversions_vt','edu_conversions_vt','hd_conversions_vt','ck_conversions_vt','core_conversions_vt',
#              'year_week_monday','channel_mkt_init'], axis=1)
# df_copy.reset_index(inplace = True)