In [1]:
import pandas as pd
from pandas.io import gbq
import numpy as np
import os
import glob
import re

In [2]:
import pydata_google_auth

In [3]:
    connect_bq = [
        'https://www.googleapis.com/auth/cloud-platform',
        'https://www.googleapis.com/auth/drive',
    ]

    credentials = pydata_google_auth.get_user_credentials(
        connect_bq,
        auth_local_webserver=True,
    )

#### Query that retrieves add_ons per user for a month and the add ons that a bundle has

In [4]:
sql_query = """
CREATE TEMP FUNCTION STRING_DEDUP(str STRING) AS (
(SELECT STRING_AGG(item ORDER BY item) FROM (
SELECT DISTINCT item FROM UNNEST(SPLIT(str)) item 
)) 
);
----------------------------------------------------------------------------------------------------------------------
with daily_status as (
Select * , DATE_TRUNC(day, month) as change_month
from `fubotv-prod.data_insights.daily_status_static_update` t1
where final_status_restated like '%paid%'
and day >= '2021-01-01'
AND day <= current_date()-1
),
add_on as (
select account_code, change_month, STRING_DEDUP(add_ons) as addons_sorted,STRING_DEDUP(bundle_add_on_list) as bundle_addons_sorted
FROM daily_status
)
select DISTINCT change_month, account_code, bundle_addons_sorted, addons_sorted
from add_on
"""

In [6]:
df = pd.read_gbq(
    sql_query,
    project_id='fubotv-prod',
    credentials=credentials,
)

#### Checking / Housekeeping

In [7]:
df.head()

Unnamed: 0,change_month,account_code,bundle_addons_sorted,addons_sorted
0,2021-05-01,5f35eea88374d800018a52d5,"advanced-dvr,showtime,third-screen","advanced-dvr,showtime,third-screen"
1,2021-12-01,5d29fdbbb021e30001f492ac,"advanced-dvr-250,third-screen","advanced-dvr-250,adventure,third-screen"
2,2021-01-01,5e167a959ff70b0001d582b6,"advanced-dvr,third-screen","advanced-dvr,deportes,rsn-fee,third-screen,unl..."
3,2022-06-01,6147ace4bad80c00014e68d0,"advanced-dvr-1000,third-screen,unlimited-scree...","advanced-dvr-1000,epix,rsn-fee,starz,third-scr..."
4,2021-12-01,607f603e62831a0001cf7a78,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,intl-sports-..."


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18296490 entries, 0 to 18296489
Data columns (total 4 columns):
 #   Column                Dtype         
---  ------                -----         
 0   change_month          datetime64[ns]
 1   account_code          object        
 2   bundle_addons_sorted  object        
 3   addons_sorted         object        
dtypes: datetime64[ns](1), object(3)
memory usage: 558.4+ MB


##### Remove nulls

In [9]:
df = df.dropna()

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17269293 entries, 0 to 18296489
Data columns (total 4 columns):
 #   Column                Dtype         
---  ------                -----         
 0   change_month          datetime64[ns]
 1   account_code          object        
 2   bundle_addons_sorted  object        
 3   addons_sorted         object        
dtypes: datetime64[ns](1), object(3)
memory usage: 658.8+ MB


##### Comparing bundle add_ons to the add_ons field and identifying the difference between these two fields and joining each add_on by a comma to a new field called new_addition

In [11]:
df['new_addition'] = [','.join(set(n.split(',')) - set(o.split(','))) 
                                                          for o, n in zip(df.bundle_addons_sorted, df.addons_sorted)]

In [12]:
df

Unnamed: 0,change_month,account_code,bundle_addons_sorted,addons_sorted,new_addition
0,2021-05-01,5f35eea88374d800018a52d5,"advanced-dvr,showtime,third-screen","advanced-dvr,showtime,third-screen",
1,2021-12-01,5d29fdbbb021e30001f492ac,"advanced-dvr-250,third-screen","advanced-dvr-250,adventure,third-screen",adventure
2,2021-01-01,5e167a959ff70b0001d582b6,"advanced-dvr,third-screen","advanced-dvr,deportes,rsn-fee,third-screen,unl...","rsn-fee,deportes,unlimited-screen-home"
3,2022-06-01,6147ace4bad80c00014e68d0,"advanced-dvr-1000,third-screen,unlimited-scree...","advanced-dvr-1000,epix,rsn-fee,starz,third-scr...","rsn-fee,starz,epix"
4,2021-12-01,607f603e62831a0001cf7a78,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,intl-sports-...",intl-sports-plus
...,...,...,...,...,...
18296485,2021-07-01,5e14eb4db0631800017e78c5,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,rsn-fee,thir...",rsn-fee
18296486,2022-01-01,5efe133a9ab22100017bc961,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,rsn-fee,thir...",rsn-fee
18296487,2022-03-01,61452815b8474700019dc2da,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,rsn-fee,thir...",rsn-fee
18296488,2022-04-01,5fe943971f58fc000198154e,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,rsn-fee,thir...",rsn-fee


#### Splitting new_addition column into additional columns and exploding them into individual rows

In [14]:
df['new_addition'] = df['new_addition'].str.split(',')
df = df.explode('new_addition').reset_index(drop=True)
cols = list(df.columns)
df = df[cols]

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18767739 entries, 0 to 18767738
Data columns (total 5 columns):
 #   Column                Dtype         
---  ------                -----         
 0   change_month          datetime64[ns]
 1   account_code          object        
 2   bundle_addons_sorted  object        
 3   addons_sorted         object        
 4   new_addition          object        
dtypes: datetime64[ns](1), object(4)
memory usage: 715.9+ MB


In [16]:
df.to_csv('/Users/nanditanandakumar/Dropbox (fuboTV)/Business Analytics/Team/Nandita/Git/Add_Ons_Project/All data.csv', index =False)

In [17]:
df.head()

Unnamed: 0,change_month,account_code,bundle_addons_sorted,addons_sorted,new_addition
0,2021-05-01,5f35eea88374d800018a52d5,"advanced-dvr,showtime,third-screen","advanced-dvr,showtime,third-screen",
1,2021-12-01,5d29fdbbb021e30001f492ac,"advanced-dvr-250,third-screen","advanced-dvr-250,adventure,third-screen",adventure
2,2021-01-01,5e167a959ff70b0001d582b6,"advanced-dvr,third-screen","advanced-dvr,deportes,rsn-fee,third-screen,unl...",rsn-fee
3,2021-01-01,5e167a959ff70b0001d582b6,"advanced-dvr,third-screen","advanced-dvr,deportes,rsn-fee,third-screen,unl...",deportes
4,2021-01-01,5e167a959ff70b0001d582b6,"advanced-dvr,third-screen","advanced-dvr,deportes,rsn-fee,third-screen,unl...",unlimited-screen-home


In [18]:
df = df.drop(['addons_sorted', 'bundle_addons_sorted'], axis=1)

In [19]:
df

Unnamed: 0,change_month,account_code,new_addition
0,2021-05-01,5f35eea88374d800018a52d5,
1,2021-12-01,5d29fdbbb021e30001f492ac,adventure
2,2021-01-01,5e167a959ff70b0001d582b6,rsn-fee
3,2021-01-01,5e167a959ff70b0001d582b6,deportes
4,2021-01-01,5e167a959ff70b0001d582b6,unlimited-screen-home
...,...,...,...
18767734,2021-07-01,5e14eb4db0631800017e78c5,rsn-fee
18767735,2022-01-01,5efe133a9ab22100017bc961,rsn-fee
18767736,2022-03-01,61452815b8474700019dc2da,rsn-fee
18767737,2022-04-01,5fe943971f58fc000198154e,rsn-fee


In [20]:
df = df.drop_duplicates()
df = df.dropna()

In [21]:
df

Unnamed: 0,change_month,account_code,new_addition
0,2021-05-01,5f35eea88374d800018a52d5,
1,2021-12-01,5d29fdbbb021e30001f492ac,adventure
2,2021-01-01,5e167a959ff70b0001d582b6,rsn-fee
3,2021-01-01,5e167a959ff70b0001d582b6,deportes
4,2021-01-01,5e167a959ff70b0001d582b6,unlimited-screen-home
...,...,...,...
18767734,2021-07-01,5e14eb4db0631800017e78c5,rsn-fee
18767735,2022-01-01,5efe133a9ab22100017bc961,rsn-fee
18767736,2022-03-01,61452815b8474700019dc2da,rsn-fee
18767737,2022-04-01,5fe943971f58fc000198154e,rsn-fee


#### Aggregating by month and new_addition to count distinct account_code

In [22]:
df = df.groupby(['change_month', 'new_addition'],  as_index=False ).agg({'account_code': 'nunique'})

In [23]:
print(df)

    change_month           new_addition  account_code
0     2021-01-01                               445184
1     2021-01-01           advanced-dvr         11215
2     2021-01-01      advanced-dvr-1000          3508
3     2021-01-01       advanced-dvr-250         56869
4     2021-01-01              adventure          6088
..           ...                    ...           ...
553   2022-06-01                  starz          1910
554   2022-06-01    starz-epix-showtime          6572
555   2022-06-01           third-screen          2226
556   2022-06-01              tv5-monde           707
557   2022-06-01  unlimited-screen-home          3103

[558 rows x 3 columns]


In [24]:
df.to_csv('/Users/nanditanandakumar/Dropbox (fuboTV)/Business Analytics/Team/Nandita/Git/Add_Ons_Project/Count of Users per Add On - By Month.csv', index =False)