In [27]:
import pandas as pd
from pandas.io import gbq
import numpy as np
import os
import glob
import re

In [28]:
import pydata_google_auth

In [29]:
    # OAuth scopes requested for the user credentials (Cloud Platform and Drive).
    connect_bq = [
        'https://www.googleapis.com/auth/cloud-platform',
        'https://www.googleapis.com/auth/drive',
    ]

    # Obtain user credentials for the scopes above; the resulting `credentials`
    # object is passed to pd.read_gbq further down.
    # NOTE(review): auth_local_webserver=True presumably selects the
    # local-webserver OAuth flow -- confirm against the pydata_google_auth docs.
    credentials = pydata_google_auth.get_user_credentials(
        connect_bq,
        auth_local_webserver=True,
    )

#### Query that retrieves, per account and month, the add-ons the account has and the add-ons included in its bundle

In [30]:
# BigQuery script. STRING_DEDUP splits a comma-separated string, removes duplicate
# items and re-joins them in sorted order. The query keeps rows whose
# final_status_restated contains 'paid' from 2021-01-01 through yesterday, truncates
# `day` to its month (change_month), normalizes both add-on lists with STRING_DEDUP
# and returns the distinct (change_month, account_code, bundle add-ons, add-ons) rows.
sql_query = """
CREATE TEMP FUNCTION STRING_DEDUP(str STRING) AS (
(SELECT STRING_AGG(item ORDER BY item) FROM (
SELECT DISTINCT item FROM UNNEST(SPLIT(str)) item 
)) 
);
----------------------------------------------------------------------------------------------------------------------
with daily_status as (
Select * , DATE_TRUNC(day, month) as change_month
from `fubotv-prod.data_insights.daily_status_static_update` t1
where final_status_restated like '%paid%'
and day >= '2021-01-01'
AND day <= current_date()-1
),
add_on as (
select account_code, change_month, STRING_DEDUP(add_ons) as addons_sorted,STRING_DEDUP(bundle_add_on_list) as bundle_addons_sorted
FROM daily_status
)
select DISTINCT change_month, account_code, bundle_addons_sorted, addons_sorted
from add_on
"""

In [56]:
# Run the query against the fubotv-prod project with the user credentials
# obtained earlier and load the result into a DataFrame.
df = pd.read_gbq(
    sql_query,
    project_id='fubotv-prod',
    credentials=credentials,
)

#### Checking / Housekeeping

In [57]:
# Preview the first rows of the query result.
df.head()

Unnamed: 0,change_month,account_code,bundle_addons_sorted,addons_sorted
0,2022-03-01,5d4f2d7a007d840001e3b65d,"advanced-dvr-250,third-screen","advanced-dvr-250,rsn-fee,third-screen,unlimite..."
1,2022-06-01,625f79aabec11e000174c880,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,news-plus,rs..."
2,2021-12-01,5edb13c5b128a80001378ebb,"advanced-dvr,third-screen","advanced-dvr,latino,third-screen"
3,2021-03-01,5734a4654c41890100d47b6c,"advanced-dvr,third-screen",third-screen
4,2022-06-01,5eadd90d1525120001f8873c,advanced-dvr-250,"advanced-dvr,extra"


In [58]:
# Summarize the frame: row count, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18296490 entries, 0 to 18296489
Data columns (total 4 columns):
 #   Column                Dtype         
---  ------                -----         
 0   change_month          datetime64[ns]
 1   account_code          object        
 2   bundle_addons_sorted  object        
 3   addons_sorted         object        
dtypes: datetime64[ns](1), object(3)
memory usage: 558.4+ MB


##### Discard rows with a missing value in any column (e.g. an empty bundle add-ons list)

In [59]:
# Drop every row that has a missing value in ANY column. The set comparison in
# the next step calls .split() on both add-on columns, so neither may be null.
df = df.dropna()

In [35]:
# Re-check row count and dtypes after dropping rows with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17269293 entries, 0 to 18296489
Data columns (total 4 columns):
 #   Column                Dtype         
---  ------                -----         
 0   change_month          datetime64[ns]
 1   account_code          object        
 2   bundle_addons_sorted  object        
 3   addons_sorted         object        
dtypes: datetime64[ns](1), object(3)
memory usage: 658.8+ MB


##### Comparing the bundle add-ons to the account's add-ons, identifying the add-ons that are not part of the bundle, and joining them with commas into a new field called nonbundle_addons

In [60]:
# For each row, keep the add-ons the account has that are NOT part of its bundle
# (set difference: addons_sorted minus bundle_addons_sorted) and store them as a
# comma-separated string. A row with nothing left over gets an empty string.
# sorted() makes the joined text deterministic: plain set iteration order for
# strings depends on the interpreter's hash seed and can change between kernels.
df['nonbundle_addons'] = [
    ','.join(sorted(set(owned.split(',')) - set(bundled.split(','))))
    for bundled, owned in zip(df['bundle_addons_sorted'], df['addons_sorted'])
]

In [61]:
# Display the frame (pandas truncates the repr) to inspect the new
# nonbundle_addons column.
df

Unnamed: 0,change_month,account_code,bundle_addons_sorted,addons_sorted,nonbundle_addons
0,2022-03-01,5d4f2d7a007d840001e3b65d,"advanced-dvr-250,third-screen","advanced-dvr-250,rsn-fee,third-screen,unlimite...","unlimited-screen-home,rsn-fee"
1,2022-06-01,625f79aabec11e000174c880,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,news-plus,rs...","news-plus,rsn-fee"
2,2021-12-01,5edb13c5b128a80001378ebb,"advanced-dvr,third-screen","advanced-dvr,latino,third-screen",latino
3,2021-03-01,5734a4654c41890100d47b6c,"advanced-dvr,third-screen",third-screen,
4,2022-06-01,5eadd90d1525120001f8873c,advanced-dvr-250,"advanced-dvr,extra","advanced-dvr,extra"
...,...,...,...,...,...
18296485,2021-11-01,6154bd4f6d0058000113a797,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,rsn-fee,thir...",rsn-fee
18296486,2021-06-01,60021a0ac44a8f0001ae0f53,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,rsn-fee,thir...",rsn-fee
18296487,2022-05-01,617480ea9aca270001dcbda1,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,rsn-fee,thir...",rsn-fee
18296488,2021-11-01,6168dec7b2c7ad0001dcf1a1,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,rsn-fee,thir...",rsn-fee


#### Splitting the nonbundle_addons column into lists and exploding them into one row per add-on

In [62]:
# Turn the comma-separated string into a list per row, then emit one row per
# list element. An empty string splits to [''], so rows without any non-bundle
# add-on survive as a single row whose nonbundle_addons is ''.
df['nonbundle_addons'] = df['nonbundle_addons'].str.split(',')
# reset_index(drop=True) replaces the repeated index labels that explode()
# leaves behind with a fresh 0..n-1 index.
# (The former `cols = list(df.columns); df = df[cols]` re-selected the same
# columns in the same order -- a no-op that only copied the frame -- and is gone.)
df = df.explode('nonbundle_addons').reset_index(drop=True)

In [63]:
# Check the row count after exploding to one row per non-bundle add-on.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18767739 entries, 0 to 18767738
Data columns (total 5 columns):
 #   Column                Dtype         
---  ------                -----         
 0   change_month          datetime64[ns]
 1   account_code          object        
 2   bundle_addons_sorted  object        
 3   addons_sorted         object        
 4   nonbundle_addons      object        
dtypes: datetime64[ns](1), object(4)
memory usage: 715.9+ MB


In [41]:
# Preview the exploded frame: nonbundle_addons now holds a single add-on per row.
df.head()

Unnamed: 0,change_month,account_code,bundle_addons_sorted,addons_sorted,nonbundle_addons
0,2022-02-01,61b79491bc49a800014cf8e3,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,deportes,fubo-extra-lite,rsn...",deportes
1,2022-02-01,61b79491bc49a800014cf8e3,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,deportes,fubo-extra-lite,rsn...",rsn-fee
2,2021-10-01,602f08ada99b060001ec72de,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,amc-premiere,fubo-extra-lite...",amc-premiere
3,2022-05-01,5f99b0ecc89fdc00019e9234,"4k-screen,advanced-dvr-1000,fubo-extra-lite,ne...","4k-screen,advanced-dvr-1000,fubo-extra-lite,ne...",
4,2021-09-01,5fce5f8223e3470001f44351,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,deportes,fubo-extra-lite,thi...",deportes


In [64]:
# The two source list columns are no longer needed once nonbundle_addons has
# been derived from them; keep only month, account and the single add-on.
df = df.drop(columns=['addons_sorted', 'bundle_addons_sorted'])

In [65]:
# Collapse identical (change_month, account_code, nonbundle_addons) rows; these
# can repeat because the dropped list columns were part of the query's DISTINCT key.
df = df.drop_duplicates()

In [44]:
# Display the de-duplicated frame (pandas truncates the repr).
df

Unnamed: 0,change_month,account_code,nonbundle_addons
0,2022-02-01,61b79491bc49a800014cf8e3,deportes
1,2022-02-01,61b79491bc49a800014cf8e3,rsn-fee
2,2021-10-01,602f08ada99b060001ec72de,amc-premiere
3,2022-05-01,5f99b0ecc89fdc00019e9234,
4,2021-09-01,5fce5f8223e3470001f44351,deportes
...,...,...,...
18767732,2022-06-01,60b56dd40aa35c00015192f3,fubo-extra-lite
18767733,2022-05-01,600c808101d46400017e6068,rsn-fee
18767734,2021-05-01,6088a805408e240001ca2516,rsn-fee
18767735,2021-10-01,613e3d66eb4721000166b76c,rsn-fee


#### Aggregating by month and nonbundle_addons to count distinct account_code

In [85]:
# Count the distinct accounts for every (month, non-bundle add-on) pair.
# The count keeps the original column name `account_code`, so from here on
# that column holds a number of accounts rather than an account id.
df = (
    df.groupby(['change_month', 'nonbundle_addons'], as_index=False)
      .agg(account_code=('account_code', 'nunique'))
)

In [105]:
# Remove the groups that should not be reported: the 'rsn-fee' entry and the
# empty string (rows where the account had no add-on outside its bundle).
# One isin() mask replaces three successive filters, two of which ('' and "")
# were the same comparison written twice.
df = df[~df['nonbundle_addons'].isin(['rsn-fee', ''])]

In [106]:
# Final shape check of the aggregated counts before exporting.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 522 entries, 1 to 557
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   change_month      522 non-null    datetime64[ns]
 1   nonbundle_addons  522 non-null    object        
 2   account_code      522 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 16.3+ KB


In [107]:
# Write the aggregated counts to a CSV file, without the index column.
# NOTE(review): this is an absolute, user-specific path, so the cell only works
# on the author's machine -- consider a configurable output directory.
df.to_csv('/Users/nanditanandakumar/Dropbox (fuboTV)/Business Analytics/Team/Nandita/Git/Addons_Project/NonBundle_AddOns - Count.csv', index =False)

In [108]:
# Publish the counts to BigQuery (project fubotv-dev), replacing the table if it
# already exists. Pass the user credentials obtained earlier in the notebook,
# as the read_gbq call does, instead of letting pandas-gbq resolve credentials
# on its own (which can trigger a second auth flow or pick a different identity).
df.to_gbq(
    destination_table='business_analytics.temp_nonbundle_addon_counts',
    project_id='fubotv-dev',
    if_exists='replace',
    credentials=credentials,
)