In [49]:
import pandas as pd
from pandas.io import gbq
import numpy as np
import os
import glob
import re

In [50]:
import pydata_google_auth

In [78]:
    connect_bq = [
        'https://www.googleapis.com/auth/cloud-platform',
        'https://www.googleapis.com/auth/drive',
    ]

    credentials = pydata_google_auth.get_user_credentials(
        connect_bq,
        auth_local_webserver=True,
    )

#### Query that retrieves add_ons per user for a month and new addition to the addons for that same month sorted in order as "ordered_updated_addons"

In [79]:
sql_query = """
# June 1st Add-Ons
CREATE TEMP FUNCTION STRING_DEDUP(str STRING) AS (
(SELECT STRING_AGG(item ORDER BY item) FROM (
SELECT DISTINCT item FROM UNNEST(SPLIT(str)) item 
)) 
);
----------------------------------------------------------------------------------------------------------------------
with daily_status as (
Select * , DATE_TRUNC(day, month) as change_month
from `fubotv-prod.data_insights.daily_status_static_update` t1
where final_status_restated like '%paid%'
AND day = current_date()-1
),
add_on as (
select account_code, change_month, STRING_DEDUP(add_ons) as addons_sorted,STRING_DEDUP(bundle_add_on_list) as bundle_addons_sorted
FROM daily_status
)
select DISTINCT change_month, account_code, bundle_addons_sorted, addons_sorted
from add_on
ORDER BY 1,2
LIMIT 1000
"""

In [97]:
df = pd.read_gbq(
    sql_query,
    project_id='fubotv-prod',
    credentials=credentials,
)

In [98]:
df.head()

Unnamed: 0,change_month,account_code,bundle_addons_sorted,addons_sorted
0,2022-06-01,53cd5564c44bf00200483fd8,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,rsn-fee,thir..."
1,2022-06-01,542cc7a4c95b4a0200cf1f7f,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,third-screen..."
2,2022-06-01,5430e0c50d22c6020023a64e,"advanced-dvr-1000,third-screen,unlimited-scree...","advanced-dvr-1000,rsn-fee,third-screen,unlimit..."
3,2022-06-01,5430ea839ca3180200fea345,,
4,2022-06-01,543204aadea93902000052d1,"advanced-dvr-1000,third-screen,unlimited-scree...","advanced-dvr-1000,third-screen,unlimited-scree..."


In [None]:
old_df = df #fail safe

In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   change_month          1000 non-null   datetime64[ns]
 1   account_code          1000 non-null   object        
 2   bundle_addons_sorted  980 non-null    object        
 3   addons_sorted         985 non-null    object        
dtypes: datetime64[ns](1), object(3)
memory usage: 31.4+ KB


In [100]:
df = df.fillna(" ")

In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   change_month          1000 non-null   datetime64[ns]
 1   account_code          1000 non-null   object        
 2   bundle_addons_sorted  1000 non-null   object        
 3   addons_sorted         1000 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 31.4+ KB


In [103]:
df.head()

Unnamed: 0,change_month,account_code,bundle_addons_sorted,addons_sorted
0,2022-06-01,53cd5564c44bf00200483fd8,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,rsn-fee,thir..."
1,2022-06-01,542cc7a4c95b4a0200cf1f7f,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,third-screen..."
2,2022-06-01,5430e0c50d22c6020023a64e,"advanced-dvr-1000,third-screen,unlimited-scree...","advanced-dvr-1000,rsn-fee,third-screen,unlimit..."
3,2022-06-01,5430ea839ca3180200fea345,,
4,2022-06-01,543204aadea93902000052d1,"advanced-dvr-1000,third-screen,unlimited-scree...","advanced-dvr-1000,third-screen,unlimited-scree..."


###### Comparing ordered_updated_addons with ordered_previous_addons and get the difference by splitting the objects into sets. Making that a new column new_addition

In [105]:
df['new_addition'] = [','.join(set(n.split(',')) - set(o.split(','))) 
                                                          for o, n in zip(df.bundle_addons_sorted, df.addons_sorted)]

In [106]:
df

Unnamed: 0,change_month,account_code,bundle_addons_sorted,addons_sorted,new_addition
0,2022-06-01,53cd5564c44bf00200483fd8,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,rsn-fee,thir...",rsn-fee
1,2022-06-01,542cc7a4c95b4a0200cf1f7f,"advanced-dvr-1000,fubo-extra-lite,third-screen...","advanced-dvr-1000,fubo-extra-lite,third-screen...",
2,2022-06-01,5430e0c50d22c6020023a64e,"advanced-dvr-1000,third-screen,unlimited-scree...","advanced-dvr-1000,rsn-fee,third-screen,unlimit...",rsn-fee
3,2022-06-01,5430ea839ca3180200fea345,,,
4,2022-06-01,543204aadea93902000052d1,"advanced-dvr-1000,third-screen,unlimited-scree...","advanced-dvr-1000,third-screen,unlimited-scree...",
...,...,...,...,...,...
995,2022-06-01,55e32e6b4651a00f0018910d,advanced-dvr-250,advanced-dvr,advanced-dvr
996,2022-06-01,55e3329ece3a600f00d001de,"advanced-dvr-250,third-screen","advanced-dvr-250,third-screen",
997,2022-06-01,55e3392d78cf680700d6b4ac,"advanced-dvr-1000,third-screen,unlimited-scree...","advanced-dvr-1000,rsn-fee,third-screen,unlimit...",rsn-fee
998,2022-06-01,55e33cdb4651a00f0018914a,"advanced-dvr-250,third-screen","advanced-dvr-250,third-screen",


#### Splitting new_addition column into additional columns and exploding them into individual rows

In [107]:
new_df = df #Fail safe

In [108]:
new_df['new_addition'] = new_df['new_addition'].str.split(',')
new_df = new_df.explode('new_addition').reset_index(drop=True)
cols = list(new_df.columns)
new2_df = new_df[cols]

In [109]:
new2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   change_month          1115 non-null   datetime64[ns]
 1   account_code          1115 non-null   object        
 2   bundle_addons_sorted  1115 non-null   object        
 3   addons_sorted         1115 non-null   object        
 4   new_addition          1115 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 43.7+ KB


In [110]:
new2_df.to_csv('Test-All data.csv', index =False)

In [111]:
new2_df = new2_df.drop(['addons_sorted', 'bundle_addons_sorted'], axis=1)

In [112]:
new2_df

Unnamed: 0,change_month,account_code,new_addition
0,2022-06-01,53cd5564c44bf00200483fd8,rsn-fee
1,2022-06-01,542cc7a4c95b4a0200cf1f7f,
2,2022-06-01,5430e0c50d22c6020023a64e,rsn-fee
3,2022-06-01,5430ea839ca3180200fea345,
4,2022-06-01,543204aadea93902000052d1,
...,...,...,...
1110,2022-06-01,55e32e6b4651a00f0018910d,advanced-dvr
1111,2022-06-01,55e3329ece3a600f00d001de,
1112,2022-06-01,55e3392d78cf680700d6b4ac,rsn-fee
1113,2022-06-01,55e33cdb4651a00f0018914a,


In [113]:
new2_df = new2_df.drop_duplicates()

In [114]:
new2_df

Unnamed: 0,change_month,account_code,new_addition
0,2022-06-01,53cd5564c44bf00200483fd8,rsn-fee
1,2022-06-01,542cc7a4c95b4a0200cf1f7f,
2,2022-06-01,5430e0c50d22c6020023a64e,rsn-fee
3,2022-06-01,5430ea839ca3180200fea345,
4,2022-06-01,543204aadea93902000052d1,
...,...,...,...
1110,2022-06-01,55e32e6b4651a00f0018910d,advanced-dvr
1111,2022-06-01,55e3329ece3a600f00d001de,
1112,2022-06-01,55e3392d78cf680700d6b4ac,rsn-fee
1113,2022-06-01,55e33cdb4651a00f0018914a,


In [115]:
new3_df = new2_df

In [116]:
new3_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1115 entries, 0 to 1114
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   change_month  1115 non-null   datetime64[ns]
 1   account_code  1115 non-null   object        
 2   new_addition  1115 non-null   object        
dtypes: datetime64[ns](1), object(2)
memory usage: 34.8+ KB


In [117]:
new4_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   change_month  27 non-null     datetime64[ns]
 1   new_addition  27 non-null     object        
 2   account_code  27 non-null     int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 776.0+ bytes


df = df.groupby(['id','product']).agg({'quantity':'sum'}).reset_index()


In [118]:
new4_df = new4_df.groupby(['change_month', 'new_addition'],  as_index=False ).agg({'account_code': 'nunique'})


In [119]:
print(new4_df)

   change_month           new_addition  account_code
0    2022-06-01                                    1
1    2022-06-01              4k-screen             1
2    2022-06-01           advanced-dvr             1
3    2022-06-01      advanced-dvr-1000             1
4    2022-06-01       advanced-dvr-250             1
5    2022-06-01              adventure             1
6    2022-06-01           amc-premiere             1
7    2022-06-01               deportes             1
8    2022-06-01          entertainment             1
9    2022-06-01       entertainment-v2             1
10   2022-06-01                  extra             1
11   2022-06-01        fubo-extra-lite             1
12   2022-06-01       intl-sports-plus             1
13   2022-06-01                 latino             1
14   2022-06-01              news-plus             1
15   2022-06-01                pantaya             1
16   2022-06-01             portuguese             1
17   2022-06-01               premiere        

In [120]:
new4_df.to_csv('Test-Count of Users - 2000 entries for June 2022.csv', index =False)