In [1]:
import pandas as pd
import numpy as np
import os.path as op
import matplotlib.pyplot as plt

In [25]:
from ta_lib.core.api import create_context, list_datasets, load_dataset, display_as_tabs,merge_info

In [3]:
config_path = op.join('conf', 'config.yml')
context = create_context(config_path)

In [19]:
google_df = load_dataset(context, 'cleaned/google')
product_df = load_dataset(context, 'cleaned/product')
sales_df = load_dataset(context, 'cleaned/sales')
media_df = load_dataset(context, 'cleaned/media')
theme_list_df = load_dataset(context, 'cleaned/theme_list')
tpl_df = load_dataset(context, 'cleaned/tpl')

### Emerging Theme in the google search data

In [5]:
google_df['week_day'] = pd.to_datetime(google_df['date']).dt.dayofweek
google_df['week'] = pd.to_datetime(google_df['date']).dt.week
google_df['month'] = pd.to_datetime(google_df['date']).dt.month
google_df['quarter'] = pd.to_datetime(google_df['date']).dt.quarter
google_df['year'] = pd.to_datetime(google_df['date']).dt.year

In [6]:
g_df = google_df.drop(columns=['date','week_number','year_new'])
g_df = pd.merge(g_df,theme_list_df,on= 'claim_id', how = 'inner')
g_df = g_df.drop(columns='claim_id')
g_df

Unnamed: 0,platform,search_volume,week_day,week,month,quarter,year,claim_name
0,google,349,6,1,1,1,2014,gluten free
1,google,349,0,2,1,1,2014,gluten free
2,google,697,1,2,1,1,2014,gluten free
3,google,349,4,2,1,1,2014,gluten free
4,google,697,0,4,1,1,2014,gluten free
...,...,...,...,...,...,...,...,...
181560,google,42,2,30,7,3,2019,snickerdoodle
181561,google,84,2,39,9,3,2019,snickerdoodle
181562,amazon,135,0,23,6,2,2018,hemp seeds
181563,amazon,39,3,5,1,1,2019,gingerbread


In [7]:
g_df.claim_name.nunique()

160

In [8]:
d = g_df.groupby(['year','claim_name'])['search_volume'].sum().reset_index()
# d.sort_values(by= ['search_volume'], ascending=[False],inplace=True)

In [9]:
df = d.groupby(['year','claim_name'])['search_volume'].sum()

In [10]:
df.sort_values( ascending=False,inplace=True)
df = df.sort_index(level = 0, sort_remaining=False).to_frame().reset_index()

In [11]:
# year = df.year.unique()
# for i in year:
#     print(df[df.year == i].iloc[:5])

In [12]:
print(df[df.year == 2019].iloc[:5].reset_index(drop=True))

   year             claim_name  search_volume
0  2019  ethical - environment       14703112
1  2019                  honey       13585784
2  2019                 shrimp       12736320
3  2019             sugar free       10484779
4  2019       health (passive)       10377761


### Feature Table

In [13]:
import ta_lib.eda.api as eda

In [21]:
google_df = google_df.drop(columns=['week_number','year_new'])

In [28]:
sum1 = eda.get_variable_summary(google_df)
sum2 = eda.get_variable_summary(product_df)
sum3 = eda.get_variable_summary(sales_df)
sum4 = eda.get_variable_summary(media_df)
sum5 = eda.get_variable_summary(theme_list_df)
sum6 = eda.get_variable_summary(tpl_df)

display_as_tabs([('google', sum1), ('product', sum2), ('sales', sum3)
                , ('media', sum4), ('theme_list', sum5), ('theme_production_list', sum6)])

In [23]:
# GOOGLE
google_df['week'] = pd.to_datetime(google_df['date']).dt.week
google_df['month'] = pd.to_datetime(google_df['date']).dt.month
google_df['quarter'] = pd.to_datetime(google_df['date']).dt.quarter
google_df['year'] = pd.to_datetime(google_df['date']).dt.year

# SALES
sales_df['week'] = pd.to_datetime(sales_df['system_calendar_key_n']).dt.week
sales_df['month'] = pd.to_datetime(sales_df['system_calendar_key_n']).dt.month
sales_df['quarter'] = pd.to_datetime(sales_df['system_calendar_key_n']).dt.quarter
sales_df['year'] = pd.to_datetime(sales_df['system_calendar_key_n']).dt.year

# MEDIA
media_df['week'] = pd.to_datetime(media_df['published_date']).dt.week
media_df['month'] = pd.to_datetime(media_df['published_date']).dt.month
media_df['quarter'] = pd.to_datetime(media_df['published_date']).dt.quarter
media_df['year'] = pd.to_datetime(media_df['published_date']).dt.year

In [27]:
google_df.drop(columns=['date'],inplace=True)
sales_df.drop(columns=['system_calendar_key_n'],inplace=True)
media_df.drop(columns=['published_date'],inplace=True)

In [29]:
fs_df = pd.merge(sales_df, product_df, on='product_id', how= 'inner')
merge_info(sales_df,product_df, fs_df)

Unnamed: 0,n_cols,n_rows
left_df,8,4526182
right_df,2,67175
merged_df,9,4526182


In [30]:
theme_df = pd.merge(theme_list_df, tpl_df, on='claim_id', how= 'inner')
merge_info(theme_list_df,tpl_df, theme_df)

Unnamed: 0,n_cols,n_rows
left_df,2,208
right_df,2,91485
merged_df,3,91485


In [31]:
final_sales_df = pd.merge(fs_df, theme_df, on='product_id', how= 'inner')
merge_info(fs_df,theme_df, final_sales_df)

Unnamed: 0,n_cols,n_rows
left_df,9,4526182
right_df,3,91485
merged_df,11,7767420


In [32]:
final_sales_df.head()

Unnamed: 0,product_id,sales_dollars_value,sales_units_value,sales_lbs_value,week,month,quarter,year,vendor,claim_id,claim_name
0,1,13927.0,934,18680,1,1,1,2016,Others,0,No Claim
1,1,12628.0,878,17564,3,1,1,2016,Others,0,No Claim
2,1,11379.0,810,16200,5,2,1,2016,Others,0,No Claim
3,1,11568.0,821,16424,4,1,1,2016,Others,0,No Claim
4,1,10959.0,784,15682,6,2,1,2016,Others,0,No Claim


In [44]:
final_sales_df =final_sales_df.groupby(
                ['year', 'quarter', 'month', 'week', 'claim_id', 'vendor']
                )['sales_dollars_value'].sum().to_frame().reset_index()

In [45]:
gf_df = google_df.groupby(
        ['year', 'quarter', 'month', 'week', 'claim_id']
        )['search_volume'].sum().to_frame().reset_index()
gf_df.head()

Unnamed: 0,year,quarter,month,week,claim_id,search_volume
0,2014,1,1,1,8,30266
1,2014,1,1,1,39,272
2,2014,1,1,1,75,4448
3,2014,1,1,1,81,5207
4,2014,1,1,1,100,4613


In [46]:
media_df.rename(columns={'theme_id': 'claim_id'},inplace=True)

In [47]:
mf_df = media_df.groupby(
        ['year', 'quarter', 'month', 'week', 'claim_id']
        )['total_post'].sum().to_frame().reset_index()
mf_df.head()

Unnamed: 0,year,quarter,month,week,claim_id,total_post
0,2015,2,5,21,8.0,412
1,2015,2,5,21,15.0,4
2,2015,2,5,21,26.0,0
3,2015,2,5,21,38.0,4
4,2015,2,5,21,39.0,10


In [53]:
display_as_tabs([('google', gf_df.shape), ('media', mf_df.shape), ('sales', final_sales_df.shape)])

In [52]:
sum1 = eda.get_variable_summary(gf_df)
sum2 = eda.get_variable_summary(mf_df)
sum3 = eda.get_variable_summary(final_sales_df)

display_as_tabs([('google', sum1), ('media', sum2), ('sales', sum3)])

In [54]:
f_df = gf_df.merge(mf_df, on=['year', 'quarter', 'month', 'week', 'claim_id'], how='inner')
merge_info(g_df,mf_df,f_df)

Unnamed: 0,n_cols,n_rows
left_df,8,181565
right_df,6,53421
merged_df,7,25579


In [55]:
df = f_df.merge(final_sales_df, on=['year', 'quarter', 'month', 'week', 'claim_id'], how='inner')
merge_info(f_df,final_sales_df,df)

Unnamed: 0,n_cols,n_rows
left_df,7,25579
right_df,7,34926
merged_df,9,15362


In [58]:
df

Unnamed: 0,year,quarter,month,week,claim_id,search_volume,total_post,vendor,sales_dollars_value
0,2016,1,1,1,8,51122,1123,A,8853853.0
1,2016,1,1,1,8,51122,1123,B,2585845.0
2,2016,1,1,1,8,51122,1123,D,9742859.0
3,2016,1,1,1,8,51122,1123,F,14347182.0
4,2016,1,1,1,8,51122,1123,H,661656.0
...,...,...,...,...,...,...,...,...,...
15357,2019,4,10,40,438,12938,1091,D,1413231.0
15358,2019,4,10,40,438,12938,1091,F,1311685.0
15359,2019,4,10,40,438,12938,1091,H,1075693.0
15360,2019,4,10,40,438,12938,1091,Others,4495238.0
