In [None]:
pip install researchpy

In [None]:
!pip install linearmodels

In [None]:
# data analysis and wrangling
import researchpy as rp
import pandas as pd
import numpy as np
import statistics
from itertools import chain
# visualization
from pandas.plotting import scatter_matrix
from statsmodels.graphics.tsaplots import plot_acf
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from pprint import pprint
%matplotlib inline
#plotly
import plotly.io as pio
import plotly.express as px
from plotly.offline import download_plotlyjs,init_notebook_mode, plot, iplot
import plotly as py 
import plotly.graph_objs as go # plotly graphical object
# setting the general visualization style
sns.set_style('whitegrid')
# feature engineering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#Libraries for Statistical Models
import statsmodels.api as sm
# Silence every Python warning for the rest of the session (no category filter is given).
# NOTE(review): this also hides pandas diagnostics such as SettingWithCopyWarning and
# FutureWarning raised by later cells; consider narrowing the filter.
import warnings 
warnings.filterwarnings('ignore') 
# Display the value of every top-level expression in a cell, not only the last one.
# Cells below rely on this when they list several bare expressions.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Build the regression dataset (df_reg) and a subset excluding four SKUs (df_had_promo),
# show a preview and summary statistics of each, and write both frames to CSV.
DATA_DIR = "/project/data_cleaning"
COVID_START = '2020-01-31'  # first date flagged as covid == 'Yes'

SKU_new = pd.read_csv(f"{DATA_DIR}/SKU_new.csv", index_col=0)
df_valid = pd.read_csv(f"{DATA_DIR}/df_valid.csv", index_col=0)

# Keep only SKUs that appear in df_valid and drop the 'Tesco' / 'Sainsbury' channels
# (presumably retailer-level totals of the sub-channels handled below -- TODO confirm).
# .copy() makes df_reg an independent frame, so the .loc assignments below write to
# df_reg itself rather than to a possible view of SKU_new.
keep_sku = SKU_new['full_name'].isin(df_valid['full_name'])
dropped_channel = SKU_new['channel'].isin(['Tesco', 'Sainsbury'])
df_reg = SKU_new[keep_sku & ~dropped_channel].copy()

# Store format for each sub-channel; rows of any other channel are left untouched.
FORMAT_BY_CHANNEL = {
    'Tesco Express': 'Express/Local',
    'Tesco excl Local': 'Excl.Express/Local',
    'Sainsbury Local': 'Express/Local',
    'Sainsbury excl Local': 'Excl.Express/Local',
}
for channel_name, store_format in FORMAT_BY_CHANNEL.items():
    df_reg.loc[df_reg['channel'] == channel_name, 'format'] = store_format

# Covid flag. `date` is compared against a string here, so this assumes the column
# holds ISO 'YYYY-MM-DD' text -- TODO confirm. Rows with a missing date get no flag.
df_reg.loc[df_reg['date'] >= COVID_START, 'covid'] = 'Yes'
df_reg.loc[df_reg['date'] < COVID_START, 'covid'] = 'No'

# Preview instead of dumping the whole frame into the notebook output.
df_reg.head()
df_reg.describe()
df_reg.to_csv(f"{DATA_DIR}/df_reg.csv")

# SKUs removed to form df_had_promo.
# NOTE(review): presumably these SKUs never ran a promotion -- confirm against the data.
SKUS_EXCLUDED_FROM_PROMO = [
    'BSCTS DGSTV MCVTS DGSTVS PLN 400 GM SNGL',
    'BSCTS RCH T MCVTS RCH T PLN 300 GM SNGL',
    'BSCTS MLTD PRVT LBL MLTD MLK 200 GM SNGL',
    'BSCTS FG RLLS PRVT LBL FG 200 GM SNGL',
]
df_had_promo = df_reg[~df_reg['full_name'].isin(SKUS_EXCLUDED_FROM_PROMO)].copy()

df_had_promo.head()
df_had_promo.describe()
df_had_promo.to_csv(f"{DATA_DIR}/df_had_promo.csv")

In [None]:
# Descriptive statistics of df_reg via researchpy: one summary for the columns
# treated as categorical, one for the columns treated as continuous.
categorical_cols = ['channel','company','brand','flavour','weight','categories']
continuous_cols = ['Sales','distribution','off_shelf','price_per_unit','price_per_kg']

rp.summary_cat(df_reg[categorical_cols])
rp.summary_cont(df_reg[continuous_cols])

In [None]:
from statsmodels.formula.api import ols

# OLS of price_per_unit on five categorical factors (each coded with Sum contrasts):
# the main effects written out explicitly, plus the full `*` interaction of all five.
price_formula = (
    "price_per_unit ~ "
    "C(company, Sum) + C(retailer, Sum) + C(format, Sum)+ C(covid, Sum)+C(categories, Sum)"
    " + "
    "C(company, Sum)*C(retailer, Sum)* C(format, Sum)*C(covid, Sum)*C(categories, Sum)"
)
model = ols(price_formula, data=df_reg).fit()

# Type 3 ANOVA table for the fitted model; the bare name on the last line displays it.
aov_table = sm.stats.anova_lm(model, typ=3)
aov_table

## Vanilla fixed-effects model

In [None]:
from linearmodels.panel import PanelOLS
import statsmodels.api as sm

# Panel regression of Sales on price, distribution, company and brand,
# with time effects enabled and entity effects disabled.
df_reg['date'] = pd.to_datetime(df_reg['date'])

# Append `date` to the existing row index to get a two-level index for PanelOLS.
# NOTE(review): this makes the existing row index the panel entity -- confirm that is
# intended rather than an identifier such as `full_name`.
df_fe = df_reg.set_index(['date'], append=True)

# 'brand' was previously written as 'brand ' (trailing space), which is not the column
# name used elsewhere in this notebook and fails the column lookup.
regressors = ['price_per_unit', 'price_per_kg', 'distribution', 'company', 'brand']
exog = sm.add_constant(df_fe[regressors])

fe_model = PanelOLS(df_fe['Sales'], exog, entity_effects=False, time_effects=True)
fe = fe_model.fit()  # `fe` holds the fitted results, as before
print(fe)

## Entity-demeaned OLS

In [None]:

# Entity-demeaned ("within") OLS: subtract each entity's own mean from the outcome and
# from the regressor, then fit OLS on the deviations (no constant is added).
#
# NOTE(review): this cell was an unfinished adaptation of a state-level beer-tax
# example. It read an undefined `df` and used columns (`state`, `fatal_rate`,
# `beertax`, `Mean_beerTax_byState`) that nothing in this notebook creates, so it
# could not run. It is completed here with `full_name` as the entity, `Sales` as the
# outcome and `price_per_unit` as the regressor -- confirm this is the intended model.
df_demean = df_reg.copy()

# Per-SKU means, broadcast back onto every row of that SKU.
by_sku = df_demean.groupby('full_name')
df_demean['Mean_price'] = by_sku['price_per_unit'].transform('mean')
df_demean['Mean_sales'] = by_sku['Sales'].transform('mean')

# Demean: subtract the entity mean from each row. New columns are used so the
# original Sales / price_per_unit values stay available in df_demean.
df_demean['Sales_demeaned'] = df_demean['Sales'] - df_demean['Mean_sales']
df_demean['price_demeaned'] = df_demean['price_per_unit'] - df_demean['Mean_price']

model = sm.OLS(df_demean['Sales_demeaned'], df_demean['price_demeaned'])
results2 = model.fit()
print(results2.summary())