In [2]:
import numpy as np
import pandas as pd
from scipy.stats import shapiro
import scipy.stats as stats

In [31]:
def AB_test(dataframe, group, target):

    #splitting groups
    groupA=dataframe[dataframe[group]==1][target] #holiday
    groupB=dataframe[dataframe[group]==0][target] #not holiday

    #checking distributin of groups using shapiro 
    #H0: distribution is normal
    # pA=shapiro(groupA)[1]
    # pB=shapiro(groupB)[1]

    # if (pA >=0.05) & (pB>=0.05):
        #both are normally distributed so we use parametric test

    #checking homogeneity of variances using levene test
    #H0: there is homogeneity of variances
    leveneTest_p = stats.levene(groupA, groupB)[1]

    if leveneTest_p<0.05:
        #heterogeneous

        #t test
        #H0: M1 == M2
        p=stats.ttest_ind(groupA, groupB, equal_var=False)[1]
    else:
        #homogeneity

        #ttest
        #H0:M1==M2
        p=stats.ttest_ind(groupA, groupB, equal_var=True)[1]
    # else:
    #     #non-parametric test

    #     #Mann-Whitney U test
    #     # H0: M1 == M2
    #     p=stats.mannwhitneyu(groupA, groupB)[1]

    group = [group]
    p = [p]
    # pA = [pA]
    # pB = [pB]

    AB = pd.DataFrame({
    "Feature": group,
    "p-value": p,
    #"Test": np.where((np.array(pA) == False) & (np.array(pB) == False), "t-Test (p)", "Mann-Whitney U (nonp)"),
    "Hypothesis": np.where(np.array(p) >= 0.05, "Fail to Reject H0", "Reject H0"),
    "Comment": np.where(np.array(p) >= 0.05, "A/B groups are similar", "A/B groups are not similar"),
    "GroupA_mean": np.mean(groupA),
    "GroupB_mean": np.mean(groupB),
    "GroupA_median": np.median(groupA),
    "GroupB_median": np.median(groupB)
    })
    return AB

In [3]:
df=pd.read_csv(r'novi_datasetovi\train_test_v1.csv', parse_dates=['date'])
df=df.drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,work_day,...,N Futbol,N Independencia de Cuenca,N Independencia de Guayaquil,N Navidad,N Primer Grito de Independencia,N Primer dia del ano,N Terremoto Manabi,N Viernes Santo,oil_price,transactions
0,2013-01-01,1,AUTOMOTIVE,0.0,0.0,Quito,Pichincha,D,13,0,...,0,0,0,0,0,1,0,0,93.14,0.0
1,2013-01-01,1,BABY CARE,0.0,0.0,Quito,Pichincha,D,13,0,...,0,0,0,0,0,1,0,0,93.14,0.0
2,2013-01-01,1,BEAUTY,0.0,0.0,Quito,Pichincha,D,13,0,...,0,0,0,0,0,1,0,0,93.14,0.0
3,2013-01-01,1,BEVERAGES,0.0,0.0,Quito,Pichincha,D,13,0,...,0,0,0,0,0,1,0,0,93.14,0.0
4,2013-01-01,1,BOOKS,0.0,0.0,Quito,Pichincha,D,13,0,...,0,0,0,0,0,1,0,0,93.14,0.0


In [4]:
train=df[df['date']<'2017-08-16']
test=df[df['date']>= '2017-08-16']

In [32]:
#A/B Testing

cols = train.columns[train.columns.str.startswith("N ")].tolist() + train.columns[train.columns.str.startswith("R ")].tolist() + train.columns[train.columns.str.startswith("L ")].tolist()+ train.columns[train.columns.str.startswith("work_day")].tolist()
ab = []
for i in cols:
    ab.append(AB_test(dataframe=train, group = i, target = "sales"))
ab = pd.concat(ab)
ab

Unnamed: 0,Feature,p-value,Hypothesis,Comment,GroupA_mean,GroupB_mean,GroupA_median,GroupB_median
0,N Batalla de Pichincha,0.005181154,Reject H0,A/B groups are not similar,391.726745,356.824556,12.0,11.0
0,N Black Friday,0.6689129,Fail to Reject H0,A/B groups are similar,363.360708,356.916486,17.0,11.0
0,N Carnaval,0.002323565,Reject H0,A/B groups are not similar,332.882573,357.071237,10.0,11.0
0,N Cyber Monday,4.249429e-06,Reject H0,A/B groups are not similar,436.22025,356.786766,17.0,11.0
0,N Dia de Difuntos,6.682849e-08,Reject H0,A/B groups are not similar,431.034644,356.707777,16.0,11.0
0,N Dia de la Madre,4.93053e-06,Reject H0,A/B groups are not similar,397.836026,356.684149,13.0,11.0
0,N Dia del Trabajo,4.023208e-14,Reject H0,A/B groups are not similar,481.44618,356.55801,13.0,11.0
0,N Futbol,1.659731e-14,Reject H0,A/B groups are not similar,310.965144,357.312335,5.0,11.0
0,N Independencia de Cuenca,4.611872e-12,Reject H0,A/B groups are not similar,477.446607,356.641672,13.0,11.0
0,N Independencia de Guayaquil,2.608445e-06,Reject H0,A/B groups are not similar,430.35499,356.753528,16.0,11.0


In [42]:
#stores don't work on new years (except store 25,36) and christmass
train[(((train['date'].dt.month == 12) & (train['date'].dt.day == 25))| ((train['date'].dt.month == 1) & (train['date'].dt.day == 1) & (~(train['store_nbr'].isin([25,36]))))) & (train['sales']!=0)]

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,work_day,...,N Futbol,N Independencia de Cuenca,N Independencia de Guayaquil,N Navidad,N Primer Grito de Independencia,N Primer dia del ano,N Terremoto Manabi,N Viernes Santo,oil_price,transactions


In [18]:
#again with local regional and national columns

df2=df.copy()

In [12]:
#PREDUGO

# nat_cols = [col for col in train2.columns if col.startswith('N ')]
# reg_cols = [col for col in train2.columns if col.startswith('R ')]
# loc_cols = [col for col in train2.columns if col.startswith('L ')]

# # ažuriranje kolona za praznike
# def check_for_holidays(row):
#     if row[nat_cols].sum() > 0:
#         row['national_holiday'] = 1
    
#     if row[reg_cols].sum() > 0:
#         for col in reg_cols:
#             if row[col] == 1 and row['state'] in col:
#                 row['regional_holiday'] = 1
#                 break
    
#     if row[loc_cols].sum() > 0:
#         for col in loc_cols:
#             if row[col] == 1 and row['city'] in col:
#                 row['local_holiday'] = 1
#                 break
    
#     return row

# train2 = train2.apply(check_for_holidays, axis=1)

KeyboardInterrupt: 

In [43]:
local_holidays=pd.read_csv(r'local_holidays.csv', parse_dates=['date'])
regional_holidays=pd.read_csv(r'regional_holidays.csv',parse_dates=['date'])
national_holidays=pd.read_csv(r'national_holidays.csv', parse_dates=['date'])

In [44]:
local_holidays['name']=local_holidays['local_holidays'].copy()
local_holidays.drop(columns=['local_holidays'])
local_holidays['local_holidays']=1
local_holidays

Unnamed: 0,date,city,local_holidays,name
0,2012-03-02,Manta,1,L Fundacion de Manta
1,2012-04-12,Cuenca,1,L Fundacion de Cuenca
2,2012-04-14,Libertad,1,L Cantonizacion de Libertad
3,2012-04-21,Riobamba,1,L Cantonizacion de Riobamba
4,2012-05-12,Puyo,1,L Cantonizacion del Puyo
...,...,...,...,...
142,2017-12-08,Loja,1,L Fundacion de Loja
143,2017-12-22,Salinas,1,L Cantonizacion de Salinas
144,2017-04-13,Cuenca,1,L Fundacion de Cuenca
145,2017-09-29,Ibarra,1,L Fundacion de Ibarra


In [20]:
local_holidays['name']=local_holidays['local_holidays'].copy()
local_holidays.drop(columns=['local_holidays'])
local_holidays['local_holidays']=1

regional_holidays.drop(columns=['regional_holidays'])
regional_holidays['regional_holidays']=1

national_holidays.drop(columns=['national_holidays'])
national_holidays['national_holidays']=1

In [21]:
df2=df2.merge(local_holidays, how='left', on=['date', 'city'])
df2['local_holidays']=df2['local_holidays'].fillna(0)

df2=df2.merge(regional_holidays, how='left', on=['date', 'state'])
df2['regional_holidays']=df2['regional_holidays'].fillna(0)

df2=df2.merge(national_holidays, how='left', on=['date'])
df2['national_holidays']=df2['national_holidays'].fillna(0)

df2['national_holidays'] = df2['national_holidays'].astype(int)
df2['regional_holidays'] = df2['regional_holidays'].astype(int)
df2['local_holidays'] = df2['local_holidays'].astype(int)

df2.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,work_day,...,N Navidad,N Primer Grito de Independencia,N Primer dia del ano,N Terremoto Manabi,N Viernes Santo,oil_price,transactions,local_holidays,regional_holidays,national_holidays
0,2013-01-01,1,AUTOMOTIVE,0.0,0.0,Quito,Pichincha,D,13,0,...,0,0,1,0,0,93.14,0.0,0,0,1
1,2013-01-01,1,BABY CARE,0.0,0.0,Quito,Pichincha,D,13,0,...,0,0,1,0,0,93.14,0.0,0,0,1
2,2013-01-01,1,BEAUTY,0.0,0.0,Quito,Pichincha,D,13,0,...,0,0,1,0,0,93.14,0.0,0,0,1
3,2013-01-01,1,BEVERAGES,0.0,0.0,Quito,Pichincha,D,13,0,...,0,0,1,0,0,93.14,0.0,0,0,1
4,2013-01-01,1,BOOKS,0.0,0.0,Quito,Pichincha,D,13,0,...,0,0,1,0,0,93.14,0.0,0,0,1


In [22]:
#df2.to_csv('novi_datasetovi/train_test_v3.csv', index=False)

In [58]:
train2=df2[df2['date']<'2017-08-16']
test2=df2[df2['date']>= '2017-08-16']

In [72]:
local_holidays.head()

Unnamed: 0,date,city,local_holidays,name
0,2012-03-02,Manta,1,L Fundacion de Manta
1,2012-04-12,Cuenca,1,L Fundacion de Cuenca
2,2012-04-14,Libertad,1,L Cantonizacion de Libertad
3,2012-04-21,Riobamba,1,L Cantonizacion de Riobamba
4,2012-05-12,Puyo,1,L Cantonizacion del Puyo


In [87]:
def AB_test_local_holidays(dataframe, group, target):

    city=local_holidays[local_holidays['name']==group].city.values[0]
    #splitting groups
    groupA=dataframe[(dataframe['city']==city)&(dataframe[group]==1)&(dataframe['national_holidays']==0)&(dataframe['regional_holidays']==0)][target] #holiday
    groupB=dataframe[(dataframe['city']==city)&(dataframe[group]==0)&(dataframe['national_holidays']==0)&(dataframe['regional_holidays']==0)][target] #not holiday


    # checking distributin of groups using shapiro 
    # H0: distribution is normal
    # pA=shapiro(groupA)[1]
    # pB=shapiro(groupB)[1]

    # if (pA >=0.05) & (pB>=0.05):
    #     both are normally distributed so we use parametric test

    # checking homogeneity of variances using levene test
    # H0: there is homogeneity of variances
    leveneTest_p = stats.levene(groupA, groupB)[1]

    if leveneTest_p<0.05:
        #heterogeneous

        #t test
        #H0: M1 == M2
        p=stats.ttest_ind(groupA, groupB, equal_var=False)[1]
    else:
        #homogeneity

        #ttest
        #H0:M1==M2
        p=stats.ttest_ind(groupA, groupB, equal_var=True)[1]
    # else:
    #     #non-parametric test

    #     #Mann-Whitney U test
    #     # H0: M1 == M2
    #     p=stats.mannwhitneyu(groupA, groupB)[1]

    group = [group]
    p = [p]
    # pA = [pA]
    # pB = [pB]

    AB = pd.DataFrame({
    "Feature": group,
    "p-value": p,
    #"Test": np.where((np.array(pA) == False) & (np.array(pB) == False), "t-Test (p)", "Mann-Whitney U (nonp)"),
    "Hypothesis": np.where(np.array(p) >= 0.05, "Fail to Reject H0", "Reject H0"),
    "Comment": np.where(np.array(p) >= 0.05, "A/B groups are similar", "A/B groups are not similar"),
    "GroupA_mean": np.mean(groupA),
    "GroupB_mean": np.mean(groupB),
    "GroupA_median": np.median(groupA),
    "GroupB_median": np.median(groupB)
    })
    return AB

In [88]:
cols =train2.columns[train2.columns.str.startswith("L ")].tolist()
ab_loc = []
for i in cols:
    ab_loc.append(AB_test_local_holidays(dataframe=train2, group = i, target = "sales"))
ab_loc = pd.concat(ab_loc)
ab_loc

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Feature,p-value,Hypothesis,Comment,GroupA_mean,GroupB_mean,GroupA_median,GroupB_median
0,L Cantonizacion de Cayambe,0.577101,Fail to Reject H0,A/B groups are similar,445.602558,503.840132,23.0,15.0
0,L Cantonizacion de El Carmen,0.689255,Fail to Reject H0,A/B groups are similar,214.438109,196.498199,9.0,6.0
0,L Cantonizacion de Guaranda,0.2913841,Fail to Reject H0,A/B groups are similar,178.015915,231.550912,6.0,8.0
0,L Cantonizacion de Latacunga,0.9092,Fail to Reject H0,A/B groups are similar,192.49097,188.764269,11.0,6.0
0,L Cantonizacion de Libertad,0.1534507,Fail to Reject H0,A/B groups are similar,180.356318,270.480139,0.0,5.0
0,L Cantonizacion de Quevedo,0.3507754,Fail to Reject H0,A/B groups are similar,200.778636,252.130131,9.0,9.0
0,L Cantonizacion de Riobamba,0.4981113,Fail to Reject H0,A/B groups are similar,171.422417,202.831494,4.0,6.0
0,L Cantonizacion de Salinas,,Reject H0,A/B groups are not similar,,196.546649,,8.0
0,L Cantonizacion del Puyo,0.6445678,Fail to Reject H0,A/B groups are similar,56.501323,70.698262,0.0,0.0
0,L Fundacion de Ambato,0.5918035,Fail to Reject H0,A/B groups are similar,325.854186,357.439475,19.5,21.0


In [99]:
def AB_test_local_holidays_mw(dataframe, group, target):

    city=local_holidays[local_holidays['name']==group].city.values[0]
    #splitting groups
    groupA=dataframe[(dataframe['city']==city)&(dataframe[group]==1)&(dataframe['national_holidays']==0)&(dataframe['regional_holidays']==0)][target] #holiday
    groupB=dataframe[(dataframe['city']==city)&(dataframe[group]==0)&(dataframe['national_holidays']==0)&(dataframe['regional_holidays']==0)][target] #not holiday

    print(group)
    print('groupA ', groupA.shape)
    print('groupB ', groupB.shape)
    

    #checking distributin of groups using shapiro 
    #H0: distribution is normal
    if groupA.empty or groupB.empty:
        AB = pd.DataFrame(columns=["Feature", "p-value", "Test", "Hypothesis", "Comment", 
                                    "GroupA_mean", "GroupB_mean", "GroupA_median", "GroupB_median"])
        return AB
    pA=shapiro(groupA)[1]
    pB=shapiro(groupB)[1]

    if (pA >=0.05) & (pB>=0.05):
        #both are normally distributed so we use parametric test

        #checking homogeneity of variances using levene test
        #H0: there is homogeneity of variances
        leveneTest_p = stats.levene(groupA, groupB)[1]

        if leveneTest_p<0.05:
            #heterogeneous

            #t test
            #H0: M1 == M2
            p=stats.ttest_ind(groupA, groupB, equal_var=False)[1]
        else:
            #homogeneity

            #ttest
            #H0:M1==M2
            p=stats.ttest_ind(groupA, groupB, equal_var=True)[1]
    else:
        #non-parametric test

        #Mann-Whitney U test
        # H0: M1 == M2
        p=stats.mannwhitneyu(groupA, groupB)[1]

    group = [group]
    p = [p]
    pA = [pA]
    pB = [pB]

    AB = pd.DataFrame({
    "Feature": group,
    "p-value": p,
    "Test": np.where((np.array(pA) == False) & (np.array(pB) == False), "t-Test (p)", "Mann-Whitney U (nonp)"),
    "Hypothesis": np.where(np.array(p) >= 0.05, "Fail to Reject H0", "Reject H0"),
    "Comment": np.where(np.array(p) >= 0.05, "A/B groups are similar", "A/B groups are not similar"),
    "GroupA_mean": np.mean(groupA),
    "GroupB_mean": np.mean(groupB),
    "GroupA_median": np.median(groupA),
    "GroupB_median": np.median(groupB)
    })
    return AB

In [100]:
cols =train2.columns[train2.columns.str.startswith("L ")].tolist()
ab_loc2 = []
for i in cols:
    ab_loc2.append(AB_test_local_holidays_mw(dataframe=train2, group = i, target = "sales"))
ab_loc2 = pd.concat(ab_loc2)
ab_loc2

L Cantonizacion de Cayambe
groupA  (165,)
groupB  (51084,)
L Cantonizacion de El Carmen
groupA  (165,)
groupB  (51084,)
L Cantonizacion de Guaranda
groupA  (165,)
groupB  (51084,)
L Cantonizacion de Latacunga
groupA  (264,)
groupB  (101904,)
L Cantonizacion de Libertad
groupA  (132,)
groupB  (51117,)
L Cantonizacion de Quevedo
groupA  (132,)
groupB  (51117,)
L Cantonizacion de Riobamba
groupA  (132,)
groupB  (51117,)
L Cantonizacion de Salinas
groupA  (0,)
groupB  (51117,)
L Cantonizacion del Puyo
groupA  (99,)
groupB  (51150,)




L Fundacion de Ambato
groupA  (264,)
groupB  (102234,)
L Fundacion de Cuenca
groupA  (495,)
groupB  (153252,)
L Fundacion de Esmeraldas
groupA  (165,)
groupB  (51084,)
L Fundacion de Guayaquil
groupA  (2376,)
groupB  (407616,)
L Fundacion de Ibarra
groupA  (132,)
groupB  (50985,)
L Fundacion de Loja
groupA  (132,)
groupB  (51117,)
L Fundacion de Machala
groupA  (264,)
groupB  (102234,)
L Fundacion de Manta
groupA  (330,)
groupB  (102168,)
L Fundacion de Quito
groupA  (4752,)
groupB  (917730,)
L Fundacion de Riobamba
groupA  (165,)
groupB  (51084,)
L Fundacion de Santo Domingo
groupA  (495,)
groupB  (152856,)
L Independencia de Ambato
groupA  (264,)
groupB  (102234,)
L Independencia de Guaranda
groupA  (132,)
groupB  (51117,)
L Independencia de Latacunga
groupA  (264,)
groupB  (101904,)


Unnamed: 0,Feature,p-value,Test,Hypothesis,Comment,GroupA_mean,GroupB_mean,GroupA_median,GroupB_median
0,L Cantonizacion de Cayambe,0.3800485,Mann-Whitney U (nonp),Fail to Reject H0,A/B groups are similar,445.602558,503.840132,23.0,15.0
0,L Cantonizacion de El Carmen,0.7610964,Mann-Whitney U (nonp),Fail to Reject H0,A/B groups are similar,214.438109,196.498199,9.0,6.0
0,L Cantonizacion de Guaranda,0.5002666,Mann-Whitney U (nonp),Fail to Reject H0,A/B groups are similar,178.015915,231.550912,6.0,8.0
0,L Cantonizacion de Latacunga,0.1112805,Mann-Whitney U (nonp),Fail to Reject H0,A/B groups are similar,192.49097,188.764269,11.0,6.0
0,L Cantonizacion de Libertad,0.001059439,Mann-Whitney U (nonp),Reject H0,A/B groups are not similar,180.356318,270.480139,0.0,5.0
0,L Cantonizacion de Quevedo,0.5904379,Mann-Whitney U (nonp),Fail to Reject H0,A/B groups are similar,200.778636,252.130131,9.0,9.0
0,L Cantonizacion de Riobamba,0.3201984,Mann-Whitney U (nonp),Fail to Reject H0,A/B groups are similar,171.422417,202.831494,4.0,6.0
0,L Cantonizacion del Puyo,0.2959183,Mann-Whitney U (nonp),Fail to Reject H0,A/B groups are similar,56.501323,70.698262,0.0,0.0
0,L Fundacion de Ambato,0.6078569,Mann-Whitney U (nonp),Fail to Reject H0,A/B groups are similar,325.854186,357.439475,19.5,21.0
0,L Fundacion de Cuenca,0.1776302,Mann-Whitney U (nonp),Fail to Reject H0,A/B groups are similar,263.715962,292.431594,6.0,8.0


In [33]:
pd.set_option('display.max_columns',None)