In [1]:
import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [21]:
df_train = pd.read_csv('input/train_processed.csv')

In [22]:
df_tmp = df_train.drop(['Unnamed: 0','id','center_id', 'meal_id',
       'city_code', 'region_code', 'city_region',
       'center_type', 'category', 'cuisine',
        'city_code', 'region_code', 'city_code_encoded', 'region_code_encoded',
         'checkout_price','num_orders_log1p','discount'], axis=1)

In [23]:
df_tmp.columns

Index(['week', 'base_price', 'emailer_for_promotion', 'homepage_featured',
       'num_orders', 'op_area', 'rate_of_discount', 'week_sin', 'week_cos',
       'center_id_encoded', 'meal_id_encoded', 'city_region_encoded',
       'center_type_encoded', 'category_encoded', 'cuisine_encoded', 'Quarter',
       'Year'],
      dtype='object')

In [13]:
df_tmp_vif = df_tmp._get_numeric_data()

In [14]:
cat_cols = ['week','meal_id_encoded',
                     'center_id_encoded',
                     'emailer_for_promotion',
                     'homepage_featured',
                     'city_region_encoded',
                     'center_type_encoded',
                     'category_encoded',
                     'cuisine_encoded',
                    'Quarter',
                    'Year'
                    ]

## __VIF__

In [15]:
numerical_columns = [col for col in df_tmp_vif.columns if col not in cat_cols]
df_tmp_vif = df_tmp_vif[numerical_columns]
df_tmp_vif

Unnamed: 0,base_price,num_orders,op_area,rate_of_discount,week_sin,week_cos
0,152.29,177,2.0,0.101517,0.120208,0.992749
1,152.29,323,2.0,0.108083,0.238672,0.971100
2,133.92,96,2.0,0.007467,0.353675,0.935368
3,134.86,163,2.0,-0.007415,0.463549,0.886071
4,147.50,215,2.0,0.006780,0.566700,0.823924
...,...,...,...,...,...,...
456541,630.53,13,4.5,0.075333,-0.958705,-0.284404
456542,582.03,42,4.5,0.001718,-0.985940,-0.167098
456543,581.03,40,4.5,-0.003442,-0.998877,-0.047369
456544,581.03,53,4.5,-0.001721,-0.997328,0.073048


In [16]:
df_tmp_vif.loc[df_tmp_vif.rate_of_discount == 0, 'rate_of_discount'] = 0.000001
#df_tmp.loc[df_tmp.discount == 0, 'discount'] = 0.000001

In [17]:
y = df_tmp_vif['num_orders']
X = df_tmp_vif.drop('num_orders', axis=1)

In [18]:
variance_inflation_factor(X.values, 2)

1.2568806024687715

In [19]:
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns

In [20]:
vif

Unnamed: 0,VIF Factor,features
0,4.590092,base_price
1,4.537343,op_area
2,1.256881,rate_of_discount
3,1.005448,week_sin
4,1.011712,week_cos


## __ChiSq and Cramer's V test__

In [31]:
df_tmp_chi = df_tmp[cat_cols]
df_tmp_chi.head()

Unnamed: 0,week,meal_id_encoded,center_id_encoded,emailer_for_promotion,homepage_featured,city_region_encoded,center_type_encoded,category_encoded,cuisine_encoded,Quarter,Year
0,1,22,23,0,0,30,2,0,3,Q1,Y1
1,2,22,23,0,0,30,2,0,3,Q1,Y1
2,3,22,23,0,0,30,2,0,3,Q1,Y1
3,4,22,23,0,0,30,2,0,3,Q1,Y1
4,5,22,23,0,0,30,2,0,3,Q1,Y1


In [35]:
import researchpy as rp
import scipy.stats as stats

In [38]:
crosstab = pd.crosstab(df_tmp_chi['category_encoded'],df_tmp_chi['cuisine_encoded'])

In [39]:
stats.chi2_contingency(crosstab)

(984756.3995761818,
 0.0,
 39,
 array([[28795.46102693, 31544.88901447, 34434.37955869, 33115.27039992],
        [ 4641.40772233,  5084.57535495,  5550.3190259 ,  5337.69789682],
        [ 6595.77946143,  7225.55304394,  7887.40882627,  7585.25866835],
        [ 3053.59326333,  3345.15431085,  3651.56818809,  3511.68423773],
        [ 2293.68489703,  2512.68890758,  2742.84951571,  2637.77667968],
        [ 6614.91789436,  7246.51883928,  7910.29507432,  7607.26819203],
        [ 6235.5266063 ,  6830.90277869,  7456.60886307,  7170.96175194],
        [ 7461.28694589,  8173.70030621,  8922.40573787,  8580.60701003],
        [ 7521.62929913,  8239.80422564,  8994.5647317 ,  8650.00174353],
        [ 6430.28830611,  7044.26057834,  7689.51009318,  7394.94102237],
        [ 7495.73612517,  8211.43873783,  8963.60098435,  8620.22415266],
        [ 6060.35365549,  6639.00408721,  7247.13238096,  6969.50987633],
        [ 2853.87808676,  3126.37007005,  3412.74345849,  3282.0083847 ],
       

In [40]:
table, results = rp.crosstab(df_tmp_chi['category_encoded'],df_tmp_chi['cuisine_encoded'], prop ='col', test = 'chi-square')

In [42]:
results

Unnamed: 0,Chi-square test,results
0,Pearson Chi-square ( 39.0) =,984756.3996
1,p-value =,0.0
2,Cramer's V =,0.8479


In [47]:
for col_i in df_tmp_chi.columns:
    df_tmp_ = df_tmp_chi.drop([col_i], axis=1)
    #list_of_cols = cat_cols.remove(col_i)
    for col_j in df_tmp_.columns:
        table, results = rp.crosstab(df_tmp_chi[col_i],
                                     df_tmp_chi[col_j], 
                                     prop ='col', test = 'chi-square')
        print('Results for '+col_i+' vs '+col_j+' : ')
        print(results)

Results for week vs meal_id_encoded : 
                   Chi-square test     results
0  Pearson Chi-square ( 7200.0) =   13793.5614
1                       p-value =       0.0000
2                    Cramer's V =       0.0246
Results for week vs center_id_encoded : 
                    Chi-square test    results
0  Pearson Chi-square ( 10944.0) =   1834.8490
1                        p-value =      1.0000
2                     Cramer's V =      0.0073
Results for week vs emailer_for_promotion : 
                  Chi-square test     results
0  Pearson Chi-square ( 144.0) =   13559.4209
1                      p-value =       0.0000
2                   Cramer's V =       0.1723
Results for week vs homepage_featured : 
                  Chi-square test    results
0  Pearson Chi-square ( 144.0) =   4180.4930
1                      p-value =      0.0000
2                   Cramer's V =      0.0957
Results for week vs city_region_encoded : 
                   Chi-square test   results
0  Pea

Results for emailer_for_promotion vs category_encoded : 
                 Chi-square test     results
0  Pearson Chi-square ( 13.0) =   31444.5013
1                     p-value =       0.0000
2                  Cramer's V =       0.2624
Results for emailer_for_promotion vs cuisine_encoded : 
                Chi-square test     results
0  Pearson Chi-square ( 3.0) =   15370.1465
1                    p-value =       0.0000
2                 Cramer's V =       0.1835
Results for emailer_for_promotion vs Quarter : 
                Chi-square test   results
0  Pearson Chi-square ( 3.0) =   360.6461
1                    p-value =     0.0000
2                 Cramer's V =     0.0281
Results for emailer_for_promotion vs Year : 
                Chi-square test  results
0  Pearson Chi-square ( 2.0) =   13.0686
1                    p-value =    0.0015
2                 Cramer's V =    0.0054
Results for homepage_featured vs week : 
                  Chi-square test    results
0  Pearson Chi-squar

Results for category_encoded vs emailer_for_promotion : 
                 Chi-square test     results
0  Pearson Chi-square ( 13.0) =   31444.5013
1                     p-value =       0.0000
2                  Cramer's V =       0.2624
Results for category_encoded vs homepage_featured : 
                 Chi-square test     results
0  Pearson Chi-square ( 13.0) =   12232.9433
1                     p-value =       0.0000
2                  Cramer's V =       0.1637
Results for category_encoded vs city_region_encoded : 
                  Chi-square test     results
0  Pearson Chi-square ( 650.0) =   34121.8618
1                      p-value =       0.0000
2                   Cramer's V =       0.0758
Results for category_encoded vs center_type_encoded : 
                 Chi-square test    results
0  Pearson Chi-square ( 26.0) =   3363.9994
1                     p-value =      0.0000
2                  Cramer's V =      0.0607
Results for category_encoded vs cuisine_encoded : 
         