In [1]:
import pandas as pd
import numpy as np
import pathlib

### Import 1 of 2 Datasets

In [2]:
# Load the shopping-behaviour CSV that sits next to this notebook.
behavior = pathlib.Path("shopping_behavior_updated.csv")
cons_habs = pd.read_csv(behavior)

# Encode the Yes/No flag columns as 1/0, then cast back to object dtype.
# NOTE(review): the object cast presumably keeps these flags on the categoric
# side of column auto-detection — confirm against CompareColumns.
mymap = {'Yes': 1, 'No': 0}
for col in ('Subscription Status', 'Discount Applied'):
    cons_habs[col] = cons_habs[col].map(mymap).astype('object')
def freq_factor(x):
    """Translate a purchase-frequency label into a per-purchase day count.

    Returns the day count paired with the first label equal to ``x``, or
    ``None`` when no label matches (unknown labels, missing values, ...).
    """
    # (label, days) pairs, checked in order.
    # NOTE(review): 'Bi-Weekly' is 3.5 days (twice a week) while 'Fortnightly'
    # is 14 days — confirm that this distinction is intended for the dataset.
    label_to_days = (
        ('Every 3 Months', 365/4),
        ('Annually', 365/1),
        ('Quarterly', 365/4),
        ('Monthly', 365/12),
        ('Bi-Weekly', 7/2),
        ('Fortnightly', 14),
        ('Weekly', 7),
    )
    for label, days in label_to_days:
        if x == label:
            return days
    return None
# Total days of patronage = per-purchase day count (from freq_factor) times the
# number of previous purchases; the int cast drops any fractional day.
cons_habs['Total Days of Patronage'] = (
    cons_habs['Frequency of Purchases']
    .map(freq_factor)
    .mul(cons_habs['Previous Purchases'])
    .astype(int)
)
# The customer identifier is not wanted in the comparisons below.
cons_habs = cons_habs.drop(columns=['Customer ID'])
cons_habs.head()

Unnamed: 0,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases,Total Days of Patronage
0,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,1,Express,1,Yes,14,Venmo,Fortnightly,196
1,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,1,Express,1,Yes,2,Cash,Fortnightly,28
2,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,1,Free Shipping,1,Yes,23,Credit Card,Weekly,161
3,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,1,Next Day Air,1,Yes,49,PayPal,Weekly,343
4,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,1,Free Shipping,1,Yes,31,PayPal,Annually,11315


In [3]:
# Inspect dtypes after preprocessing: the Yes/No flags that were mapped to 1/0
# were cast to object, so they are listed as object rather than int64.
cons_habs.dtypes

Age                          int64
Gender                      object
Item Purchased              object
Category                    object
Purchase Amount (USD)        int64
Location                    object
Size                        object
Color                       object
Season                      object
Review Rating              float64
Subscription Status         object
Shipping Type               object
Discount Applied            object
Promo Code Used             object
Previous Purchases           int64
Payment Method              object
Frequency of Purchases      object
Total Days of Patronage      int64
dtype: object

### Import 2 of 2 Datasets

In [4]:
# Weekly coffee search-interest CSV, stored in a sibling directory.
coff_by_reg = pathlib.Path('../coffee/US_FL_IL_2020_to_2025_Weekly_googtrendcoffeesearch.csv')
coffee = pd.read_csv(coff_by_reg)
# Keep every column except the first one in the file.
# NOTE(review): the first column is presumably the week/date label — confirm.
coffee = coffee.loc[:, coffee.columns[1:]]
coffee.head()

Unnamed: 0,Florida,United States,Illinois
0,53,63,68
1,54,64,66
2,54,62,66
3,51,59,59
4,51,60,60


In [5]:
# Inspect the dtypes of the columns kept from the coffee CSV.
coffee.dtypes

Florida          int64
United States    int64
Illinois         int64
dtype: object

In [6]:
# Pairwise correlation matrix of the coffee columns (DataFrame.corr defaults to
# Pearson); a baseline for the column_comparison() correlation cells further down.
coffee.corr()

Unnamed: 0,Florida,United States,Illinois
Florida,1.0,0.923026,0.917687
United States,0.923026,1.0,0.920715
Illinois,0.917687,0.920715,1.0


# The column_comparison() function for:  
>## Hypothesis Test P-values  
>## Coefficients  

In [7]:
# CompareColumns is not defined in this notebook — presumably a local module next
# to it (confirm). `cc` is the instance whose column_comparison() method is used
# by the cells that follow.
from CompareColumns import CompareColumns
cc=CompareColumns()

### A look at the function  

In [8]:
# Display-only reference for cc.column_comparison(): the two bare strings below
# are never used by the program; they exist so the signature and parameter notes
# are readable in the notebook.
# Function:
"""
cc.column_comparison(dataframe,
                        numnum_meth_alpha_above=('pearson',0.2,True),
                        catnum_meth_alpha_above=('kruskal',0.05,False),
                        catcat_meth_alpha_above=('chi2',0.05,None),
                        numeric_columns=None,
                        categoric_columns=None,
                        numeric_target=None,
                        categoric_target='Size' )
"""
# Parameters:
"""
    dataframe: a pandas dataframe
    numnum_meth_alpha_above, catnum_meth_alpha_above, and catcat_meth_alpha_above take input of:
        None (skip that kind of comparison) or a tuple of (test method, threshold, filter direction).
        If a tuple, values should be (string, float, boolean|None), where the third element is:
            True  -> keep results greater than or equal to the threshold
            False -> keep results strictly below the threshold
            None  -> return all results unfiltered
        For hypothesis tests the threshold is an alpha compared against the P-value;
        for the coefficient methods it is compared against the correlation.
        Examples: ('chi2',0.05,False), ('anova',0.025,None), ('welch',0.01,True).
        where:
            numnum_meth_alpha_above for a numeric-to-numeric comparison. Accepts methods of ('welch','student','pearson','spearman','kendall').
            catnum_meth_alpha_above for a categoric-to-numeric comparison. Accepts methods of ('kruskal','anova').
            catcat_meth_alpha_above for a categoric-to-categoric comparison. Accepts method of ('chi2').
    numeric_columns and categoric_columns accept manual column input. Otherwise columns are autodetected.
    numeric_target and categoric_target accept target columns. If either or both, only combinations involving targets will be considered.
"""
# Ending on print('') keeps the parameter string above from being the cell's
# last expression, so it is not echoed as the cell output.
print('')




# A one-shot approach to examining a dataset  
>### This uses Welch's T-test, ANOVA, and Chi-squared to perform hypothesis tests  

In [9]:
print("Rejected Null for Numeric-to-Numeric, Categoric-to-Categoric, and Categoric-to-Numeric.")
# False as the third tuple element keeps only results strictly below the 0.05
# threshold, i.e. the column pairs for which the null hypothesis is rejected.
cc.column_comparison(
    cons_habs,
    numnum_meth_alpha_above=('welch', 0.05, False),
    catnum_meth_alpha_above=('anova', 0.05, False),
    catcat_meth_alpha_above=('chi2', 0.05, False),
    numeric_columns=None,
    categoric_columns=None,
    numeric_target=None,
    categoric_target=None,
)

Rejected Null for Numeric-to-Numeric, Categoric-to-Categoric, and Categoric-to-Numeric.


Unnamed: 0,column_a,column_b,test,P-value
0,Purchase Amount (USD),Season,anova,0.01057616
1,Review Rating,Shipping Type,anova,0.03836537
2,Review Rating,Color,anova,0.01126284
3,Review Rating,Size,anova,0.0431546
4,Previous Purchases,Gender,anova,0.02507493
5,Previous Purchases,Item Purchased,anova,0.04231105
6,Total Days of Patronage,Frequency of Purchases,anova,0.0
0,Category,Item Purchased,chi2,0.0
1,Discount Applied,Promo Code Used,chi2,0.0
2,Discount Applied,Gender,chi2,2.101032e-186


# A look at using numeric and categoric targets to limit both computation and output  
### Note: these use Kruskal-Wallis instead of ANOVA because Kruskal-Wallis is more suitable for this dataset.  

In [10]:
print("A look at the categorical 'Size' column and the numerical 'Review Rating' column.\nResults are not filtered based on p-values")
# None as the third tuple element returns every result unfiltered; the two
# targets limit the comparisons to pairs involving 'Review Rating' or 'Size'.
cc.column_comparison(
    cons_habs,
    numnum_meth_alpha_above=('welch', 0.05, None),
    catnum_meth_alpha_above=('kruskal', 0.05, None),
    catcat_meth_alpha_above=('chi2', 0.05, None),
    numeric_columns=None,
    categoric_columns=None,
    numeric_target='Review Rating',
    categoric_target='Size',
)

A look at the categorical 'Size' column and the numerical 'Review Rating' column.
Results are not filtered based on p-values


Unnamed: 0,column_a,column_b,test,P-value
0,Purchase Amount (USD),Size,kruskal,0.1847614
1,Age,Size,kruskal,0.512324
2,Previous Purchases,Size,kruskal,0.626309
3,Review Rating,Season,kruskal,0.1684497
4,Review Rating,Category,kruskal,0.1542907
5,Review Rating,Discount Applied,kruskal,0.4437409
6,Review Rating,Promo Code Used,kruskal,0.4437409
7,Review Rating,Gender,kruskal,0.6051729
8,Review Rating,Subscription Status,kruskal,0.7013635
9,Review Rating,Item Purchased,kruskal,0.6898478


# A look at options for categoric to numeric comparisons  
>### options are 'Kruskal Wallis' or 'ANOVA': 'kruskal', 'anova'  

In [11]:
# Run the same categoric-to-numeric comparison once per method; only the method
# name and the printed heading differ. None leaves the results unfiltered.
for heading, method in (
    ("Kruskal Wallis Comparisons.", 'kruskal'),
    ("ANOVA Comparisons", 'anova'),
):
    print(heading)
    display(
        cc.column_comparison(
            cons_habs,
            numnum_meth_alpha_above=None,
            catnum_meth_alpha_above=(method, 0.05, None),
            catcat_meth_alpha_above=None,
            numeric_columns=['Review Rating', 'Previous Purchases'],
            categoric_columns='Color',
            numeric_target=None,
            categoric_target='Color',
        )
    )

Kruskal Wallis Comparisons.


Unnamed: 0,column_a,column_b,test,P-value
0,Review Rating,Color,kruskal,0.011759
1,Previous Purchases,Color,kruskal,0.13087


ANOVA Comparisons


Unnamed: 0,column_a,column_b,test,P-value
0,Review Rating,Color,anova,0.011263
1,Previous Purchases,Color,anova,0.126958


## A look at numeric to numeric comparisons using:  
>### Coefficients: 'pearson', 'spearman', 'kendall'  
>### T-tests: 'welch' (Welch's), 'student' (Student's)  

In [12]:
print("Pearson Correlation >= 0.92")
# True as the third tuple element keeps correlations at or above 0.92.
cc.column_comparison(
    coffee,
    numnum_meth_alpha_above=('pearson', 0.92, True),
    catnum_meth_alpha_above=None,
    catcat_meth_alpha_above=None,
    numeric_columns=None,
    categoric_columns=None,
    numeric_target=None,
    categoric_target=None,
)

Pearson Correlation >= 0.92


Unnamed: 0,column_a,column_b,test,Correlation
0,Florida,United States,pearson,0.923026
1,United States,Illinois,pearson,0.920715


In [13]:
print("Spearman Correlation < 0.902")
# False as the third tuple element keeps correlations strictly below 0.902.
cc.column_comparison(
    coffee,
    numnum_meth_alpha_above=('spearman', 0.902, False),
    catnum_meth_alpha_above=None,
    catcat_meth_alpha_above=None,
    numeric_columns=None,
    categoric_columns=None,
    numeric_target=None,
    categoric_target=None,
)

Spearman Correlation < 0.902


Unnamed: 0,column_a,column_b,test,Correlation
0,United States,Illinois,spearman,0.900883


In [14]:
# Kendall rank correlation for every pair of coffee columns.
print("Kendall's Rank: unfiltered")
cc.column_comparison(coffee,
                        numnum_meth_alpha_above=('kendall',0.902,None),  #  <-- None for unfiltered
                        catnum_meth_alpha_above=None,
                        catcat_meth_alpha_above=None,
                        numeric_columns=None,
                        categoric_columns=None,
                        numeric_target=None,
                        categoric_target=None )

Kenall's Rank: unfiltered


Unnamed: 0,column_a,column_b,test,Correlation
0,Florida,United States,kendall,0.769343
1,Florida,Illinois,kendall,0.757979
2,United States,Illinois,kendall,0.763141


In [15]:
print("Welch's T-test: reject null")
# False as the third tuple element keeps P-values strictly below 0.05.
cc.column_comparison(
    coffee,
    numnum_meth_alpha_above=('welch', 0.05, False),
    catnum_meth_alpha_above=None,
    catcat_meth_alpha_above=None,
    numeric_columns=None,
    categoric_columns=None,
    numeric_target=None,
    categoric_target=None,
)

Welch's T-test: reject null


Unnamed: 0,column_a,column_b,test,P-value
0,Florida,United States,welch,4.917673e-32
1,Florida,Illinois,welch,5.306237000000001e-32
2,United States,Florida,welch,4.917673e-32
3,Illinois,Florida,welch,5.306237000000001e-32


In [16]:
print("Student's T-test: fail to reject null")
# True as the third tuple element keeps P-values at or above 0.05.
cc.column_comparison(
    coffee,
    numnum_meth_alpha_above=('student', 0.05, True),
    catnum_meth_alpha_above=None,
    catcat_meth_alpha_above=None,
    numeric_columns=None,
    categoric_columns=None,
    numeric_target=None,
    categoric_target=None,
)

Student's T-test: fail to reject null


Unnamed: 0,column_a,column_b,test,P-value
0,United States,Illinois,student,0.715309
1,Illinois,United States,student,0.715309
