In [1]:
#Import packages
import pandas as pd
#import numpy as np
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
from statsmodels.formula.api import ols
from scipy.stats import pearsonr, normaltest

# Suppress every Python warning raised by code executed after this point.
# NOTE(review): this also hides deprecation/numerical warnings from pandas,
# scipy and statsmodels — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the cleaned chocolate-bar ratings and keep only the columns analysed below
rating_columns = ['bean_country', 'cocoa_percent', 'total_ingredients','ingredient_combination','rating']
df_r = pd.read_csv('cleaned_dark_chocolate_bar_ratings.csv')
df_rating = df_r.loc[:, rating_columns].copy()

# Read the per-characteristic ratings and keep the characteristic/rating pair
characteristic_columns = ['characteristic','rating']
df_c = pd.read_csv('individual_characteristics.csv')
df_char = df_c.loc[:, characteristic_columns].copy()

# Test Statistical Significance
***
## Test for Normality
Before testing for statistical significance in the dataset, I tested for normality in the quantitative data to determine if I need to use parametric or non-parametric tests (Bhandari, 2020).

In [3]:
# D'Agostino-Pearson normality check on the 'rating' column
alpha = 0.05
k2, p = normaltest(df_rating['rating'])
print(f"p = {p:g}")

# Null hypothesis: the sample comes from a normal distribution
print(
    "The null hypothesis for rating can be rejected."
    if p < alpha
    else "The null hypothesis for rating cannot be rejected."
)

p = 1.46487e-14
The null hypothesis for rating can be rejected.


In [4]:
# D'Agostino-Pearson normality check on the 'total_ingredients' column
alpha = 0.05
k2, p = normaltest(df_rating['total_ingredients'])
print(f"p = {p:g}")

# Null hypothesis: the sample comes from a normal distribution
print(
    "The null hypothesis for total_ingredients can be rejected."
    if p < alpha
    else "The null hypothesis for total_ingredients cannot be rejected."
)

p = 9.33989e-29
The null hypothesis for total_ingredients can be rejected.


In [5]:
# D'Agostino-Pearson normality check on the 'cocoa_percent' column
alpha = 0.05
k2, p = normaltest(df_rating['cocoa_percent'])
print(f"p = {p:g}")

# Null hypothesis: the sample comes from a normal distribution
print(
    "The null hypothesis for cocoa_percent can be rejected."
    if p < alpha
    else "The null hypothesis for cocoa_percent cannot be rejected."
)

p = 1.19052e-78
The null hypothesis for cocoa_percent can be rejected.


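Because I can reject the null hypothesis of normality for each quantitative variable, none of them appears to be normally distributed. Non-parametric tests would normally be indicated in that case; the parametric tests used below (ANOVA and Pearson's r) should therefore be interpreted with that caveat in mind.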

## Testing Statistical Significance
To test statistical significance in the dataset, I will be using the ANOVA test for categorical explanatory variables and Pearson's R for quantitative data.

### Statistical Significance for Categorical Data using ANOVA

In [6]:
#ANOVA test for 'characteristic' (the individual memorable characteristics)
# Keep only rows where both the rating and the characteristic are present.
anova_df = df_char[['rating', 'characteristic']].dropna()

# One-way ANOVA expressed as an OLS fit with 'characteristic' as a categorical
# factor; the F-statistic / Prob (F-statistic) in the summary is the ANOVA result.
# Fit on the NaN-filtered frame, matching the bean_country and
# ingredient_combination cells (previously an unfiltered copy was passed here).
anova = smf.ols(formula='rating ~ C(characteristic)', data=anova_df).fit()

anova.summary()

0,1,2,3
Dep. Variable:,rating,R-squared:,0.343
Model:,OLS,Adj. R-squared:,0.258
Method:,Least Squares,F-statistic:,4.047
Date:,"Wed, 07 Apr 2021",Prob (F-statistic):,1.1600000000000001e-195
Time:,11:46:13,Log-Likelihood:,-2367.2
No. Observations:,6398,AIC:,6196.0
Df Residuals:,5667,BIC:,11140.0
Df Model:,730,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.2500,0.372,8.732,0.000,2.520,3.980
C(characteristic)[T.accessible],0.1250,0.416,0.300,0.764,-0.691,0.941
C(characteristic)[T.acidic],-0.0764,0.377,-0.202,0.840,-0.816,0.663
C(characteristic)[T.alcohol],-2.115e-12,0.430,-4.92e-12,1.000,-0.843,0.843
C(characteristic)[T.alkalized note],-0.5000,0.526,-0.950,0.342,-1.532,0.532
C(characteristic)[T.alluring aroma],0.2500,0.526,0.475,0.635,-0.782,1.282
C(characteristic)[T.almond],0.2500,0.456,0.548,0.583,-0.644,1.144
C(characteristic)[T.almond butter],0.2500,0.526,0.475,0.635,-0.782,1.282
C(characteristic)[T.almost burnt],-0.7500,0.526,-1.425,0.154,-1.782,0.282

0,1,2,3
Omnibus:,174.442,Durbin-Watson:,1.505
Prob(Omnibus):,0.0,Jarque-Bera (JB):,233.595
Skew:,-0.315,Prob(JB):,1.89e-51
Kurtosis:,3.693,Cond. No.,2180.0


In [7]:
#ANOVA test for 'bean_country'
# Keep only rows where both the rating and the bean country are present.
anova_df = df_rating[['rating', 'bean_country']].dropna()

# One-way ANOVA expressed as an OLS fit with 'bean_country' as a categorical
# factor; the F-statistic / Prob (F-statistic) in the summary is the ANOVA result.
# (The unused, unfiltered `relate_df` copy was removed.)
anova = smf.ols(formula='rating ~ C(bean_country)', data=anova_df).fit()

anova.summary()

0,1,2,3
Dep. Variable:,rating,R-squared:,0.03
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,1.229
Date:,"Wed, 07 Apr 2021",Prob (F-statistic):,0.122
Time:,11:46:14,Log-Likelihood:,-1287.7
No. Observations:,2274,AIC:,2687.0
Df Residuals:,2218,BIC:,3008.0
Df Model:,55,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.2500,0.249,13.042,0.000,2.761,3.739
C(bean_country)[T.Belize],3.197e-14,0.255,1.26e-13,1.000,-0.500,0.500
C(bean_country)[T.Bolivia],-0.0642,0.254,-0.253,0.801,-0.563,0.434
C(bean_country)[T.Brazil],4.03e-14,0.254,1.59e-13,1.000,-0.498,0.498
C(bean_country)[T.Cameroon],-0.1667,0.352,-0.473,0.636,-0.858,0.524
C(bean_country)[T.Colombia],-0.0373,0.255,-0.146,0.884,-0.537,0.462
C(bean_country)[T.Costa Rica],-0.0952,0.258,-0.369,0.712,-0.601,0.411
C(bean_country)[T.Cuba],0.0417,0.279,0.150,0.881,-0.505,0.588
C(bean_country)[T.Côte d’Ivoire (aka Ivory Coast)],-0.3500,0.315,-1.110,0.267,-0.968,0.268

0,1,2,3
Omnibus:,49.183,Durbin-Watson:,1.292
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52.034
Skew:,-0.355,Prob(JB):,5.02e-12
Kurtosis:,3.211,Cond. No.,213.0


In [8]:
#ANOVA test for 'ingredient_combination'
# Keep only rows where both the rating and the ingredient combination are present.
anova_df = df_rating[['rating', 'ingredient_combination']].dropna()

# One-way ANOVA expressed as an OLS fit with 'ingredient_combination' as a
# categorical factor; the F-statistic / Prob (F-statistic) in the summary is
# the ANOVA result. (The unused, unfiltered `relate_df` copy was removed.)
anova = smf.ols(formula='rating ~ C(ingredient_combination)', data=anova_df).fit()

anova.summary()

0,1,2,3
Dep. Variable:,rating,R-squared:,0.055
Model:,OLS,Adj. R-squared:,0.046
Method:,Least Squares,F-statistic:,6.51
Date:,"Wed, 07 Apr 2021",Prob (F-statistic):,1.38e-17
Time:,11:46:14,Log-Likelihood:,-1257.9
No. Observations:,2274,AIC:,2558.0
Df Residuals:,2253,BIC:,2678.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.9500,0.189,15.606,0.000,2.579,3.321
C(ingredient_combination)[T.BC],0.0500,0.463,0.108,0.914,-0.858,0.958
C(ingredient_combination)[T.BS],0.2703,0.190,1.424,0.154,-0.102,0.642
C(ingredient_combination)[T.BS*],0.0097,0.204,0.048,0.962,-0.390,0.409
C(ingredient_combination)[T.BS*C],-0.0125,0.225,-0.056,0.956,-0.454,0.429
C(ingredient_combination)[T.BS*CL],-0.0750,0.354,-0.212,0.832,-0.769,0.619
C(ingredient_combination)[T.BS*CSa],0.1625,0.211,0.769,0.442,-0.252,0.577
C(ingredient_combination)[T.BS*CV],0.0500,0.248,0.202,0.840,-0.435,0.535
C(ingredient_combination)[T.BS*Sa],-0.4500,0.463,-0.972,0.331,-1.358,0.458

0,1,2,3
Omnibus:,44.165,Durbin-Watson:,1.307
Prob(Omnibus):,0.0,Jarque-Bera (JB):,46.391
Skew:,-0.336,Prob(JB):,8.44e-11
Kurtosis:,3.194,Cond. No.,114.0


The variables memorable_characteristics and ingredient_combination have a p-value less than 0.05. Therefore, we can reject the null and focus on those two variables during the exploratory analysis. The variable bean_country had a p-value greater than 0.05. Therefore, we cannot reject the null and that variable will not be examined during the exploratory analysis. 

### Statistical Significance for Quantitative Data using Pearson's R

In [9]:
# Pearson correlation between 'rating' and 'total_ingredients'
r_total_ingredients = pearsonr(df_rating['rating'], df_rating['total_ingredients'])
print('total_ingredients', r_total_ingredients)

total_ingredients (-0.09840103412745788, 2.585971529232871e-06)


In [10]:
# Pearson correlation between 'rating' and 'cocoa_percent'
r_cocoa_percent = pearsonr(df_rating['rating'], df_rating['cocoa_percent'])
print('cocoa_percent', r_cocoa_percent)

cocoa_percent (-0.0788768460323526, 0.00016644387820901796)


The variables cocoa_percent and total_ingredients have a p-value less than 0.05. Therefore, we can reject the null and focus on those two variables during the exploratory analysis.

# Conclusions

The variables memorable_characteristics (separated into individual characteristics in the 'characteristic' variable of individual_characteristics.csv), 'cocoa_percent', 'total_ingredients', and 'ingredient_combination' have p-values less than 0.05. Therefore, these variables will be analyzed during the exploratory data analysis. The variable 'bean_country' will not be explored during the exploratory data analysis since it had a p-value greater than 0.05.

### Sources
Bhandari, P. (2020, October 23). Understanding normal distributions. Scribbr. Retrieved March 21, 2021, from https://www.scribbr.com/statistics/normal-distribution/