# We have to determine if the shade of lipstick and the price category are related. 

## Load Packages

In [1]:
import pandas as pd
from scipy import stats

## Load data

In [3]:
# Read the lipstick dataset from the CSV file into a DataFrame,
# then preview its first five rows.
csv_path = "lead_lipstick.csv"
lipstick = pd.read_csv(csv_path)
lipstick.head(5)

Unnamed: 0,JRC_code,purchCntry,prodCntry,Pb,sdPb,shade,prodType,priceCatgry
0,C135,NL,NL,3.75,0.24,Red,LP,2
1,C18,FI,FI,2.29,0.07,Red,LP,2
2,C20,FI,IT,1.27,0.06,Red,LP,2
3,C164,DE,FR,1.21,0.06,Red,LP,2
4,C71,MT,UK,0.85,0.04,Red,LP,2


In [5]:
# Count how many lipsticks there are of each shade.
lipstick["shade"].value_counts()

Pink      81
Brown     60
Red       45
Purple    37
Name: shade, dtype: int64

In [6]:
# We have four levels/types of shades.

In [7]:
# Count how many lipsticks fall into each price category.
lipstick["priceCatgry"].value_counts()

2    135
1     53
3     35
Name: priceCatgry, dtype: int64

In [8]:
# The price category, priceCatgry, has three levels:

# 1: < 5 euros
# 2: 5-15 euros
# 3: > 15 euros

## Test assumption and Run the Analysis

In [9]:
# The assumption you need to check for the independent Chi-Square concerns the contingency table:
# the expected frequency in every cell needs to be at least 5.
# In Python, the easiest way to get the expected frequencies table is to run the analysis itself,
# because stats.chi2_contingency returns that table as part of its output.
# So, you will conduct your independent Chi-Square first, and then make sure it meets this assumption!

## Create a contingency table.

In [10]:
# The first thing that needs to be done, before you can run the independent Chi-Square analysis,
# is to create a contingency table, sometimes called a crosstab, which shows how each level of 
# each variable crosses with the other variable levels. 

# Cross-tabulate shade (rows) against price category (columns); each cell is
# the number of lipsticks with that shade / price category combination.
lipstick_crosstab = pd.crosstab(
    index=lipstick["shade"],
    columns=lipstick["priceCatgry"],
)
lipstick_crosstab

priceCatgry,1,2,3
shade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brown,20,30,10
Pink,20,49,12
Purple,8,23,6
Red,5,33,7


## Running the independent Chi-Square

In [11]:
# Once you have the contingency table, then you can run the function stats.chi2_contingency on the contingency
# table you have created:

# Run the Chi-Square test of independence on the observed counts. The result
# holds the statistic, the p value, the degrees of freedom and the expected counts.
chi2_result = stats.chi2_contingency(observed=lipstick_crosstab)
chi2_result

(7.860569553614045,
 0.2484973879479863,
 6,
 array([[14.26008969, 36.32286996,  9.41704036],
        [19.25112108, 49.03587444, 12.71300448],
        [ 8.79372197, 22.39910314,  5.80717489],
        [10.69506726, 27.24215247,  7.06278027]]))

In [12]:
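# The first value is your Chi-Square statistic, which is 7.86.
# The second value is the p value associated with that Chi-Square statistic, which is .25.
# The third value, 6, is the degrees of freedom: (4 shades - 1) x (3 price categories - 1).
# Because p = .25 is larger than the conventional .05 cutoff, there is no statistically significant
# relationship between lipstick shade and price category.
# In other words, the data give no evidence that any shade tends to be pricier or cheaper than the others.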

## Test the assumption of at least 5 expected cases per cell

In [13]:
# The last piece of the output, labeled array, is your expected count contingency table.
# The expected count is what you would expect to happen if there was no relationship between the two variables.
# Since all of these values are over 5 (the smallest is about 5.81, for Purple in price category 3),
# the assumption has been met, and you are free to present and discuss these results
# without any caveats about small expected counts!