## Load in Packages

In [1]:

# In order to run McNemar Chi-Squares in Python, you will need pandas to read in your data,
# and statsmodels to analyze it:

import pandas as pd
import statsmodels as sm
from statsmodels.stats.contingency_tables import mcnemar

## Load in Data


In [4]:
# Read the bakery transaction log (path is relative to the working directory).
# Later cells rely on its Date and Item columns.
bakery_sales = pd.read_csv("bakery_sales.csv")

## Question SetUp

In [5]:
# You will be answering the following question:

# Do the sales of coffee change from the beginning of the month to the end of the month?

## Data Wrangling

### Separating the Pieces of the Date Variable

In [7]:
# Start by breaking the Date column apart. str.split('/') with expand=True
# returns one column per piece, labelled 0, 1, 2; the rename then turns
# those labels into Date1, Date2 and Date3:

bakery_sales1 = (
    bakery_sales['Date']
    .str.split('/', expand=True)
    .rename(columns=lambda position: f"Date{position + 1}")
)

In [11]:
# Now attach the three new columns to the original data. Concatenating along
# the columns lines the two frames up row by row on their shared index:

bakery_sales2 = pd.concat(objs=[bakery_sales, bakery_sales1], axis="columns")

In [12]:
# Display the combined frame to confirm Date1, Date2 and Date3 were added.
bakery_sales2

Unnamed: 0,Date,Time,Transaction,Item,Date1,Date2,Date3
0,10/30/2016,9:58:11 AM,1,Bread,10,30,2016
1,10/30/2016,10:05:34 AM,2,Scandinavian,10,30,2016
2,10/30/2016,10:05:34 AM,2,Scandinavian,10,30,2016
3,10/30/2016,10:07:57 AM,3,Hot chocolate,10,30,2016
4,10/30/2016,10:07:57 AM,3,Jam,10,30,2016
...,...,...,...,...,...,...,...
21288,4/9/2017,2:32:58 PM,9682,Coffee,4,9,2017
21289,4/9/2017,2:32:58 PM,9682,Tea,4,9,2017
21290,4/9/2017,2:57:06 PM,9683,Coffee,4,9,2017
21291,4/9/2017,2:57:06 PM,9683,Pastry,4,9,2017


### Changing Day to an Integer

In [15]:
# Next you'll need to recode Date2 (the day of the month) so that it tells you whether a sale
# happened at the beginning or the end of the month. That comparison only works if Date2 is
# an integer, so check the column dtypes first with the function info():

bakery_sales2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21293 entries, 0 to 21292
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         21293 non-null  object
 1   Time         21293 non-null  object
 2   Transaction  21293 non-null  int64 
 3   Item         21293 non-null  object
 4   Date1        21293 non-null  object
 5   Date2        21293 non-null  object
 6   Date3        21293 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [16]:
# Date2 came back as object (string) data, which is what you get from
# str.split() - it is a "string split", after all.
# Converting it is a one-liner with astype(int):

bakery_sales2['Date2'] = bakery_sales2['Date2'].astype(int)

### Recoding to Beginning or End of Month


In [20]:
def Date_BegEnd(date_2, cutoff=15):
    """Code a day of the month as beginning (0) or end (1) of the month.

    Parameters
    ----------
    date_2 : int
        Day of the month.
    cutoff : int, optional
        Last day that still counts as the beginning of the month.
        Defaults to 15, so days 1-15 are coded 0 and later days 1.

    Returns
    -------
    int or None
        0 when ``date_2 <= cutoff``, 1 when ``date_2 > cutoff``, and None
        when neither comparison holds (for example a NaN value).
    """
    if date_2 <= cutoff:
        return 0
    if date_2 > cutoff:
        return 1
    # Neither comparison is true (e.g. NaN): leave the value uncoded rather
    # than silently assigning it to one half of the month.
    return None
    
# Add DayR by coding every Date2 value with Date_BegEnd
# (0 = beginning of the month, 1 = end of the month).
bakery_sales2['DayR'] = bakery_sales2['Date2'].map(Date_BegEnd)

### Recoding coffee

In [25]:
def CofYesNo(Item_1):
    """Flag whether an item is coffee.

    Returns 1 when Item_1 equals the string 'Coffee' exactly (the match is
    case-sensitive) and 0 for anything else.
    """
    return 1 if Item_1 == 'Coffee' else 0

# Add CoffeeYN by coding every Item with CofYesNo (1 = 'Coffee', 0 = anything else).
bakery_sales2['CoffeeYN'] = bakery_sales2['Item'].map(CofYesNo)

In [30]:
# Clean-up: remove the misspelled 'CofeeYN' column if an earlier run of this
# notebook left one behind. errors='ignore' makes this a no-op (instead of a
# KeyError) on a fresh kernel, where only the correctly spelled 'CoffeeYN'
# column exists.
bakery_sales2 = bakery_sales2.drop(columns='CofeeYN', errors='ignore')

## Make a Contingency Table

In [32]:
# The McNemar function in Python works on a contingency table rather than on raw
# rows, so build one first. pd.crosstab(), which you learned earlier, counts the
# rows for every combination of DayR (table rows) and CoffeeYN (table columns):

bakery_crosstab = pd.crosstab(index=bakery_sales2['DayR'], columns=bakery_sales2['CoffeeYN'])
bakery_crosstab

CoffeeYN,0,1
DayR,Unnamed: 1_level_1,Unnamed: 2_level_1
0,8238,2841
1,7584,2630


In [31]:
# Display the frame again to check the new DayR and CoffeeYN columns.
bakery_sales2

Unnamed: 0,Date,Time,Transaction,Item,Date1,Date2,Date3,DayR,CoffeeYN
0,10/30/2016,9:58:11 AM,1,Bread,10,30,2016,1,0
1,10/30/2016,10:05:34 AM,2,Scandinavian,10,30,2016,1,0
2,10/30/2016,10:05:34 AM,2,Scandinavian,10,30,2016,1,0
3,10/30/2016,10:07:57 AM,3,Hot chocolate,10,30,2016,1,0
4,10/30/2016,10:07:57 AM,3,Jam,10,30,2016,1,0
...,...,...,...,...,...,...,...,...,...
21288,4/9/2017,2:32:58 PM,9682,Coffee,4,9,2017,0,1
21289,4/9/2017,2:32:58 PM,9682,Tea,4,9,2017,0,0
21290,4/9/2017,2:57:06 PM,9683,Coffee,4,9,2017,0,1
21291,4/9/2017,2:57:06 PM,9683,Pastry,4,9,2017,0,0


## Test Assumptions and Run Analyses


In [33]:
# Before running the test, check the assumption of at least five expected cases per cell.
# Python can do this: for example, scipy.stats.contingency.expected_freq(bakery_crosstab)
# returns the expected count for every cell of the table.

# You will use the function sm.stats.contingency_tables.mcnemar() to run your McNemar Chi-Square. 
# It takes the crosstab you just created, plus two arguments: exact=, which you will set to False 
# (use the chi-square approximation rather than the exact binomial test), and correction=, 
# which you will set to True (apply the continuity correction).



In [34]:
# mcnemar was imported directly at the top of the notebook; it is the same
# function as sm.stats.contingency_tables.mcnemar.
result = mcnemar(
    bakery_crosstab,
    exact=False,      # chi-square approximation rather than the exact binomial test
    correction=True,  # apply the continuity correction
)

In [35]:
# Print the test result: the p-value and the chi-square statistic.
print(result)

pvalue      0.0
statistic   2156.984556354916


In [36]:
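# Interpret Results
# Alright! You now have results, and they are significant - the p value is less than .05.
# Remember that the question was about the beginning versus the end of the month (DayR), 
# not about morning versus afternoon.
# Python can also show you where a table departs from what is expected: for example, 
# sm.stats.contingency_tables.Table(bakery_crosstab).standardized_resids returns the standardized residuals.
#
# NOTE: interpret this result with caution. A McNemar test is designed for paired data and compares only 
# the two off-diagonal cells of the table (2,841 vs. 7,584 here). In this crosstab those cells are 
# "beginning of month, coffee" and "end of month, no coffee", so the significant result does not by itself 
# show that coffee sales changed. The share of items that are coffee is nearly identical in the two halves 
# of the month: 2,841 / 11,079 = 25.6% at the beginning and 2,630 / 10,214 = 25.7% at the end.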

