In [1]:
from scipy.stats import chi2_contingency
import pandas as pd
import numpy as np

In [2]:
# Observed response counts: one row per response category, one column per
# condition ("Alt" / "Neut" -- presumably alternation / neutralization).
df = pd.DataFrame.from_dict(
    {
        "Palatalization": [6, 6],
        "Prefix Change": [36, 8],
        "Sound Change": [7, 16],
        "Two Options": [13, 14],
        "No Mention": [13, 27],
    },
    orient="index",
    columns=["Alt", "Neut"],
)

In [3]:
# Confirm the column labels of the table built above.
df.columns

Index(['Alt', 'Neut'], dtype='object')

In [4]:
# Confirm the row labels of the table built above.
df.index

Index(['Palatalization', 'Prefix Change', 'Sound Change', 'Two Options',
       'No Mention'],
      dtype='object')

In [5]:
# Chi-square test of independence on the observed table. The displayed result
# is (statistic, p-value, degrees of freedom, expected frequencies).
chi2_contingency(df)

(26.18702518414051,
 2.9012087617689084e-05,
 4,
 array([[ 6.16438356,  5.83561644],
        [22.60273973, 21.39726027],
        [11.81506849, 11.18493151],
        [13.86986301, 13.13013699],
        [20.54794521, 19.45205479]]))

In [6]:
# Expected cell counts (element 3 of the chi2_contingency result), shown with
# the same row/column labels as the observed table. The labels are taken from
# `df` itself so they cannot drift out of sync with the data.
df2 = chi2_contingency(df)[3]
pd.DataFrame(df2, index=df.index, columns=df.columns).round(2)

Unnamed: 0,Alt,Neut
Palatalization,6.16,5.84
Prefix Change,22.6,21.4
Sound Change,11.82,11.18
Two Options,13.87,13.13
No Mention,20.55,19.45


In [7]:
# Run the test once and unpack the first three fields, instead of re-running
# the whole test for every value that is printed.
chisquare, pvalue, dof = chi2_contingency(df)[:3]
print("chisquare is: ", chisquare)
print("pvalue is: ", pvalue)
print("degree of freedom is :", dof)


chisquare is:  26.18702518414051
pvalue is:  2.9012087617689084e-05
degree of freedom is : 4


In [8]:
from scipy.stats import chi2

# Critical value of the chi-square distribution: the quantile below which a
# fraction `p` (= 1 - significance) of the distribution lies, for the degrees
# of freedom reported by the test on `df`.
dof = chi2_contingency(df)[2]
significance = 0.01
p = 1 - significance
critical_value = chi2(dof).ppf(p)
print("critical value is: ", critical_value)

critical value is:  13.276704135987622


## Treating as ordinal (Added by Jamie)

In [9]:
## Build a data frame with one ordinal score per response:
## 5 = most detailed information about the rule, 1 = no information.
## Each mapping below is {score: number of responses with that score}.
## NOTE(review): the neutralization count for score 3 is 20 here, but the
## contingency table built earlier lists 16 in what appears to be the matching
## cell ("Sound Change" / "Neut") -- confirm which figure is correct.

alternation = [
    score
    for score, count in {5: 6, 4: 36, 3: 7, 2: 13, 1: 13}.items()
    for _ in range(count)
]
neutralization = [
    score
    for score, count in {5: 6, 4: 8, 3: 20, 2: 14, 1: 27}.items()
    for _ in range(count)
]

df = pd.DataFrame({'alternation': alternation, 'neutralization': neutralization})

In [10]:
# Display the frame for a visual check of the coding.
df

Unnamed: 0,alternation,neutralization
0,5,5
1,5,5
2,5,5
3,5,5
4,5,5
5,5,5
6,4,4
7,4,4
8,4,4
9,4,4


In [11]:
## Get some basic information about the spread of each column.
## NOTE(review): the values are ordinal codes, so the quartiles are probably
## more meaningful than the mean/std reported here.
df.describe()

Unnamed: 0,alternation,neutralization
count,75.0,75.0
mean,3.12,2.36
std,1.294061,1.290715
min,1.0,1.0
25%,2.0,1.0
50%,4.0,2.0
75%,4.0,3.0
max,5.0,5.0


In [12]:
## Median score per column (one column per condition); this is probably the
## summary statistic that should be reported.
df.median()

alternation       4.0
neutralization    2.0
dtype: float64

In [14]:
from scipy.stats import mannwhitneyu

In [15]:
## Mann-Whitney U test (which allows you to treat the data as an ordinal scale),
## run two-sided on the alternation scores vs. the neutralization scores.

mannwhitneyu(df['alternation'],df['neutralization'],alternative='two-sided')

MannwhitneyuResult(statistic=3746.5, pvalue=0.0003038548983868646)

Outcome: A two-sided Mann-Whitney U test (U = 3746.5, p ≈ 0.0003) indicates that participants in the alternation condition (median score = 4) were able to express more detail about the phonological rule they learned than participants in the neutralization condition (median score = 2).