In [1]:
import numpy as np
import scipy
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt

# Lab 6
## Hypothesis testing

### Task 1: Wald and permutation tests

In 1861, 10 essays appeared in the New Orleans Daily Crescent. They
were signed “Quintus Curtius Snodgrass” and some people suspected
they were actually written by Mark Twain. To investigate this, we will
consider the proportion of three letter words found in an author’s work.
From eight Twain essays we have:

In [2]:
X = np.array([.225, .262, .217, .240, .230, .229, .235, .217])

From 10 Snodgrass essays we have:

In [3]:
Y = np.array([.209, .205, .196, .210, .202, .207, .224, .223, .220, .201])

1. Perform a Wald test for equality of the means. Report the $p$-value and a 95% confidence
interval for the difference of means. What do you conclude?

2. Now use a permutation test to avoid the use of large sample methods.
What is your conclusion?

In [9]:
# Wald's criterion
X_mean = np.mean(X)
Y_mean = np.mean(Y)

X_var = np.var(X, ddof=1)
Y_var = np.var(Y, ddof=1)

se = np.sqrt(X_var/len(X) + Y_var/len(Y))
W = (X_mean - Y_mean) / se

# two-side p-value
p = 2 * scipy.stats.norm.cdf(-abs(W))
print(f'Wald\'s criterion: W: {W}, p-value: {p}')

# permutation test
XY = np.concatenate([X, Y])

per_list = [X_mean - Y_mean]
for i in range(len(XY)):
    for j in range(i + 1, len(XY)):
        XY_ = XY.copy()
        XY_[i], XY_[j] = XY_[j], XY_[i]
        XY_1 = XY_[:len(X)] # X
        XY_2 = XY_[len(X):]
        XY_1_mean = np.mean(XY_1)
        XY_2_mean = np.mean(XY_2)
        per_list.append(XY_1_mean - XY_2_mean) # Minghao to copliot: you are amazing!

per_list = np.array(per_list) 
n_per = len(per_list)
p_value = np.sum(per_list > per_list[0]) / n_per

print(f'Permutation test p-value: {p_value}')

# conclusion


Wald's criterion: W: 3.7035535443338206, p-value: 0.00021260028225810121
Permutation test p-value: 0.03896103896103896


Conclusion: 
1. W > 1.96, which means that H0 been rejected.
2. 0.01 < p-value < 0.05, H0 not true.

### Task 2: Multiple comparisons

A randomized, double-blind experiment was conducted to assess the
effectiveness of several drugs for reducing postoperative nausea. The
data are as follows:

In [11]:
df1 = pd.DataFrame({'Drug': ['Placebo', 'Chlorpromazine', 'Dimenhydrinate', 'Pentobarbital (100 mg)', 'Pentobarbital (150 mg)'],
                    'Number of Patients': [80, 75, 85, 67, 85],
                    'Incidence of Nausea': [45, 26, 52, 35, 33]})
df1

Unnamed: 0,Drug,Number of Patients,Incidence of Nausea
0,Placebo,80,45
1,Chlorpromazine,75,26
2,Dimenhydrinate,85,52
3,Pentobarbital (100 mg),67,35
4,Pentobarbital (150 mg),85,33


1. Test each drug versus the placebo at the 5 per cent level. Also, report the estimated odds–ratios. Summarize your findings. 
2. Use the Bonferroni method for multiple testing.
3. Compare the results of testing with and without correction for multiple testing.

*Hint*. Use simple $H_0$: "$p = p_0$".

In [14]:
from scipy.stats import chi2_contingency

def perform_chi_square_test(data_frame):

    for index in range(1, len(data_frame)):
        treatment_name = data_frame.iloc[index]['Drug']
        control_group = [data_frame.iloc[0]['Number of Patients'] - data_frame.iloc[0]['Incidence of Nausea'],
                         data_frame.iloc[0]['Incidence of Nausea']]
        treatment_group = [data_frame.iloc[index]['Number of Patients'] - data_frame.iloc[index]['Incidence of Nausea'],
                           data_frame.iloc[index]['Incidence of Nausea']]
        
        contingency_matrix = [control_group, treatment_group]
        stat_chi2, p_value, _, _ = chi2_contingency(contingency_matrix)
        odds_ratio_estimate = treatment_group[1] / treatment_group[0]
        
        print(f"--- {treatment_name}")
        print(f"Chi-square statistic: {stat_chi2}")
        print(f"p-value: {p_value}")
        print(f"odds-ratio: {odds_ratio_estimate}\n")

# Example usage
perform_chi_square_test(df1)


--- Chlorpromazine
Chi-square statistic: 6.42067596551531
p-value: 0.011279921768145679
odds-ratio: 0.5306122448979592

--- Dimenhydrinate
Chi-square statistic: 0.23454012347055284
p-value: 0.6281776400799568
odds-ratio: 1.5757575757575757

--- Pentobarbital (100 mg)
Chi-square statistic: 0.10244732784306113
p-value: 0.7489122792634635
odds-ratio: 1.09375

--- Pentobarbital (150 mg)
Chi-square statistic: 4.346286715946323
p-value: 0.03708973088034181
odds-ratio: 0.6346153846153846

