# Hypothesis testing

In [10]:
import numpy as np 
import pandas as pd
from scipy import stats

In hypothesis testing, we have two statements of interest: the first is our actual explanation, which we call the alternative hypothesis, and the second is that the explanation we have is not sufficient, and we call this the null hypothesis. Our actual testing method is to ask whether the data are consistent with the null hypothesis. If we find a difference between groups that is too large to be plausibly explained by chance, then we reject the null hypothesis and accept our alternative.

In [11]:
# Read the grades table from the relative path 'datasets/grades.csv' and preview its first rows.
df = pd.read_csv('datasets/grades.csv')
df.head()

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
0,B73F2C11-70F0-E37D-8B10-1D20AFED50B1,92.733946,2015-11-02 06:55:34.282000000,83.030552,2015-11-09 02:22:58.938000000,67.164441,2015-11-12 08:58:33.998000000,53.011553,2015-11-16 01:21:24.663000000,47.710398,2015-11-20 13:24:59.692000000,38.168318,2015-11-22 18:31:15.934000000
1,98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1,86.790821,2015-11-29 14:57:44.429000000,86.290821,2015-12-06 17:41:18.449000000,69.772657,2015-12-10 08:54:55.904000000,55.098125,2015-12-13 17:32:30.941000000,49.588313,2015-12-19 23:26:39.285000000,44.629482,2015-12-21 17:07:24.275000000
2,D0F62040-CEB0-904C-F563-2F8620916C4E,85.512541,2016-01-09 05:36:02.389000000,85.512541,2016-01-09 06:39:44.416000000,68.410033,2016-01-15 20:22:45.882000000,54.728026,2016-01-11 12:41:50.749000000,49.255224,2016-01-11 17:31:12.489000000,44.329701,2016-01-17 16:24:42.765000000
3,FFDF2B2C-F514-EF7F-6538-A6A53518E9DC,86.030665,2016-04-30 06:50:39.801000000,68.824532,2016-04-30 17:20:38.727000000,61.942079,2016-05-12 07:47:16.326000000,49.553663,2016-05-07 16:09:20.485000000,49.553663,2016-05-24 12:51:18.016000000,44.598297,2016-05-26 08:09:12.058000000
4,5ECBEEB6-F1CE-80AE-3164-E45E99473FB4,64.8138,2015-12-13 17:06:10.750000000,51.49104,2015-12-14 12:25:12.056000000,41.932832,2015-12-29 14:25:22.594000000,36.929549,2015-12-28 01:29:55.901000000,33.236594,2015-12-29 14:46:06.628000000,33.236594,2016-01-05 01:06:59.546000000


In [12]:
# Report the table's dimensions: df.shape[0] is the row count, df.shape[1] the column count.
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

There are 2315 rows and 13 columns.


In [13]:
# "Early" finishers: rows whose first-assignment submission time, parsed to a
# datetime, falls before 2016.
early_finishers = df.loc[
    pd.to_datetime(df['assignment1_submission']) < '2016'
]
early_finishers.head()

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
0,B73F2C11-70F0-E37D-8B10-1D20AFED50B1,92.733946,2015-11-02 06:55:34.282000000,83.030552,2015-11-09 02:22:58.938000000,67.164441,2015-11-12 08:58:33.998000000,53.011553,2015-11-16 01:21:24.663000000,47.710398,2015-11-20 13:24:59.692000000,38.168318,2015-11-22 18:31:15.934000000
1,98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1,86.790821,2015-11-29 14:57:44.429000000,86.290821,2015-12-06 17:41:18.449000000,69.772657,2015-12-10 08:54:55.904000000,55.098125,2015-12-13 17:32:30.941000000,49.588313,2015-12-19 23:26:39.285000000,44.629482,2015-12-21 17:07:24.275000000
4,5ECBEEB6-F1CE-80AE-3164-E45E99473FB4,64.8138,2015-12-13 17:06:10.750000000,51.49104,2015-12-14 12:25:12.056000000,41.932832,2015-12-29 14:25:22.594000000,36.929549,2015-12-28 01:29:55.901000000,33.236594,2015-12-29 14:46:06.628000000,33.236594,2016-01-05 01:06:59.546000000
5,D09000A0-827B-C0FF-3433-BF8FF286E15B,71.647278,2015-12-28 04:35:32.836000000,64.05255,2016-01-03 21:05:38.392000000,64.75255,2016-01-07 08:55:43.692000000,57.467295,2016-01-11 00:45:28.706000000,57.467295,2016-01-11 00:54:13.579000000,57.467295,2016-01-20 19:54:46.166000000
8,C9D51293-BD58-F113-4167-A7C0BAFCB6E5,66.595568,2015-12-25 02:29:28.415000000,52.916454,2015-12-31 01:42:30.046000000,48.344809,2016-01-05 23:34:02.180000000,47.444809,2016-01-02 07:48:42.517000000,37.955847,2016-01-03 21:27:04.266000000,37.955847,2016-01-19 15:24:31.060000000


In [14]:
# "Late" finishers: every row of df whose index label is not among the early finishers.
late_finishers = df.drop(index=early_finishers.index)
late_finishers.head()

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
2,D0F62040-CEB0-904C-F563-2F8620916C4E,85.512541,2016-01-09 05:36:02.389000000,85.512541,2016-01-09 06:39:44.416000000,68.410033,2016-01-15 20:22:45.882000000,54.728026,2016-01-11 12:41:50.749000000,49.255224,2016-01-11 17:31:12.489000000,44.329701,2016-01-17 16:24:42.765000000
3,FFDF2B2C-F514-EF7F-6538-A6A53518E9DC,86.030665,2016-04-30 06:50:39.801000000,68.824532,2016-04-30 17:20:38.727000000,61.942079,2016-05-12 07:47:16.326000000,49.553663,2016-05-07 16:09:20.485000000,49.553663,2016-05-24 12:51:18.016000000,44.598297,2016-05-26 08:09:12.058000000
6,3217BE3F-E4B0-C3B6-9F64-462456819CE4,87.498744,2016-03-05 11:05:25.408000000,69.998995,2016-03-09 07:29:52.405000000,55.999196,2016-03-16 22:31:24.316000000,50.399276,2016-03-18 07:19:26.032000000,45.359349,2016-03-19 10:35:41.869000000,45.359349,2016-03-23 14:02:00.987000000
7,F1CB5AA1-B3DE-5460-FAFF-BE951FD38B5F,80.57609,2016-01-24 18:24:25.619000000,72.518481,2016-01-27 13:37:12.943000000,65.266633,2016-01-30 14:34:36.581000000,65.266633,2016-02-03 22:08:49.002000000,65.266633,2016-02-16 14:22:23.664000000,65.266633,2016-02-18 08:35:04.796000000
9,E2C617C2-4654-622C-AB50-1550C4BE42A0,59.270882,2016-03-06 12:06:26.185000000,59.270882,2016-03-13 02:07:25.289000000,53.343794,2016-03-17 07:30:09.241000000,53.343794,2016-03-20 21:45:56.229000000,42.675035,2016-03-27 15:55:04.414000000,38.407532,2016-03-30 20:33:13.554000000


In [15]:
# Mean first-assignment grade for each group, early finishers first, one value per line.
print(
    early_finishers['assignment1_grade'].mean(),
    late_finishers['assignment1_grade'].mean(),
    sep='\n',
)

74.94728457024303
74.0450648477065


The means look pretty similar. But, are they the same? What do we mean by similar? This is where the Student's t-test comes in. It allows us to form the alternative hypothesis ("These are different") as well as the null hypothesis ("These are the same") and then test that null hypothesis.

When doing hypothesis testing, we have to choose a significance level as a threshold for how much of a chance we're willing to accept. This significance level is typically called alpha. For this example, let's use a threshold of 0.05.

In [16]:
from scipy.stats import ttest_ind

# Independent two-sample t-test on first-assignment grades: early vs. late finishers.
# No optional arguments are passed, so scipy's defaults apply (per the scipy docs:
# equal variances assumed, two-sided alternative).
ttest_ind(early_finishers['assignment1_grade'], late_finishers['assignment1_grade'])

Ttest_indResult(statistic=1.322354085372139, pvalue=0.1861810110171455)

Here we can see that the p-value is about 0.19, and this is above our alpha value of 0.05. This means we cannot reject the null hypothesis. The null hypothesis was that the two populations are the same, and we don't have enough certainty in our evidence (because the p-value is greater than alpha) to come to a conclusion to the contrary. This doesn't mean that we have proven the populations are the same.

Let's check the other assignment grades...

In [17]:
# Run the same early-vs-late t-test for assignments 2 through 6 and print each
# result on its own line. A loop replaces five copy-pasted calls that differed
# only in the assignment number.
for assignment_number in range(2, 7):
    grade_col = f'assignment{assignment_number}_grade'
    print(ttest_ind(early_finishers[grade_col], late_finishers[grade_col]))

Ttest_indResult(statistic=1.2514717608216366, pvalue=0.2108889627004424)
Ttest_indResult(statistic=1.6133726558705392, pvalue=0.10679998102227865)
Ttest_indResult(statistic=0.049671157386456125, pvalue=0.960388729789337)
Ttest_indResult(statistic=-0.05279315545404755, pvalue=0.9579012739746492)
Ttest_indResult(statistic=-0.11609743352612056, pvalue=0.9075854011989656)


So it really looks like in this data we don't have enough evidence to suggest the populations differ with respect to grade. Let's take a look at those p-values for a moment though, because they are saying things that can inform experimental design down the road. For instance, one of the assignments, assignment 3, has a p-value around 0.1. This means that if we accepted a level of chance similarity of 11% this would have been considered statistically significant. As a researcher, this would suggest to me that there is something here worth considering following up on. For instance, if we had a small number of participants (we don't) or if there was something unique about this assignment as it relates to our experiment (whatever it was) then there may be follow-up experiments we could run.

P-values have come under fire recently for being insufficient for telling us enough about the interactions which are happening, and two other techniques, confidence intervals and Bayesian analyses, are being used more regularly. One issue with p-values is that as you run more tests you are likely to get a value which is statistically significant just by chance.

In [27]:
# 100 x 100 frame of uniform random draws from [0, 1), generated in a single call.
# No seed is set, so the values change on every run.
df1 = pd.DataFrame(np.random.random((100, 100)))
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.567735,0.058548,0.278407,0.287481,0.455623,0.067368,0.913498,0.037095,0.13051,0.591639,...,0.333511,0.992016,0.599891,0.434306,0.015861,0.114943,0.78297,0.455538,0.82099,0.504029
1,0.483883,0.313553,0.273534,0.918986,0.463574,0.267547,0.23612,0.912908,0.110963,0.755558,...,0.053164,0.031721,0.55251,0.15884,0.670102,0.606612,0.116886,0.318697,0.567423,0.730159
2,0.079767,0.898636,0.747162,0.494806,0.178454,0.14294,0.417732,0.287079,0.219665,0.365946,...,0.824843,0.525684,0.873278,0.383773,0.089465,0.568227,0.389272,0.033191,0.895813,0.325061
3,0.523099,0.764312,0.607967,0.741515,0.880428,0.09077,0.826527,0.549065,0.259881,0.314154,...,0.422591,0.372754,0.360373,0.297087,0.545871,0.130012,0.226823,0.882542,0.104414,0.002658
4,0.213853,0.37036,0.935839,0.271704,0.941386,0.748506,0.436346,0.916858,0.832426,0.497802,...,0.093588,0.35498,0.828998,0.319809,0.237287,0.288314,0.028503,0.111392,0.873789,0.140319


In [28]:
# Second, independently drawn 100 x 100 frame of uniform random values from [0, 1).
df2 = pd.DataFrame(np.random.random((100, 100)))

Your question will be: "Are these DataFrames the same? For a given column inside df1, is it the same as the column inside df2?" Let's use a critical value of 0.1 (alpha equals 10%) and we're going to compare each column in df1 to the same numbered column in df2. And we'll report when the p-value is at most 10%, which means that we have sufficient evidence to say that the columns are different.

In [37]:
def test_columns(alpha=0.1):
    num_dif = 0
    for col in df1.columns:
        teststat, pval = ttest_ind(df1[col], df2[col])
        if pval <= alpha:
            print(f'Col {col} is statistically significantly different at alpha={alpha}')
            num_dif += 1
            
    print(f'Total number different was {num_dif}, which is {float(num_dif)/len(df1.columns)*100}%.')

    
# Run the column-by-column comparison with the default threshold (alpha=0.1).
test_columns()

Col 0 is statistically significantly different at alpha=0.1
Col 1 is statistically significantly different at alpha=0.1
Col 2 is statistically significantly different at alpha=0.1
Col 3 is statistically significantly different at alpha=0.1
Col 4 is statistically significantly different at alpha=0.1
Col 5 is statistically significantly different at alpha=0.1
Col 6 is statistically significantly different at alpha=0.1
Col 7 is statistically significantly different at alpha=0.1
Col 8 is statistically significantly different at alpha=0.1
Col 9 is statistically significantly different at alpha=0.1
Col 10 is statistically significantly different at alpha=0.1
Col 11 is statistically significantly different at alpha=0.1
Col 12 is statistically significantly different at alpha=0.1
Col 13 is statistically significantly different at alpha=0.1
Col 14 is statistically significantly different at alpha=0.1
Col 15 is statistically significantly different at alpha=0.1
Col 16 is statistically significan

In [30]:
# Repeat the comparison with a stricter threshold of alpha = 0.05.
test_columns(0.05)

Col 26 is statistically significantly different at alpha=0.05
Col 46 is statistically significantly different at alpha=0.05
Col 70 is statistically significantly different at alpha=0.05
Col 95 is statistically significantly different at alpha=0.05
Total nmber different was 4, which is 4.0%.


In [31]:
# Repeat the comparison with an even stricter threshold of alpha = 0.02.
test_columns(0.02)

Col 70 is statistically significantly different at alpha=0.02
Col 95 is statistically significantly different at alpha=0.02
Total nmber different was 2, which is 2.0%.


In [36]:
# Rebind df2 to a 100 x 100 frame of chi-squared draws (1 degree of freedom), so the
# two frames no longer come from the same distribution, then re-run the comparison
# at the default alpha.
df2 = pd.DataFrame(np.random.chisquare(df=1, size=(100, 100)))
test_columns()

Col 0 is statistically significantly different at alpha=0.1
Col 1 is statistically significantly different at alpha=0.1
Col 2 is statistically significantly different at alpha=0.1
Col 3 is statistically significantly different at alpha=0.1
Col 4 is statistically significantly different at alpha=0.1
Col 5 is statistically significantly different at alpha=0.1
Col 6 is statistically significantly different at alpha=0.1
Col 7 is statistically significantly different at alpha=0.1
Col 8 is statistically significantly different at alpha=0.1
Col 9 is statistically significantly different at alpha=0.1
Col 10 is statistically significantly different at alpha=0.1
Col 11 is statistically significantly different at alpha=0.1
Col 12 is statistically significantly different at alpha=0.1
Col 13 is statistically significantly different at alpha=0.1
Col 14 is statistically significantly different at alpha=0.1
Col 15 is statistically significantly different at alpha=0.1
Col 16 is statistically significan