Basics of statistical testing in python

In [26]:
import numpy as np
import pandas as pd

#now lets bring in some new libraries from scipy
from scipy import stats

Hypothesis Testing

When we do hypothesis testing, we actually have two statements of interest: the first is our actual explanation, which we call the alternative hypothesis, and the second is that the explanation we have is not sufficient, and we call this the null hypothesis. Our actual testing method is to determine whether the null hypothesis is true or not. If we find that there is a difference between groups, then we can reject the null hypothesis and we accept our alternative.

In [27]:
#lets see an example of this using some grade data
df = pd.read_csv('grades.csv')
df.head()

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
0,B73F2C11-70F0-E37D-8B10-1D20AFED50B1,92.733946,2015-11-02 06:55:34.282000000,83.030552,2015-11-09 02:22:58.938000000,67.164441,2015-11-12 08:58:33.998000000,53.011553,2015-11-16 01:21:24.663000000,47.710398,2015-11-20 13:24:59.692000000,38.168318,2015-11-22 18:31:15.934000000
1,98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1,86.790821,2015-11-29 14:57:44.429000000,86.290821,2015-12-06 17:41:18.449000000,69.772657,2015-12-10 08:54:55.904000000,55.098125,2015-12-13 17:32:30.941000000,49.588313,2015-12-19 23:26:39.285000000,44.629482,2015-12-21 17:07:24.275000000
2,D0F62040-CEB0-904C-F563-2F8620916C4E,85.512541,2016-01-09 05:36:02.389000000,85.512541,2016-01-09 06:39:44.416000000,68.410033,2016-01-15 20:22:45.882000000,54.728026,2016-01-11 12:41:50.749000000,49.255224,2016-01-11 17:31:12.489000000,44.329701,2016-01-17 16:24:42.765000000
3,FFDF2B2C-F514-EF7F-6538-A6A53518E9DC,86.030665,2016-04-30 06:50:39.801000000,68.824532,2016-04-30 17:20:38.727000000,61.942079,2016-05-12 07:47:16.326000000,49.553663,2016-05-07 16:09:20.485000000,49.553663,2016-05-24 12:51:18.016000000,44.598297,2016-05-26 08:09:12.058000000
4,5ECBEEB6-F1CE-80AE-3164-E45E99473FB4,64.8138,2015-12-13 17:06:10.750000000,51.49104,2015-12-14 12:25:12.056000000,41.932832,2015-12-29 14:25:22.594000000,36.929549,2015-12-28 01:29:55.901000000,33.236594,2015-12-29 14:46:06.628000000,33.236594,2016-01-05 01:06:59.546000000


In [28]:
#There's 6 different assignments inside this dataset, lets look at the summary statistics
print('There are {} rows and {} columns'.format(df.shape[0],df.shape[1]))

There are 2315 rows and 13 columns


In [29]:
early_finishers = df[pd.to_datetime(df['assignment1_submission'])<'2016']
early_finishers

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
0,B73F2C11-70F0-E37D-8B10-1D20AFED50B1,92.733946,2015-11-02 06:55:34.282000000,83.030552,2015-11-09 02:22:58.938000000,67.164441,2015-11-12 08:58:33.998000000,53.011553,2015-11-16 01:21:24.663000000,47.710398,2015-11-20 13:24:59.692000000,38.168318,2015-11-22 18:31:15.934000000
1,98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1,86.790821,2015-11-29 14:57:44.429000000,86.290821,2015-12-06 17:41:18.449000000,69.772657,2015-12-10 08:54:55.904000000,55.098125,2015-12-13 17:32:30.941000000,49.588313,2015-12-19 23:26:39.285000000,44.629482,2015-12-21 17:07:24.275000000
4,5ECBEEB6-F1CE-80AE-3164-E45E99473FB4,64.813800,2015-12-13 17:06:10.750000000,51.491040,2015-12-14 12:25:12.056000000,41.932832,2015-12-29 14:25:22.594000000,36.929549,2015-12-28 01:29:55.901000000,33.236594,2015-12-29 14:46:06.628000000,33.236594,2016-01-05 01:06:59.546000000
5,D09000A0-827B-C0FF-3433-BF8FF286E15B,71.647278,2015-12-28 04:35:32.836000000,64.052550,2016-01-03 21:05:38.392000000,64.752550,2016-01-07 08:55:43.692000000,57.467295,2016-01-11 00:45:28.706000000,57.467295,2016-01-11 00:54:13.579000000,57.467295,2016-01-20 19:54:46.166000000
8,C9D51293-BD58-F113-4167-A7C0BAFCB6E5,66.595568,2015-12-25 02:29:28.415000000,52.916454,2015-12-31 01:42:30.046000000,48.344809,2016-01-05 23:34:02.180000000,47.444809,2016-01-02 07:48:42.517000000,37.955847,2016-01-03 21:27:04.266000000,37.955847,2016-01-19 15:24:31.060000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2308,EFDA9F93-D0C3-864F-B0F6-2E9AA3E05E31,71.481182,2015-10-03 09:04:46.358000000,70.981182,2015-10-06 03:57:28.420000000,64.603064,2015-10-12 07:58:25.081000000,63.703064,2015-10-17 07:59:49.005000000,50.962451,2015-10-18 02:29:34.374000000,45.866206,2015-10-27 00:21:47.208000000
2309,6D2AB78F-44F4-2E8B-5C5E-B79119BC7EAC,82.640274,2015-10-01 23:25:20.529000000,65.752219,2015-10-05 02:06:11.522000000,53.341775,2015-10-22 23:58:36.426000000,47.197598,2015-10-16 12:32:56.809000000,47.197598,2015-10-24 12:16:54.993000000,37.758078,2015-10-26 10:34:41.293000000
2311,DE88902E-C7A7-E37A-CFA7-F2C8F2D219F2,75.367870,2015-11-29 02:43:27.932000000,59.934296,2015-12-03 05:30:39.218000000,48.687437,2015-12-09 15:56:44.895000000,43.008693,2015-12-13 06:18:01.342000000,38.707824,2015-12-20 02:39:39.248000000,38.707824,2015-12-22 13:34:42.931000000
2312,EFDA9F93-D0C3-864F-B0F6-2E9AA3E05E31,73.269463,2015-10-20 08:09:27.418000000,58.255570,2015-11-18 19:07:06.930000000,58.955570,2015-12-10 08:54:54.871000000,52.250013,2015-11-23 19:40:00.434000000,41.800010,2015-11-29 14:23:43.659000000,41.800010,2015-12-04 09:56:07.156000000


In [30]:
late_finishers = df[~df.index.isin(early_finishers.index)]
late_finishers.head()

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
2,D0F62040-CEB0-904C-F563-2F8620916C4E,85.512541,2016-01-09 05:36:02.389000000,85.512541,2016-01-09 06:39:44.416000000,68.410033,2016-01-15 20:22:45.882000000,54.728026,2016-01-11 12:41:50.749000000,49.255224,2016-01-11 17:31:12.489000000,44.329701,2016-01-17 16:24:42.765000000
3,FFDF2B2C-F514-EF7F-6538-A6A53518E9DC,86.030665,2016-04-30 06:50:39.801000000,68.824532,2016-04-30 17:20:38.727000000,61.942079,2016-05-12 07:47:16.326000000,49.553663,2016-05-07 16:09:20.485000000,49.553663,2016-05-24 12:51:18.016000000,44.598297,2016-05-26 08:09:12.058000000
6,3217BE3F-E4B0-C3B6-9F64-462456819CE4,87.498744,2016-03-05 11:05:25.408000000,69.998995,2016-03-09 07:29:52.405000000,55.999196,2016-03-16 22:31:24.316000000,50.399276,2016-03-18 07:19:26.032000000,45.359349,2016-03-19 10:35:41.869000000,45.359349,2016-03-23 14:02:00.987000000
7,F1CB5AA1-B3DE-5460-FAFF-BE951FD38B5F,80.57609,2016-01-24 18:24:25.619000000,72.518481,2016-01-27 13:37:12.943000000,65.266633,2016-01-30 14:34:36.581000000,65.266633,2016-02-03 22:08:49.002000000,65.266633,2016-02-16 14:22:23.664000000,65.266633,2016-02-18 08:35:04.796000000
9,E2C617C2-4654-622C-AB50-1550C4BE42A0,59.270882,2016-03-06 12:06:26.185000000,59.270882,2016-03-13 02:07:25.289000000,53.343794,2016-03-17 07:30:09.241000000,53.343794,2016-03-20 21:45:56.229000000,42.675035,2016-03-27 15:55:04.414000000,38.407532,2016-03-30 20:33:13.554000000


In [31]:
#lets compare the means of both groups
print(early_finishers['assignment1_grade'].mean())
print(late_finishers['assignment1_grade'].mean())

74.94728457024304
74.0450648477065


Students' t test

The Student's t-test is a statistical test used to compare the means of two groups and determine if they are significantly different from each other, especially when sample sizes are small

"Significantly different" means that the difference between the two group means is unlikely to be due to random chance. In other words, it suggests that there is a real effect or relationship, and the observed difference is not just a result of sampling variability. This is determined based on a threshold called the p-value, which helps decide whether to reject the null hypothesis that there is no difference between the groups.

In [32]:
# The SciPy library contains a number of different statistical tests and forms a basis
# for hypothesis testing in Python and we're going to use the ttest_ind() function 
# which does an independent t-test (meaning the populations are not related to one 
# another). The result of ttest_index() are the t-statistic and a p-value.

# lets bring in our ttest_ind function
from scipy.stats import ttest_ind

# lets run this function with our two populations, looking at the assignment 1 grades
ttest_ind(early_finishers['assignment1_grade'], late_finishers['assignment1_grade'])

TtestResult(statistic=np.float64(1.3223540853721598), pvalue=np.float64(0.18618101101713855), df=np.float64(2313.0))

In [33]:
#lets check other assignment grades
print(ttest_ind(early_finishers['assignment2_grade'], late_finishers['assignment2_grade']))
print(ttest_ind(early_finishers['assignment3_grade'], late_finishers['assignment3_grade']))
print(ttest_ind(early_finishers['assignment4_grade'], late_finishers['assignment4_grade']))
print(ttest_ind(early_finishers['assignment5_grade'], late_finishers['assignment5_grade']))
print(ttest_ind(early_finishers['assignment6_grade'], late_finishers['assignment6_grade']))

TtestResult(statistic=np.float64(1.2514717608216366), pvalue=np.float64(0.2108889627004424), df=np.float64(2313.0))
TtestResult(statistic=np.float64(1.6133726558705392), pvalue=np.float64(0.10679998102227865), df=np.float64(2313.0))
TtestResult(statistic=np.float64(0.049671157386456125), pvalue=np.float64(0.960388729789337), df=np.float64(2313.0))
TtestResult(statistic=np.float64(-0.05279315545404755), pvalue=np.float64(0.9579012739746492), df=np.float64(2313.0))
TtestResult(statistic=np.float64(-0.11609743352612056), pvalue=np.float64(0.9075854011989656), df=np.float64(2313.0))


In [34]:
# P-values have come under fire recently for being insuficient for telling us enough about the interactions
# which are happening, and two other techniques, confidence intervalues and bayesian analyses, are being used
# more regularly.
# One issue with p-values is that as you run more tests you are likely to get a value which
# is statistically significant just by chance.

# lets see a simulation of this
# First lets create 100x100 dataframe
df1 = pd.DataFrame([np.random.random(100) for x in range(100)])
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.775109,0.633477,0.441437,0.481435,0.675719,0.076464,0.397375,0.221554,0.998056,0.47433,...,0.701979,0.272315,0.902146,0.044112,0.551059,0.741833,0.263761,0.608259,0.128671,0.766785
1,0.405121,0.339032,0.439529,0.83328,0.006421,0.839938,0.696243,0.590042,0.172419,0.428578,...,0.606766,0.613095,0.126827,0.529556,0.825855,0.931653,0.296262,0.391448,0.304496,0.72694
2,0.637858,0.752517,0.091701,0.501294,0.363896,0.391779,0.381124,0.425643,0.435957,0.523514,...,0.486058,0.536856,0.850203,0.321775,0.168465,0.081545,0.581888,0.883094,0.48005,0.975588
3,0.713502,0.855876,0.714689,0.855255,0.184206,0.811279,0.423033,0.196875,0.52526,0.744232,...,0.979568,0.905559,0.474762,0.645814,0.840198,0.827705,0.467386,0.168189,0.28868,0.811293
4,0.504855,0.344934,0.564368,0.248353,0.902731,0.40838,0.509903,0.96584,0.622861,0.864311,...,0.478399,0.326766,0.737627,0.643552,0.054221,0.548842,0.780747,0.760195,0.606374,0.322648


In [35]:
#lets create a 2nd dataframe
df2 = pd.DataFrame([np.random.random(100) for x in range(100)])
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.084896,0.297927,0.340597,0.255046,0.606908,0.865757,0.512192,0.16981,0.627784,0.3735,...,0.13702,0.619126,0.351757,0.561881,0.493201,0.549331,0.440599,0.951139,0.533006,0.877108
1,0.77029,0.775443,0.4176,0.625997,0.812068,0.053827,0.965241,0.296131,0.535578,0.51373,...,0.758366,0.80645,0.655636,0.643697,0.200293,0.459937,0.060828,0.568101,0.42675,0.570898
2,0.519003,0.543244,0.620169,0.028995,0.956412,0.735701,0.344924,0.784868,0.366263,0.550302,...,0.435064,0.576324,0.970446,0.295492,0.452312,0.690415,0.528608,0.070156,0.27547,0.384968
3,0.730775,0.436289,0.559252,0.337824,0.825272,0.176972,0.583861,0.414706,0.961941,0.445,...,0.970345,0.084673,0.492295,0.509104,0.063849,0.428849,0.570269,0.718104,0.579166,0.712663
4,0.010705,0.466304,0.557849,0.345188,0.655009,0.800981,0.400737,0.718304,0.617587,0.484817,...,0.43937,0.462031,0.566136,0.793412,0.881958,0.748853,0.249542,0.696137,0.731457,0.046506


In [36]:
# Are these two DataFrames the same? Maybe a better question is, for a given row inside
# of df1, is it the same as the row inside df2?

#lets say alpha = 0.1; let us compare each column in df1 with the same numbered column
#in df2

#lets write a function called test_columns
def test_columns(alpha=0.1):
    #I want to keep track of how many differ
    num_diff = 0
    # now lets iterate over the columns
    for col in df1.columns:
        #we can run the ttest between the two dataframes
        teststat,pval = ttest_ind(df1[col],df2[col])
        #and we check the pvalue versus the alpha
        if pval<=alpha:
            #we'll print out if they are different and increment num_diff
            num_diff=num_diff+1
    print('The total number different was {}, which is {}%'.format(num_diff,((num_diff*100)/100)))

test_columns()                

The total number different was 14, which is 14.0%


In [37]:
# Interesting, so we see that there are a bunch of columns that are different! In fact,
# that number looks a lot like the alpha value we chose. So what's going on - shouldn't
# all of the columns be the same? Remember that all the ttest does is check if two sets
# are similar given some level of confidence, in our case, 10%. The more random 
# comparisons you do, the more will just happen to be the same by chance

#test using alpha value of 0.05
test_columns(0.05)

The total number different was 7, which is 7.0%


In [38]:
#Just for fun, lets create a second dataframe using a non-normal distribution, for 
#instance a chi-squared distribution
df2 = pd.DataFrame([np.random.chisquare(df=1,size=100) for x in range(100)])
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.488025,1.249749,2.928849e-01,0.269534,0.246106,0.201256,1.166601,3.831195,0.005872,0.023273,...,0.341287,0.127429,0.035283,1.231417,0.866222,0.310401,0.137301,0.052559,0.035421,0.851289
1,1.959617,0.033628,5.850339e-01,1.792440,0.059743,0.059224,0.025302,1.633268,0.091545,0.393002,...,3.166571,0.019425,0.089961,0.028293,0.030350,0.028142,1.808838,0.560410,0.084259,0.371174
2,0.155820,0.107745,7.709218e-01,7.753496,0.003774,0.931830,0.348502,0.701562,1.610225,0.139565,...,5.165169,1.290203,0.243771,0.388958,0.064002,2.123776,0.879755,0.562299,1.093725,0.249472
3,0.095657,0.478798,3.049536e-02,2.382060,2.171112,0.268492,0.016127,0.568524,0.062863,1.575413,...,2.136097,1.482909,0.280945,0.011458,0.014738,0.026352,1.874834,1.862715,9.466022,1.290552
4,0.011574,0.326115,1.427422e+00,0.246009,1.394251,9.818236,1.300457,0.063362,1.916721,2.830464,...,1.829371,0.009496,0.588587,0.006268,0.372687,0.022100,0.894832,0.414826,0.113453,0.268408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.345198,0.013234,2.651751e-02,1.097299,0.016741,1.978254,0.070728,1.034125,2.090374,4.825810,...,1.198175,2.560270,0.026038,0.434544,0.310779,0.045844,0.073783,4.475636,1.267274,1.125670
96,0.014845,0.347164,1.473697e+00,0.013624,1.701990,0.070283,0.940972,0.032465,3.693800,0.000137,...,0.538217,0.378713,0.003499,0.003936,0.828737,1.060151,1.446909,0.857598,0.246336,0.019115
97,0.027560,0.305767,2.372907e-01,0.014346,0.117078,1.533652,0.126599,1.143061,0.194297,3.218475,...,6.568337,1.154707,2.217242,0.704512,3.229984,2.182266,0.844220,0.000887,0.027548,0.004114
98,0.000869,3.502096,4.880472e-07,0.000032,0.108430,5.490591,0.186462,0.271276,0.102666,0.003758,...,0.802152,0.613486,0.008685,0.488757,0.546883,0.639249,0.082574,4.665786,0.242002,3.159766


In [39]:
test_columns()

The total number different was 99, which is 99.0%


In [40]:
for col in df1.columns:
    ttest,pval = stats.ttest_rel(df1[col],df1[col])
ttest,pval    

(np.float64(nan), np.float64(nan))