In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import scipy

### T-test: One Sample
- To test if given sample is coming from population with specified mean mu0

In [2]:
df = pd.read_csv('StudentsPerformance.csv')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
#CLaim:The average maths score of students is less than 70.

#H0 : The average maths score of students is 70.
#H1 : The average maths score of students is less than 70.

d = df['math score']
t = stats.ttest_1samp(d,70,alternative='less')
print("Test Statistics is {} and P-value is {}".format(t[0],t[1]))
if t[1]<0.05:
    print('Therefore, the null hypothesis is rejected.')
else:
    print('Therefore, the null hypothesis is accepted.')

Test Statistics is -8.156435137590153 and P-value is 5.145632170811557e-16
Therefore, the null hypothesis is rejected.


In [4]:
#CLaim:The average maths score of students is less than 66.

#H0 : The average maths score of students is 66.
#H1 : The average maths score of students is less than 66.

d = df['math score']
d1 = d[np.random.randint(0,1000,30)]
t = stats.ttest_1samp(d[d1],66,alternative='less')
print("Test Statistics is {} and P-value is {}".format(t[0],t[1]))
if t[1]<0.05:
    print('Therefore, the null hypothesis is rejected.')
else:
    print('Therefore, the null hypothesis is accepted.')

Test Statistics is -2.686457043243347 and P-value is 0.005913221930935952
Therefore, the null hypothesis is rejected.


### T-test: Two Sample
- To test if given samples are coming from population with same mean.

In [5]:
df['gender'].value_counts()

female    518
male      482
Name: gender, dtype: int64

In [6]:
#To decide sample size,we will use proportional sample size
f_prop = df['gender'].value_counts()['female']/(df['gender'].value_counts()['male']+df['gender'].value_counts()['female'])
m_prop = df['gender'].value_counts()['male']/(df['gender'].value_counts()['male']+df['gender'].value_counts()['female'])
num_of_female_samples = round(f_prop*60)
num_of_male_samples = round(m_prop*60)
print('No. of female samples: ',num_of_female_samples)
print('No. of male samples: ',num_of_male_samples)
#Math Scores of Females
f = df[df['gender']=='female']['math score']
#Math Scores of Males
m = df[df['gender']=='male']['math score']

No. of female samples:  31
No. of male samples:  29


In [7]:
#CLaim:The average maths score of female students is greater than male students.

#H0 : The average maths score of female students and male students is same.
#H1 : The average maths score of female students is greater than male students.

female_scores = np.random.choice(f,num_of_female_samples)
male_scores = np.random.choice(m,num_of_male_samples)
t = stats.ttest_ind(female_scores,male_scores,alternative='greater')
print("Test Statistics is {} and P-value is {}".format(t[0],t[1]))
if t[1]<0.05:
    print('Therefore, the null hypothesis is rejected.')
else:
    print('Therefore, the null hypothesis is accepted.')

Test Statistics is -1.4291846789822416 and P-value is 0.920842369327858
Therefore, the null hypothesis is accepted.


In [8]:
#CLaim:The average maths score of female students is greater than male students.

#H0 : The average maths score of female students and male students is same.
#H1 : The average maths score of female students is greater than male students.

t = stats.ttest_ind(f,m,alternative='greater')
print("Test Statistics is {} and P-value is {}".format(t[0],t[1]))
if t[1]<0.05:
    print('Therefore, the null hypothesis is rejected.')
else:
    print('Therefore, the null hypothesis is accepted.')

Test Statistics is -5.383245869828983 and P-value is 0.9999999543990722
Therefore, the null hypothesis is accepted.


### T-test: Paired
Each value of one group corresponds directly to a value in the other group, 
before and after values in an experiment. Subtract two values and perform a 
one-sample t-test with null mean set to 0.

In [9]:
# Check if there is significant increase in swimming time after taking health drink .

#H0 : There is no significant increase in swimming time after taking health drink .
#H1 : There is significant increase in swimming time after taking health drink .

B = np.array([302,306,350,342,310,298,285,360,341,360])
A = np.array([322,320,369,380,350,360,380,390,385,390])
df1 = pd.DataFrame({'B':B,'A':A})
t = stats.ttest_rel(df1['A'],df1['B'],alternative='greater')
print("Test Statistics is {} and P-value is {}".format(t[0],t[1]))
if t[1]<0.05:
    print('Therefore, the null hypothesis is rejected.')
else:
    print('Therefore, the null hypothesis is accepted.')

Test Statistics is 5.137575947594257 and P-value is 0.0003066463914488514
Therefore, the null hypothesis is rejected.


In [10]:
A.mean() - B.mean()

39.200000000000045

### Chi-squre test
- For checking goodness of fit
- For testing independence of attributes
A general rule is to have cell frequencies more than 5

In [11]:
df1 = pd.read_csv(r'C:\Users\jaych\Downloads\Refactored_Py_DS_ML_Bootcamp-master\10-Data-Capstone-Projects\911.csv')
df1.head()

Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,e
0,40.297876,-75.581294,REINDEER CT & DEAD END; NEW HANOVER; Station ...,19525.0,EMS: BACK PAINS/INJURY,2015-12-10 17:40:00,NEW HANOVER,REINDEER CT & DEAD END,1
1,40.258061,-75.26468,BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP...,19446.0,EMS: DIABETIC EMERGENCY,2015-12-10 17:40:00,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,1
2,40.121182,-75.351975,HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...,19401.0,Fire: GAS-ODOR/LEAK,2015-12-10 17:40:00,NORRISTOWN,HAWS AVE,1
3,40.116153,-75.343513,AIRY ST & SWEDE ST; NORRISTOWN; Station 308A;...,19401.0,EMS: CARDIAC EMERGENCY,2015-12-10 17:40:01,NORRISTOWN,AIRY ST & SWEDE ST,1
4,40.251492,-75.60335,CHERRYWOOD CT & DEAD END; LOWER POTTSGROVE; S...,,EMS: DIZZINESS,2015-12-10 17:40:01,LOWER POTTSGROVE,CHERRYWOOD CT & DEAD END,1


In [12]:
df1['Reason'] = df1['title'].apply(lambda x:x.split(':')[0])

In [13]:
import datetime
df1['timeStamp'] = pd.to_datetime(df1['timeStamp'])
df1['Month'] = df1['timeStamp'].apply(lambda x:x.month)
df1['Hour'] = df1['timeStamp'].apply(lambda x:x.hour)
df1['Dayofweek'] = df1['timeStamp'].apply(lambda x:x.dayofweek)

In [14]:
dmap = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}
df1['Dayofweek']=df1['Dayofweek'].map(dmap)

In [15]:
dd = pd.crosstab(df1['Reason'],df1['Dayofweek'])
dd

Dayofweek,Fri,Mon,Sat,Sun,Thu,Tue,Wed
Reason,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
EMS,7315,7240,6685,6456,6993,7222,6966
Fire,2088,2204,2068,2076,2136,2121,2227
Traffic,5430,5236,4583,3604,5349,5807,5686


In [16]:
#Test whether the reason for emergency 911 calls is dependent of Day of week or not.

#Ho: The reason for emergency 911 calls is independent of Day of week.
#H1: The reason for emergency 911 calls is dependent of Day of week.

chi2, p, dof, exp = stats.chi2_contingency(dd)
print("Test Statistics =",chi2)
print("P-value =",p)
print("D.F. =", dof)
print("Expected Freq =",exp)
print('*'*80)
print("Test Statistics is {} and P-value is {}".format(chi2,p))
if p<0.05:
    print('Therefore, the null hypothesis is rejected.')
else:
    print('Therefore, the null hypothesis is accepted.')

Test Statistics = 318.6484437821447
P-value = 5.6539911169434376e-61
D.F. = 12
Expected Freq = [[7286.94308085 7211.77943955 6551.51843364 5961.99967837 7112.54378241
  7442.67428537 7309.5412998 ]
 [2224.38346802 2201.4393117  1999.89064447 1819.9364773  2171.1470269
  2271.92136051 2231.28171109]
 [5321.67345113 5266.78124874 4784.59092188 4354.06384433 5194.30919069
  5435.40435412 5338.1769891 ]]
********************************************************************************
Test Statistics is 318.6484437821447 and P-value is 5.6539911169434376e-61
Therefore, the null hypothesis is rejected.


In [17]:
month={1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sep',12:'Dec'}
df1['Month']=df1['Month'].map(month)
dm = pd.crosstab(df1['Reason'],df1['Month'])
dm

Month,Apr,Aug,Dec,Feb,Jan,Jul,Jun,Mar,May
Reason,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
EMS,5680,4508,3898,5413,6063,6029,5720,5832,5734
Fire,1717,1473,1096,1869,1905,1901,1796,1590,1573
Traffic,3929,3097,2975,4185,5237,4207,4270,3679,4116


In [18]:
#Test whether the reason for emergency 911 calls depends on month of the year.

#Ho: The reason for emergency 911 calls is independent of Month.
#H1: The reason for emergency 911 calls is dependent of Month.

chi2, p, dof, exp = stats.chi2_contingency(dm)
print("Test Statistics is {} and P-value is {}".format(chi2,p))
if p<0.05:
    print('Therefore, the null hypothesis is rejected.')
else:
    print('Therefore, the null hypothesis is accepted.')

Test Statistics is 221.65443326681236 and P-value is 3.2099977969575655e-38
Therefore, the null hypothesis is rejected.


### Chi-Square Goodness of Fit

In [19]:
gof = df1['Dayofweek'].value_counts()
gof

Tue    15150
Wed    14879
Fri    14833
Mon    14680
Thu    14478
Sat    13336
Sun    12136
Name: Dayofweek, dtype: int64

In [20]:
#Test whether the proportions of the emergency calls in a day are the same.
#H0: The proportions of the emergency calls in a day are the same.
#H1: The proportions of the emergency calls in a day are not same.

chi2, p = stats.chisquare(gof)
print("Test Statistics is {} and P-value is {}".format(chi2,p))
if p<0.05:
    print('Therefore, the null hypothesis is rejected.')
else:
    print('Therefore, the null hypothesis is accepted.')

Test Statistics is 497.94031680939173 and P-value is 2.3355401185589408e-104
Therefore, the null hypothesis is rejected.


### F-test
- For equality of variances

In [21]:
#Math Scores of Females
f = df[df['gender']=='female']['math score']
#Math Scores of Males
m = df[df['gender']=='male']['math score']

In [22]:
f.mean(),m.mean()

(63.633204633204635, 68.72821576763485)

In [23]:
#Check whether the variability in maths score of female students and male students is same or not.

#H0 : The variability in maths score of female students and male students is same.
#H1 : The variability in maths score of female students is greater than male students.

f,p = stats.f_oneway(f,m)
print("Test Statistics is {} and P-value is {}".format(f,p))
if p<0.05:
    print('Therefore, the null hypothesis is rejected.')
else:
    print('Therefore, the null hypothesis is accepted.')


Test Statistics is 28.979336095030888 and P-value is 9.120185549332254e-08
Therefore, the null hypothesis is rejected.
