# T-Test

## One sample t-test

In [64]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np

In [65]:
df_2015 = pd.read_csv('./data_pd/marathon_results_2015.csv')
df_2016 = pd.read_csv('./data_pd/marathon_results_2016.csv')
df_2017 = pd.read_csv('./data_pd/marathon_results_2017.csv')

In [66]:

df = pd.concat([df_2015,df_2016,df_2017], ignore_index=True)
df.head(5)

Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,30K,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division,Unnamed: 8
0,0.0,3,"Desisa, Lelisa",25,M,Ambo,,ETH,,,...,1:32:00,1:47:59,2:02:39,0:04:56,-,2:09:17,1,1,1,
1,1.0,4,"Tsegay, Yemane Adhane",30,M,Addis Ababa,,ETH,,,...,1:31:59,1:47:59,2:02:42,0:04:58,-,2:09:48,2,2,2,
2,2.0,8,"Chebet, Wilson",29,M,Marakwet,,KEN,,,...,1:32:00,1:47:59,2:03:01,0:04:59,-,2:10:22,3,3,3,
3,3.0,11,"Kipyego, Bernard",28,M,Eldoret,,KEN,,,...,1:32:00,1:48:03,2:03:47,0:05:00,-,2:10:47,4,4,4,
4,4.0,10,"Korir, Wesley",32,M,Kitale,,KEN,,,...,1:32:00,1:47:59,2:03:27,0:05:00,-,2:10:49,5,5,5,


In [67]:
df['Age'].mean()

42.41516612672342

In [68]:
np.random.seed(4)
age_data = df['Age'].sample(n=30, replace=False)
len(age_data)

30

In [69]:
pop_avg_age = 40    #Estimated population mean age

In [70]:
# Perform One Sample t-test
t_statistics, p_value = stats.ttest_1samp(age_data, pop_avg_age)

In [71]:
# Degree of freedom
degree_of_freedom = len(age_data)-1


In [72]:
# Print the results
print("T-statistics: ", t_statistics)
print("P-value: ", p_value)
print("Degree of freedom: ", degree_of_freedom)

T-statistics:  2.2649036558388898
P-value:  0.031166863165459135
Degree of freedom:  29


In [73]:
# Interpretation 
alpha = 0.025
if p_value < alpha:
    print("The null hypothesis is rejected")
else:
    print("The null hypothesis cannot be rejected")

critical_value = stats.t.ppf(1-alpha, degree_of_freedom)
print("Critical value: ", critical_value)

The null hypothesis cannot be rejected
Critical value:  2.045229642132703


In [74]:
# Two -Tail t-Test (BEcause we are just checking if sample mean = population mean)
# So we need to take absolute value of t-statistics
if (abs(t_statistics) < critical_value):
    print("The  null hypothesis (mean age = 40) cannot be rejected")
else:
    print("The  null hypothesis (mean age = 40) is rejected")
    

The  null hypothesis (mean age = 40) is rejected


## Two sampled t-test

In [75]:
tips_df = pd.read_csv("./data_pd/tips.csv")
tips_df.head()

Unnamed: 0,total_bill,tip,gender,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560330000000000.0,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478070000000000.0,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011810000000000.0,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676140000000000.0,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832730000000000.0,Sun2251


In [76]:
male = tips_df[tips_df['gender'] == 'Male']['tip']
female = tips_df[tips_df['gender'] == 'Female']['tip']


In [77]:
np.random.seed(4)
male_tips = male.sample(n=30, replace=False)
female_tips = female.sample(n=30, replace=False)


In [78]:
print(male_tips.mean())
print(female_tips.mean())

2.867
2.747333333333333


In [79]:
# Perform two-sample t-test
t_statistics, p_value = stats.ttest_ind(male_tips, female_tips)

In [80]:
# Degree of freedom
n_male = len(male_tips)
n_female = len(female_tips)
degree_of_freedom = n_male + n_female - 2
alpha = 0.025

In [81]:
critical_value = stats.t.ppf(1 - alpha, degree_of_freedom)
print("T-statistics: ", t_statistics)
print("p-value: ", p_value)
print("Degree of freedom: ", degree_of_freedom)
print("Critical value", critical_value)

T-statistics:  0.37323171332401517
p-value:  0.7103366684196261
Degree of freedom:  58
Critical value 2.0017174841452356


In [82]:
# Interpretation 
alpha = 0.025
if p_value < alpha:
    print("There is a significant difference.")
else:
    print("There is no significant difference.")

There is no significant difference.


In [83]:

if (abs(t_statistics) > critical_value):
    print("There is a significant difference.")
else:
    print("There is no significant difference.")

There is no significant difference.


    - Two sampled t-test for combined marathon datasets, determine if there is a significant difference between the average pace of atheletes of age <=30 and atheletes with age > 30

In [84]:
df_2015 = pd.read_csv('./data_pd/marathon_results_2015.csv')
df_2016 = pd.read_csv('./data_pd/marathon_results_2016.csv')
df_2017 = pd.read_csv('./data_pd/marathon_results_2017.csv')
df = pd.concat([df_2015,df_2016,df_2017], ignore_index=True)


In [85]:
df[['Hours_pace','Minutes_pace','Seconds_pace']]  = df['Pace'].str.split(':', expand=True)
df['Pace_duration'] = df['Hours_pace'].astype(int)*3600 + df['Minutes_pace'].astype(int)*60 + df['Seconds_pace'].astype(int)
df = df.drop(['Hours_pace','Minutes_pace','Seconds_pace'], axis=1)
df.head(5)

Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division,Unnamed: 8,Pace_duration
0,0.0,3,"Desisa, Lelisa",25,M,Ambo,,ETH,,,...,1:47:59,2:02:39,0:04:56,-,2:09:17,1,1,1,,296
1,1.0,4,"Tsegay, Yemane Adhane",30,M,Addis Ababa,,ETH,,,...,1:47:59,2:02:42,0:04:58,-,2:09:48,2,2,2,,298
2,2.0,8,"Chebet, Wilson",29,M,Marakwet,,KEN,,,...,1:47:59,2:03:01,0:04:59,-,2:10:22,3,3,3,,299
3,3.0,11,"Kipyego, Bernard",28,M,Eldoret,,KEN,,,...,1:48:03,2:03:47,0:05:00,-,2:10:47,4,4,4,,300
4,4.0,10,"Korir, Wesley",32,M,Kitale,,KEN,,,...,1:47:59,2:03:27,0:05:00,-,2:10:49,5,5,5,,300


In [86]:
age_data_1 = df[df['Age'] <= 30]['Pace_duration']
age_data_2 = df[df['Age'] > 30]['Pace_duration']

In [87]:
np.random.seed(4)
age_more_than_30 = age_data_2.sample(n=30, replace=False)
age_less_than_30 = age_data_1.sample(n=30, replace=False)


In [88]:
print(age_less_than_30.mean())
print(age_more_than_30.mean())

513.3333333333334
550.2


In [89]:
# Perform two-sample t-test
t_statistics, p_value = stats.ttest_ind(age_more_than_30, age_less_than_30)

In [90]:
# Degree of freedom
n_male_gt_30 = len(age_more_than_30)
n_male_lt_30 = len(age_less_than_30)
degree_of_freedom = n_male_gt_30 + n_male_lt_30 - 2
alpha = 0.025

In [91]:
critical_value = stats.t.ppf(1 - alpha, degree_of_freedom)
print("T-statistics: ", t_statistics)
print("p-value: ", p_value)
print("Degree of freedom: ", degree_of_freedom)
print("Critical value", critical_value)

T-statistics:  1.4263534563789626
p-value:  0.15912712455503233
Degree of freedom:  58
Critical value 2.0017174841452356


In [92]:
# Interpretation 
alpha = 0.025
if p_value < alpha:
    print("There is a significant difference.")
else:
    print("There is no significant difference.")
if (abs(t_statistics) > critical_value):
    print("There is a significant difference.")
else:
    print("There is no significant difference.")

There is no significant difference.
There is no significant difference.


In [93]:
# another condition
young_atheletes = df[(df['Age'] <= 30) & (df['M/F'] == 'M')]['Pace_duration']
older_atheletes = df[(df['Age'] > 30) & (df['M/F'] == 'F')]['Pace_duration']

In [94]:
np.random.seed(4)
young_male_athletes = young_atheletes.sample(n=30, replace=False)
older_female_athletes = older_atheletes.sample(n=30, replace=False)

In [95]:
print(young_male_athletes.mean())
print(older_female_athletes.mean())

471.6666666666667
563.1333333333333


In [96]:
# Perform two-sample t-test
t_statistics, p_value = stats.ttest_ind(young_male_athletes, older_female_athletes)

In [97]:
## Incomplete code....

## Paired T-test

In [98]:
from scipy.stats import ttest_rel
from math import sqrt
from scipy.stats import t

In [99]:
df = pd.read_csv("./data_pd/cust_seg.csv")
df.head()

Unnamed: 0,custid,sex,AqChannel,region,Marital_status,segment,pre_usage,Post_usage_1month,Latest_mon_usage,post_usage_2ndmonth
0,70,0,4,1,1,1,57,52,49.2,57.2
1,121,1,4,2,1,3,68,59,63.6,64.9
2,86,0,4,3,1,1,44,33,64.8,36.3
3,141,0,4,3,1,3,63,44,56.4,48.4
4,172,0,4,2,1,2,47,52,68.4,57.2


In [100]:
# Select relevant columns (assuming pre_usage and post_usage_1month are numeric)
usage_data = df[['pre_usage','Post_usage_1month']]


In [101]:
# perform paired t-test
tstat, pval = ttest_rel(usage_data['pre_usage'], usage_data['Post_usage_1month'])

In [102]:
# Degree of freedom (assuming equal sample sizes)
df = len(usage_data['pre_usage']) - 1

In [103]:
# critical value (alpha = 0.05, two - tailed test)
alpha = 0.05
critical_value = t.ppf(1-alpha/2, df)       # One tailed critical value for each tail

In [104]:
# Print result
print(f"\nT-statistics: {tstat:.2f}, p-value : {pval:.4f}")
print(f"Critical value (alpha = {alpha}, two-tailed): +/- {critical_value:.2f}")


T-statistics: -0.87, p-value : 0.3868
Critical value (alpha = 0.05, two-tailed): +/- 1.97


In [105]:
# interpretation
if pval < alpha:
    print("Reject null hypothesis: There's a significant difference in usage before and after the campaign (p<0.05).")
else:
    print("Fail to reject null hypothesis : The difference in usage before and after the campaign might be due to chance (p >=0.05).")

Fail to reject null hypothesis : The difference in usage before and after the campaign might be due to chance (p >=0.05).


In [106]:
if abs(tstat) > critical_value:
    print("The difference is statistically significant at the 5% level (|t| > critical value).")
else:
    print("The difference is statistically significant (|t| <= critical value).")

The difference is statistically significant (|t| <= critical value).


    - Run a paired t-test on the Hypothermia dataset between t. 1 and t. 2 columns (Hypothermia: condition that occurs when core body temp drops below 95 degrees Fahrentheit(35 degree Celcius))

In [107]:
hypothermia_df = pd.read_csv("./data_pd/Hypothermia.csv")
hypothermia_df.head()

Unnamed: 0,case,code,date,time,weight,t.nur,t.or,t.1,t.2,t.3,t.4,t.5,t.6
0,1,98887,12.0,20.3,2550,29,28,34.0,35.1,,,,
1,2,98528,11.0,18.1,1200,29,23,29.0,33.0,36.5,36.8,,
2,3,95723,3.0,10.24,2650,31,32,31.0,36.8,,,,
3,4,97694,8.0,0.3,800,27,22,32.0,34.0,35.0,35.5,36.0,36.5
4,5,96892,6.0,6.2,1880,30,30,32.2,36.6,,,,


In [108]:
# Select relevant columns (assuming t.1 and t.2 are numeric)
np.random.seed(4)
usage_data = hypothermia_df[['t.1','t.2']].sample(50)
# len(usage_data['t.1']), len(usage_data['t.2'])

In [109]:
# perform paired t-test
tstat, pval = ttest_rel(usage_data['t.1'], usage_data['t.2'])

In [110]:
# Degree of freedom (assuming equal sample sizes)
df = len(usage_data['t.1']) - 1

In [111]:
# critical value (alpha = 0.05, two - tailed test)
alpha = 0.05
critical_value = t.ppf(1-alpha/2, df)       # One tailed critical value for each tail

In [112]:
# Print result
print(f"\nT-statistics: {tstat:.2f}, p-value : {pval}")
print(f"Critical value (alpha = {alpha}, two-tailed): +/- {critical_value:.2f}")


T-statistics: -12.94, p-value : 2.031237846346811e-17
Critical value (alpha = 0.05, two-tailed): +/- 2.01


In [113]:
# interpretation
if pval < alpha:
    print("Reject null hypothesis : (p<0.05).")
else:
    print("Fail to reject null hypothesis : (p >=0.05).")

Reject null hypothesis : (p<0.05).


In [114]:
if abs(tstat) > critical_value:
    print("The difference is statistically significant at the 5% level (|t| > critical value).")
else:
    print("The difference is statistically significant (|t| <= critical value).")

The difference is statistically significant at the 5% level (|t| > critical value).


## ANOVA 

In [115]:
from scipy.stats import f_oneway

In [116]:
df = pd.read_csv("./data_pd/cust_seg.csv")
df.head()

Unnamed: 0,custid,sex,AqChannel,region,Marital_status,segment,pre_usage,Post_usage_1month,Latest_mon_usage,post_usage_2ndmonth
0,70,0,4,1,1,1,57,52,49.2,57.2
1,121,1,4,2,1,3,68,59,63.6,64.9
2,86,0,4,3,1,1,44,33,64.8,36.3
3,141,0,4,3,1,3,63,44,56.4,48.4
4,172,0,4,2,1,2,47,52,68.4,57.2


In [117]:
# Find out hoe many segments and how many customer per segment we have 
print(df.segment.value_counts())

segment
2    105
3     50
1     45
Name: count, dtype: int64


In [118]:
# Separate segments 
s2 = df.Latest_mon_usage[df.segment ==2]
s1 = df.Latest_mon_usage[df.segment ==1]
s3 = df.Latest_mon_usage[df.segment==3]

In [119]:
fstat, pval = f_oneway(s1,s2,s3)

In [120]:
# Degree of freedom
between_df = len([s1,s2,s3]) - 1        # Number of groups - 1
within_df = len(s1) + len(s2) + len(s3) - len([s1,s2,s3])       # total observation - number of groups


In [121]:
# Critical value (alpha = 0.05)
alpha = 0.05    # Because we are only checking for the difference, not < > etc
critical_value = stats.f.ppf(1 - alpha, between_df, within_df)      # One tailed critical value
critical_value

3.0417530299846947

In [122]:
# Print result
print(f"\nF-statistics: {fstat}, p-values: {pval:.4f}")
print(f"Critical value (alpha = {alpha}) : {critical_value:.2f}")



F-statistics: 29.279283801321778, p-values: 0.0000
Critical value (alpha = 0.05) : 3.04


In [123]:
# Interpretation (add your logic here)
if pval < alpha:
    print("Reject null hypothesis: There's a significant difference in Latest_mon_usage across customer segments (p < 0.05). ")
else:
    print("Fail to reject null ypothesis : The Latest_mon_usage might be similar acoss segments (p>=0.05)")
    

Reject null hypothesis: There's a significant difference in Latest_mon_usage across customer segments (p < 0.05). 


In [125]:
if fstat > critical_value:
    print("F-score is greater than the critical value, suggesting significant differece exist.")
else:
    print("F-score is not significant enough to reject the null hypothesis")

F-score is greater than the critical value, suggesting significant differece exist.


    - For studentsPerformance.csv, using ANOVA find if there is a significant difference in Maths marks depending on the level of parental education.

In [129]:
student_df = pd.read_csv("./data_pd/StudentsPerformance.csv")
student_df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [133]:
print(student_df['parental level of education'].value_counts())

parental level of education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64


In [136]:
# Separate segments 
s1 = student_df['math score'][student_df['parental level of education'] == 'some college']
s2 = student_df['math score'][student_df['parental level of education'] == "associate's degree"]
s3 = student_df['math score'][student_df['parental level of education'] == 'high school']
s4 = student_df['math score'][student_df['parental level of education'] == 'some high school']
s5 = student_df['math score'][student_df['parental level of education'] == "bachelor's degree"]
s6 = student_df['math score'][student_df['parental level of education'] == "master's degree"]


In [137]:
fstat, pval = f_oneway(s1,s2,s3,s4,s5,s6)
fstat, pval

(6.521582600453217, 5.592272384107223e-06)

In [138]:
# Degree of freedom
between_df = len([s1,s2,s3,s4,s5,s6]) - 1        # Number of groups - 1
within_df = len(s1) + len(s2) + len(s3) + len(s4) + len(s5) + len(s6) - len([s1,s2,s3,s4,s5,s6])       # total observation - number of groups

In [139]:
# Critical value (alpha = 0.05)
alpha = 0.05    # Because we are only checking for the difference, not < > etc
critical_value = stats.f.ppf(1 - alpha, between_df, within_df)      # One tailed critical value
critical_value

2.223106606552079

In [140]:
# Interpretation (add your logic here)
if pval < alpha:
    print("Reject null hypothesis:  (p < 0.05). ")
else:
    print("Fail to reject null ypothesis :  (p>=0.05)")
    

Reject null hypothesis:  (p < 0.05). 


In [141]:
if fstat > critical_value:
    print("F-score is greater than the critical value, suggesting significant differece exist.")
else:
    print("F-score is not significant enough to reject the null hypothesis")

F-score is greater than the critical value, suggesting significant differece exist.


## Chi-squared Test

In [144]:
df = pd.read_csv("./data_pd/tips.csv")
df.head()

Unnamed: 0,total_bill,tip,gender,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560330000000000.0,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478070000000000.0,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011810000000000.0,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676140000000000.0,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832730000000000.0,Sun2251


In [None]:
# Cross tabulation 
crosstab = pd.crosstab(df['gender'], df['smoker'])
print("Crosstab of gender and smoker: \n", crosstab)

Crosstab of gender and smoker: 
 smoker  No  Yes
gender         
Female  54   33
Male    97   60


In [146]:
crosstab = pd.crosstab(df['gender'], df['day'])
print("\nCrosstab of gender and day: \n", crosstab)


Crosstab of gender and day: 
 day     Fri  Sat  Sun  Thur
gender                     
Female    9   28   18    32
Male     10   59   58    30


In [147]:
crosstab = pd.crosstab(df['gender'], df['time'])
print("\nCrosstab of gender and day: \n", crosstab)


Crosstab of gender and day: 
 time    Dinner  Lunch
gender               
Female      52     35
Male       124     33


In [149]:
# ERROR! Uncommenting this will result ain an error
# crosstab = pd.crosstab(df['gender'], df['day'], df['time'])
# print("\n Crosstab of gender , day, and time: \n", crosstab)

In [150]:
# Cross tab will aggregation : Sum of total_bill by gender and day
crosstab_total_bill = pd.crosstab(df['gender'], df['day'], values=df['total_bill'], aggfunc='sum')
print("Sum of total bill by gender and day. \n", crosstab_total_bill)

Sum of total bill by gender and day. 
 day        Fri      Sat      Sun    Thur
gender                                  
Female  127.31   551.05   357.70  534.89
Male    198.57  1227.35  1269.46  561.44


In [151]:
# Crosstab with aggregation: average tip by gender and smoker
crosstab_avg_tip = pd.crosstab(df['gender'], df['smoker'], values=df['tip'], aggfunc='mean')
print("\n Average tip by gender and smoker: \n", crosstab_avg_tip)


 Average tip by gender and smoker: 
 smoker        No       Yes
gender                    
Female  2.773519  2.931515
Male    3.113402  3.051167


In [152]:
# Crosstab with aggregation: Max tip percentage by day and time
df['tip_percentage'] = (df['tip'] / df['total_bill'])*100
crosstab_max_tip_percentage = pd.crosstab(df['day'], df['time'], values= df['tip_percentage'], aggfunc='max')
print("\n Max tip percentage by day and time : \n ", crosstab_max_tip_percentage)


 Max tip percentage by day and time : 
  time     Dinner      Lunch
day                       
Fri   26.348039  25.931446
Sat   32.573290        NaN
Sun   71.034483        NaN
Thur  15.974441  26.631158


In [154]:
student_df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [155]:
crosstab = pd.crosstab(student_df['race/ethnicity'], student_df['test preparation course'])

In [158]:
# Crosstab with aggregation : Average tip by gender and smoker
crosstab_avg_marks = pd.crosstab(student_df['race/ethnicity'], student_df['test preparation course'], values=student_df['math score'], aggfunc=['mean', 'min','max'])
crosstab_avg_marks

Unnamed: 0_level_0,mean,mean,min,min,max,max
test preparation course,completed,none,completed,none,completed,none
race/ethnicity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
group A,68.258065,58.086207,34,28,100,91
group B,67.191176,61.368852,23,8,94,97
group C,67.495726,62.707921,29,0,98,97
group D,69.792683,66.255556,35,26,100,98
group E,77.433333,71.1125,42,30,100,100


### Chi-square test of independence

In [169]:
from scipy.stats import chi2_contingency

In [170]:
df = pd.read_csv("./data_pd/airline_passenger_satisfaction.csv")
df.head()

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,Male,48,First-time,Business,Business,821,2,5.0,3,...,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,2,Female,35,Returning,Business,Business,821,26,39.0,2,...,5,4,5,5,3,5,2,5,5,Satisfied
2,3,Male,41,Returning,Business,Business,853,0,0.0,4,...,3,5,3,5,5,3,4,3,3,Satisfied
3,4,Male,50,Returning,Business,Business,1905,0,0.0,2,...,5,5,5,4,4,5,2,5,5,Satisfied
4,5,Female,49,Returning,Business,Business,3470,0,1.0,3,...,3,4,4,5,4,3,3,3,3,Satisfied


In [184]:
male_df = df[df['Gender'] == "Male"]        # filter by gender
female_df = df[df['Gender'] == "Female"]        # filter by gender


In [185]:
# Analysize On-board Services and class (Independence test - Male Passenger)
category1 = "Class"
category2 = "On-board Service"

# Create contigency table
crosstab = pd.crosstab(male_df[category1], male_df[category2])


In [186]:
# Perform Chi-square test
chi2, pval, degree_of_freedom, expected_counts = chi2_contingency(crosstab)

In [187]:
# Print Result
print(f"\nCategory 1: {category1}, Category 2: {category2} (Male Passengers)")
print(crosstab)
print(f"\nChi-square : {chi2:.2f}, p-value: {pval:.2f} , Degree of freedom: {degree_of_freedom}")



Category 1: Class, Category 2: On-board Service (Male Passengers)
On-board Service  0     1     2     3      4     5
Class                                             
Business          2  2053  3379  5826  10743  8894
Economy           0  4420  4430  7403   7535  4851
Economy Plus      0   791   792  1118   1076   668

Chi-square : 3277.66, p-value: 0.00 , Degree of freedom: 10


In [188]:
# Interpretation
if pval < 0.05:
    print("Reject null hypothesis: There's a significant association between class and on-board service rating (p < 0.05)")
else:
    print("Fail to reject null hypothesis: Class and on-board service rating might be independent (p >= 0.05)")

Reject null hypothesis: There's a significant association between class and on-board service rating (p < 0.05)


# Test on cricket dataset

- 1. Combine the cricket dataset
- 2. Filter the data of two countries (Country column) : India and Australia
- 3. Find the list of bowlers who have taken atleast 25 wickets overall (sum). Check if your results are correct by comparing with sites such as CricInfo (e.g. Wickets taken by a bowler who retired before say 2020)
- 4. Take a random sample of 30 such Indian and Australian Bowlers
- 5. Find the bowling average of these sampled Indians and Australians shows a statistically significant difference (Two - sampled T-test).
-       Note: Bowling average = Runs conceded / Wickets taken
- 6. Extend this to include one more country (e.g. England). Now find if the average of these sampled Indians , Australians and English Bowelers shows a statistically significant difference (ANOVA)

In [285]:
df1 = pd.read_csv("./data_pd/Men ODI Player Innings Stats - 20th Century.csv")
df2 = pd.read_csv("./data_pd/Men ODI Player Innings Stats - 21st Century.csv")
cricket_df = pd.concat([df1, df2])
cricket_df.head()

Unnamed: 0,Innings Player,Innings Runs Scored,Innings Runs Scored Num,Innings Minutes Batted,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Boundary Fours,Innings Boundary Sixes,Innings Batting Strike Rate,...,Innings Overs Bowled,Innings Bowled Flag,Innings Maidens Bowled,Innings Runs Conceded,Innings Wickets Taken,4 Wickets,5 Wickets,10 Wickets,Innings Wickets Taken Buckets,Innings Economy Rate
0,RA Smith,167*,167,208,1.0,1.0,163,17,3,102.45,...,,,,,,,,,,
1,DI Gower,158,158,177,1.0,0.0,118,18,4,133.89,...,,,,,,,,,,
2,CWJ Athey,142*,142,208,1.0,1.0,172,14,0,82.55,...,,,,,,,,,,
3,GA Gooch,142,142,-,1.0,0.0,134,14,0,105.97,...,,,,,,,,,,
4,DL Amiss,137,137,-,1.0,0.0,147,18,0,93.19,...,,,,,,,,,,


In [286]:
india_df = cricket_df[cricket_df['Country'] == "India"]
australia_df = cricket_df[cricket_df['Country'] == "Australia"]

In [298]:
india_df = india_df[(india_df['Innings Wickets Taken'].notna()) & (india_df['Innings Wickets Taken'] != '-')]
australia_df = australia_df[(australia_df['Innings Wickets Taken'].notna()) & (australia_df['Innings Wickets Taken'] != '-')]


In [320]:
india_df['Innings Wickets Taken'] = india_df['Innings Wickets Taken'].astype(int)
australia_df['Innings Wickets Taken'] = australia_df['Innings Wickets Taken'].astype(int)


In [338]:
df2 = india_df.groupby("Innings Player").agg({"Innings Wickets Taken": "sum", "Innings Runs Conceded": "sum"}).reset_index()
df3 = australia_df.groupby("Innings Player").agg({"Innings Wickets Taken": "sum", "Innings Runs Conceded": "sum"}).reset_index()

In [339]:
df2 = df2[df2['Innings Wickets Taken'] >= 25]
df3 = df3[df3['Innings Wickets Taken'] >= 25]

In [340]:
np.random.seed(3)
ind_bowler = df2.sample(n=30, replace=False)
aus_bowler = df3.sample(n=30, replace=False)

In [341]:
ind_bowler

Unnamed: 0,Innings Player,Innings Wickets Taken,Innings Runs Conceded
16,AR Patel,135,3438394092426393941455116222630353839404044454...
141,SR Tendulkar,266,3234385634364345824292931323439404449615613141...
132,SB Joshi,69,6381740162329303032333636363743434757692527333...
165,YK Pathan,99,4956112737511791314182223272836374145495152566...
124,S Madan Lal,73,2037111527273142192330333939415051535665693111...
61,KM Jadhav,81,2329622264101112131727273133343538484811111314...
79,Maninder Singh,66,2221232434404754171923252630404391919212424282...
83,N Kapil Dev,253,4330315417232326262635363637384144485410161617...
64,L Balaji,102,4820275262283237376432374851525256602632323340...
37,DS Mohanty,65,5615283031335258253135364142435476223131353638...
