In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [7]:
df = pd.read_csv('C:\\Users\\Mubasshira\\Downloads\\Student_performance_data _.csv')
df.head(3)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0


# One-Sample Z-Test

In [12]:
#Is the average GPA significantly different from 3.0?
#H₀: μ = 3.0
#H₁: μ ≠ 3.0

from statsmodels.stats.weightstats import ztest
zstat, pval = ztest(df['GPA'], value=3.0)
print(f"Z stat : {zstat}")
print(f"P value : {pval}")

if pval < 0.05:
    print("Average GPA is significantly different from 3.0")
    average_gpa = np.mean(df['GPA'])
    print(f"Average GPA is {average_gpa}")
else:
    print("Yes, Average GPA is exactly 3.0")

Z stat : -58.45597371325348
P value : 0.0
Average GPA is significantly different from 3.0
Average GPA is 1.9061863027265407


In [29]:
#On average, students are expected to study at least 10 hours a week. Is this true?
#H₀: μ ≥ 10
#H₁: μ < 10

zstat, pval = ztest(round(df['StudyTimeWeekly'],0), value=10, alternative="smaller")
print(f"Z stat : {zstat}")
print(f"P value : {pval}")

if pval < 0.05:
    print("Students are studying less than 10 hours")
    average_studytime = np.mean(df['StudyTimeWeekly'])
    print(f"Average Study Time is {average_studytime}")
else:
    print("Yes, Students are expected to study at least 10 hours a week")

Z stat : -2.00155857494018
P value : 0.022666114031068583
Students are studying less than 10 hours
Average Study Time is 9.771991919431738


In [28]:
#The school sets a target of no more than 5 absences per student. Is the average higher?
#H₀: μ ≤ 5
#H₁: μ > 5

zstat, pval = ztest(df['Absences'], value=5, alternative='larger')
print(f"Z stat : {zstat}")
print(f"P value : {pval}")

if pval < 0.05:
    print("The average number of absences is significantly higher than 5")
    avg_absences = np.mean(df['Absences'])
    print(f"Average Absences: {avg_absences:.2f}")
else:
    print("The average number of absences is NOT significantly higher than 5")
    avg_absences = np.mean(df['Absences'])
    print(f"Average Absences: {avg_absences:.2f}")

Z stat : 55.111365897797334
P value : 0.0
The average number of absences is significantly higher than 5
Average Absences: 14.54


# Two-Sample Z-Test

In [18]:
#Do male and female students have the same average GPA?
#H₀: μ₁ = μ₂
#H₁: μ₁ ≠ μ₂

ms = df[df['Gender'] == 0]["GPA"]
fs = df[df['Gender'] == 1]["GPA"]

zstat, pval = ztest(x1 = ms, x2 = fs, value= 0)
print(f"Z stat : {zstat}")
print(f"P value : {pval}")

if pval < 0.05:
    print("Male and Female students have different average GPA")
else:
    print("Yes, Average GPA is exactly same for Male and Female students")
    
male_average = np.mean(df[df['Gender'] == 0]["GPA"])
female_average = np.mean(df[df['Gender'] == 1]["GPA"])
    
print(f"Average Male Students GPA is {male_average}")
print(f"Average Female Students GPA is {female_average}")

Z stat : 0.6531957576950806
P value : 0.5136300848367689
Yes, Average GPA is exactly same for Male and Female students
Average Male Students GPA is 1.9186788948802054
Average Female Students GPA is 1.8942253102389894


In [20]:
#Do students who receive tutoring have higher average GPA than those who don't?
#H₀: μ₁ ≤ μ₂
#H₁: μ₁ > μ₂

no_tutor = df[df['Tutoring'] == 0]["GPA"]
tutor = df[df['Tutoring'] == 1]["GPA"]

zstat, pval = ztest(x1 = no_tutor, x2 = tutor, value= 0)
print(f"Z stat : {zstat}")
print(f"P value : {pval}")

if pval < 0.05:
    print("Students who receive tutoring have higher average GPA")
else:
    print("Students with tutoring have less than or equal average GPA compared to students without tutoring")
    
st = np.mean(df[df['Tutoring'] == 1]["GPA"])
nst = np.mean(df[df['Tutoring'] == 0]["GPA"])
    
print(f"Tutored Students average GPA is {st}")
print(f"Untutored Students average GPA is {nst}")

Z stat : -7.170424050973337
P value : 7.476581508315615e-13
Students who receive tutoring have higher average GPA
Tutored Students average GPA is 2.1083247273414276
Untutored Students average GPA is 1.818967987856802


In [30]:
#Do students with 'High' or 'Very High' parental support study more hours weekly than those with lower support?
#H₀: μ₁ ≤ μ₂
#H₁: μ₁ > μ₂

highsupport = df[df['ParentalSupport'] >= 3]["StudyTimeWeekly"]
lowsupport = df[df['ParentalSupport'] < 3]["StudyTimeWeekly"]

zstat, pval = ztest(x1 = highsupport, x2 = lowsupport, value= 0, alternative='smaller')
print(f"Z stat : {zstat}")
print(f"P value : {pval}")

if pval < 0.05:
    print("Students with 'High' or 'Very High' parent support study less than or equal weekly hours compared to other students")
else:
    print("Students with 'High' or 'Very High' parental support study more hours")
    
hs = np.mean(highsupport)
ls = np.mean(lowsupport)

print(f"High Support Avg Study Time: {hs:.2f} hours")
print(f"Low Support Avg Study Time: {ls:.2f} hours")

Z stat : 1.9068749597512826
P value : 0.9717316092680018
Students with 'High' or 'Very High' parental support study more hours
High Support Avg Study Time: 10.04 hours
Low Support Avg Study Time: 9.59 hours


# One-Sample Z-Proportion Test

In [42]:
from statsmodels.stats.proportion import proportions_ztest
#It’s believed that 50% of the students are female. Is the actual proportion different?
#H₀: p = 0.50
#H₁: p ≠ 0.50

success = (df['Gender'] == 1).sum()
total = len(df)
zstat, pval = proportions_ztest(success, total, value=0.50, alternative='two-sided')

print(f"Z stat : {zstat}")
print(f"P value : {pval}")

if pval < 0.05:
    print("50% students are not female")
else:
    print("50% Students are female")
    
male_s = (df['Gender'] == 0).sum()
print(f"Female students % count is {success/total * 100}")
print(f"Male students % count is {male_s/total * 100}")

Z stat : 1.0634703884638677
P value : 0.28756868687945203
50% Students are female
Female students % count is 51.08695652173913
Male students % count is 48.91304347826087


In [41]:
#More than 40% of students are receiving tutoring. Is this true?
#H₀: p ≤ 0.40
#H₁: p > 0.40

success = (df['Tutoring'] == 1).sum()
other = (df['Tutoring'] == 0).sum()
total = len(df)
zstat, pval = proportions_ztest(success, total, value=0.40, alternative='larger')

print(f"Z stat : {zstat}")
print(f"P value : {pval}")

if pval < 0.05:
    print("More than 40% students are receiving tutoring")
else:
    print("Less than or equal to 40% of students are receiving tutoring")
    
print(f"Tutored students % count is {success /total * 100}")
print(f"Untutored students % count is {other / total * 100}")


Z stat : -10.506753203888538
P value : 1.0
Lessthan or equal to 40% of students are receiving tutoring
Tutored students % count is 30.142140468227424
Tutored students % count is 69.85785953177258


In [54]:
#Is less than 50% of students involved in extracurricular activities?
#H₀: p ≥ 0.50
#H₁: p < 0.50

success = (df['Extracurricular'] == 1).sum()
other = (df['Extracurricular'] == 0).sum()
total = len(df)
zstat, pval = proportions_ztest(success, total, value=0.50, alternative='smaller')

print(f"Z stat : {zstat}")
print(f"P value : {pval}")

if pval < 0.05:
    print("less than 50% of students participate in extracurricular activities")
else:
    print("Greater than or equal to 50% of students are involved in extracurricular activities")
    
print(f"Extracurricular activities students % count is {success/total * 100}")
print(f"No Extracurricular activities students % count is {other / total * 100}")


Z stat : -11.73286606152908
P value : 4.3239988114566006e-32
less than 50% of students involved in extracurricular activities
Extracurricular activities students % count is 38.336120401337794
No Extracurricular activities students % count is 61.663879598662206


# Two-Sample Z-Proportion Test

In [60]:
#Is the proportion of females receiving tutoring higher than that of males?
#H₀: p₁ ≤ p₂
#H₁: p₁ > p₂

female = len(df[(df['Gender'] == 1) & (df['Tutoring'] == 1)])
male = len(df[(df['Gender'] == 0) & (df['Tutoring'] == 1)])

female_count = (df['Gender'] == 1).sum()
male_count = (df['Gender'] == 0).sum()
zstat, pval = proportions_ztest([female, male], [female_count, male_count], value=0, alternative='larger')

print(f"Z stat : {zstat}")
print(f"P value : {pval}")

if pval < 0.05:
    print("The proportion of females receiving tutoring is higher than that of males")
else:
    print("The proportion of females receiving tutoring is Lower than or equal to male students receiving tutoring")
    
print(f"Female students receiving tutoring % count is {female/female_count * 100}")
print(f"Male students receiving tutoring % count is {male/male_count * 100}")


Z stat : -1.5453620107587478
P value : 0.9388706380794596
The proportion of females receiving tutoring is Lower than or equal to male students receiving tutoring
Female students receiving tutoring % count is (29, 2)
Male students receiving tutoring % count is (32, 2)


In [61]:
df.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [68]:
#Is the proportion of students involved in sports higher than those volunteering?
#H₀: p₁ ≤ p₂
#H₁: p₁ > p₂

sports = len(df[df['Sports'] == 1])
volunteering = len(df[df['Volunteering'] == 1])
total = len(df)
zstat, pval = proportions_ztest([sports, volunteering], [total, total], value=0, alternative='larger')

print(f"Z stat : {zstat}")
print(f"P value : {pval}")

if pval < 0.05:
    print("The proportion of students involved in sports are higher than those volunteering")
else:
    print("The proportion of students involved in sports is less than or equal to students who are volunteering")
    
print(f"Sports Students % count is {sports/total * 100}")
print(f"Volunterring Students % count is {volunteering/total * 100}")


Z stat : 12.017965155470112
P value : 2.859260696687429e-33
The proportion of students involved in sports are higher than those volunteering
Sports Students % count is 30.351170568561876
Volunterring Students % count is 15.719063545150503


In [70]:
#Do students with parents having a bachelor's degree or higher participate more in extracurricular activities than those with
#less educated parents?
#H₀: p₁ ≤ p₂
#H₁: p₁ > p₂
high_group = df[df['ParentalEducation'] >= 3]
low_group = df[df['ParentalEducation'] < 3]

high_success = (high_group['Extracurricular'] == 1).sum()
low_success = (low_group['Extracurricular'] == 1).sum()

high_total = len(high_group)
low_total = len(low_group)
zstat, pval = proportions_ztest([high_success, low_success], [high_total, low_total], value=0, alternative='larger')

print(f"Z stat : {zstat}")
print(f"P value : {pval}")

if pval < 0.05:
    print("students with parents having a bachelor's degree or higher participate more in extracurricular")
else:
    print("students with parents having a bachelor's degree or higher participate less or equally in extracurriculars than students with college eduction parents ")
    
print(f"Students with High degree parents participating in extracurriculars % count is {highdegree/total * 100}")
print(f"Students with Low degree parents participating in extracurriculars % count is {lowdegree/total * 100}")


Z stat : 0.34496100215257164
P value : 0.3650618444781315
students with parents having a bachelor's degree or higher participate less or equally in extracurriculars than students with college eduction parents 
Students with High degree parents participating in extracurriculars % count is 7.94314381270903
Students with Low degree parents participating in extracurriculars % count is 30.392976588628763
