In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr, ttest_ind
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [9]:
# Load the cleaned student-performance data from the working directory,
# then preview the first five rows (the default for `head`).
df = pd.read_csv("cleaned_students_performance.csv")
df.head()

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,family_support,alcohol_consumption
0,female,group B,bachelor's degree,standard,not_completed,72.0,72.0,74.0,no,yes
1,female,group C,some college,standard,completed,69.0,90.0,88.0,yes,yes
2,female,group B,master's degree,standard,not_completed,90.0,95.0,93.0,no,yes
3,male,group A,associate's degree,free/reduced,not_completed,47.0,57.0,44.0,no,yes
4,male,group C,some college,standard,not_completed,76.0,78.0,75.0,no,no


In [10]:
# List the column labels of the loaded frame.
df.columns

Index(['gender', 'race/ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score', 'family_support', 'alcohol_consumption'],
      dtype='object')

In [11]:
# Show the dtype pandas assigned to each column.
df.dtypes

gender                          object
race/ethnicity                  object
parental_level_of_education     object
lunch                           object
test_preparation_course         object
math_score                     float64
reading_score                  float64
writing_score                  float64
family_support                  object
alcohol_consumption             object
dtype: object

In [12]:
# 1. Correlation (Pearson and Spearman)
# Math score is the common variable in both comparisons.
math_scores = df['math_score']

# Pearson correlation between math and reading scores.
pearson_math_reading = pearsonr(math_scores, df['reading_score'])
print("Pearson Correlation (Math vs Reading):", pearson_math_reading)

# Spearman rank correlation between math and writing scores.
spearman_math_writing = spearmanr(math_scores, df['writing_score'])
print("Spearman Correlation (Math vs Writing):", spearman_math_writing)

Pearson Correlation (Math vs Reading): PearsonRResult(statistic=0.7988810193945513, pvalue=1.831074448784342e-219)
Spearman Correlation (Math vs Writing): SignificanceResult(statistic=0.7688018514907428, pvalue=2.868621472261735e-193)


In [13]:
# 2. T-test: Performance vs Test Preparation
# Split the math scores by the value of `test_preparation_course`.
prep_status = df['test_preparation_course']
completed = df.loc[prep_status == 'completed', 'math_score']
not_completed = df.loc[prep_status == 'not_completed', 'math_score']

# Independent two-sample t-test, called with SciPy's default settings.
t_stat, p_val = ttest_ind(completed, not_completed)

# NOTE(review): the `.4f` format shows very small p-values as 0.0000;
# inspect `p_val` directly when the exact magnitude matters.
print(f"T-test for Math Score based on Test Preparation: t={t_stat:.2f}, p={p_val:.4f}")


T-test for Math Score based on Test Preparation: t=5.23, p=0.0000


In [14]:
# 3. One-way ANOVA: Math score across lunch types
# `C(lunch)` makes the formula treat `lunch` as a categorical factor.
lunch_formula = 'math_score ~ C(lunch)'
lunch_ols = smf.ols(lunch_formula, data=df)
anova = lunch_ols.fit()

# Build the ANOVA table (typ=2) from the fitted model.
anova_table = sm.stats.anova_lm(anova, typ=2)

print("\nANOVA: Math Score ~ Lunch Type")
print(anova_table)



ANOVA: Math Score ~ Lunch Type
                 sum_sq     df           F        PR(>F)
C(lunch)   23127.172572    1.0  126.818906  9.339249e-28
Residual  179445.940004  984.0         NaN           NaN


In [15]:
# 4. Regression: Predicting math score
# Ordinary least squares with math_score as the response and the reading
# and writing scores as predictors.
regression_formula = 'math_score ~ reading_score + writing_score'
regression_spec = smf.ols(regression_formula, data=df)
model = regression_spec.fit()

print("\nRegression Summary:")
print(model.summary())


Regression Summary:
                            OLS Regression Results                            
Dep. Variable:             math_score   R-squared:                       0.643
Model:                            OLS   Adj. R-squared:                  0.642
Method:                 Least Squares   F-statistic:                     885.6
Date:                Sat, 07 Jun 2025   Prob (F-statistic):          1.20e-220
Time:                        22:41:24   Log-Likelihood:                -3516.5
No. Observations:                 986   AIC:                             7039.
Df Residuals:                     983   BIC:                             7054.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         9.2803 