In [31]:
import pandas as pd
import numpy as np
from scipy.stats import ranksums, ttest_ind, chi2_contingency

In [32]:
# Read the college records from the CSV file in the working directory.
csv_path = 'collegeData.csv'
college_data = pd.read_csv(csv_path)
# A bare name on the last line renders the frame as the cell output.
college_data

Unnamed: 0,SexCode,MaritalCode,PrevEdCode,DDVeteran,DaysEnrollToStart,AgeAtStart,AgeAtGrad,GPA,MinutesAttended,HoursAttempt,HoursEarned,HoursReq,MinutesAbsent,TransferCredits,TransferGPA,MinEFC,MaxENTEntranceScore,gradFlag
0,M,M,BACH,0,55,24,27,3.22,145953,2925.0,2550.0,2565,3475,19.00,2.55,0.0,81.00,1
1,F,M,BACH,0,143,22,25,3.02,129045,2640.0,2565.0,2565,11840,12.00,,0.0,89.50,1
2,F,S,BACH,0,98,30,33,3.47,111385,2559.0,2514.0,2565,935,37.67,2.84,0.0,,1
3,F,UN,BACH,0,101,24,27,3.19,135401,2520.0,2520.0,2565,4549,6.00,,0.0,87.50,1
4,M,,SOMECOLL,0,61,19,22,3.84,115660,2520.0,2520.0,2565,1340,22.00,,3141.0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2779,F,UN,SOMECOLL,0,101,26,29,3.11,117301,2325.0,2295.0,2250,9619,10.00,,0.0,86.00,1
2780,F,S,HS,0,109,23,25,2.50,99735,1890.0,1620.0,2565,13950,,,5562.0,80.87,0
2781,M,S,SOMECOLL,0,103,22,23,3.30,165378,3135.0,2460.0,0,6042,,,0.0,91.91,1
2782,F,UN,SOMECOLL,0,47,26,28,3.19,31915,840.0,690.0,2415,4995,26.00,,5772.0,84.50,0


In [33]:
# Partition the records by outcome flag: gradFlag == 1 goes to the graduates
# frame and gradFlag == 0 goes to the dropouts frame.
graduates_clean = college_data.loc[college_data['gradFlag'].eq(1)]
dropouts_clean = college_data.loc[college_data['gradFlag'].eq(0)]

In [34]:
# Test 1: Wilcoxon rank-sum test on GPA, graduates vs. dropouts.
# Missing GPA values are dropped from each group before the test is run.
# NOTE(review): the scipy documentation states that ranksums does not handle
# ties between the two samples; GPA values presumably repeat often, so confirm
# whether a tie-aware alternative (scipy.stats.mannwhitneyu) is more suitable.
dropouts_gpa = dropouts_clean['GPA'].dropna()
graduates_gpa = graduates_clean['GPA'].dropna()
gpa_ranksum = ranksums(graduates_gpa, dropouts_gpa)
test_stat_gpa = gpa_ranksum.statistic
p_value_gpa = gpa_ranksum.pvalue

In [35]:
# Test 2: two-sample t-test on AgeAtStart, graduates vs. dropouts.
# equal_var=False means the two groups are not assumed to share a variance
# (Welch's form of the test).
dropouts_age = dropouts_clean['AgeAtStart'].dropna()
graduates_age = graduates_clean['AgeAtStart'].dropna()
age_ttest = ttest_ind(graduates_age, dropouts_age, equal_var=False)
test_stat_age = age_ttest.statistic
p_value_age = age_ttest.pvalue

In [36]:
# Test 3: Wilcoxon rank-sum test on TransferGPA, graduates vs. dropouts.
# Only students with a recorded TransferGPA take part; missing values are
# dropped from each group first.
# NOTE(review): the scipy documentation states that ranksums does not handle
# ties between the two samples; confirm whether that matters for this column.
dropouts_transfer_gpa = dropouts_clean['TransferGPA'].dropna()
graduates_transfer_gpa = graduates_clean['TransferGPA'].dropna()
transfer_gpa_ranksum = ranksums(graduates_transfer_gpa, dropouts_transfer_gpa)
test_stat_transfer_gpa = transfer_gpa_ranksum.statistic
p_value_transfer_gpa = transfer_gpa_ranksum.pvalue

In [37]:
# Test 4: two-sample t-test on TransferCredits, graduates vs. dropouts.
# Missing values are dropped per group; equal_var=False means the groups are
# not assumed to share a variance (Welch's form of the test).
dropouts_transfer_credits = dropouts_clean['TransferCredits'].dropna()
graduates_transfer_credits = graduates_clean['TransferCredits'].dropna()
transfer_credits_ttest = ttest_ind(
    graduates_transfer_credits,
    dropouts_transfer_credits,
    equal_var=False,
)
test_stat_transfer_credits = transfer_credits_ttest.statistic
p_value_transfer_credits = transfer_credits_ttest.pvalue

In [38]:
# Gather the statistic and p-value of tests 1-4 into one dict, keeping the
# order in which the tests were run, and show it as the cell output.
results_tests_1_to_4 = dict([
    ("GPA Medians Test Statistic", test_stat_gpa),
    ("GPA Medians P-Value", p_value_gpa),
    ("Age Means Test Statistic", test_stat_age),
    ("Age Means P-Value", p_value_age),
    ("Transfer GPA Medians Test Statistic", test_stat_transfer_gpa),
    ("Transfer GPA Medians P-Value", p_value_transfer_gpa),
    ("Transfer Credits Means Test Statistic", test_stat_transfer_credits),
    ("Transfer Credits Means P-Value", p_value_transfer_credits),
])
results_tests_1_to_4

{'GPA Medians Test Statistic': 38.12823902794451,
 'GPA Medians P-Value': 0.0,
 'Age Means Test Statistic': 1.2551304992860344,
 'Age Means P-Value': 0.20960262234043067,
 'Transfer GPA Medians Test Statistic': 7.572647069553644,
 'Transfer GPA Medians P-Value': 3.656948357918701e-14,
 'Transfer Credits Means Test Statistic': 8.034523946203917,
 'Transfer Credits Means P-Value': 2.4435242045889162e-15}

In [39]:
# Test 5: Chi-square test of independence between gender (SexCode) and
# graduation outcome (gradFlag).
# The contingency table is built from the full dataset so that both outcome
# values appear as columns; a table built from a single outcome group would
# have only one column and could not test association.
# NOTE(review): chi2_contingency applies Yates' continuity correction by
# default when the table has one degree of freedom; if SexCode has only two
# levels, confirm that the corrected statistic is the one wanted.
gender_contingency = pd.crosstab(college_data['SexCode'], college_data['gradFlag'])
# chi2_contingency returns (statistic, p-value, dof, expected frequencies).
# Slice off the first two rather than unpacking the rest into `_`, which in
# IPython would rebind the last-output variable.
chi2_gender, p_value_gender = chi2_contingency(gender_contingency)[:2]

In [40]:
# Test 6: Chi-square test of independence between marital status
# (MaritalCode) and graduation outcome (gradFlag).
# The contingency table is built from the full dataset so that both outcome
# values appear as columns; a table built from a single outcome group would
# have only one column and could not test association.
# NOTE(review): MaritalCode has missing entries in the data preview; those rows
# are presumably left out of the table by crosstab's defaults — confirm that
# excluding them is intended.
marital_contingency = pd.crosstab(college_data['MaritalCode'], college_data['gradFlag'])
# chi2_contingency returns (statistic, p-value, dof, expected frequencies).
# Slice off the first two rather than unpacking the rest into `_`, which in
# IPython would rebind the last-output variable.
chi2_marital, p_value_marital = chi2_contingency(marital_contingency)[:2]

In [41]:
# Test 7: Chi-square test of independence between previous education
# (PrevEdCode) and graduation outcome (gradFlag).
# The contingency table is built from the full dataset so that both outcome
# values appear as columns; a table built from a single outcome group would
# have only one column and could not test association.
prev_ed_contingency = pd.crosstab(college_data['PrevEdCode'], college_data['gradFlag'])
# chi2_contingency returns (statistic, p-value, dof, expected frequencies).
# Slice off the first two rather than unpacking the rest into `_`, which in
# IPython would rebind the last-output variable.
chi2_prev_ed, p_value_prev_ed = chi2_contingency(prev_ed_contingency)[:2]

In [42]:
# Gather the chi-square statistic and p-value of tests 5-7 into one dict,
# keeping the order in which the tests were run, and show it as the cell output.
results_tests_5_to_7 = dict([
    ("Gender Chi-Square Statistic", chi2_gender),
    ("Gender P-Value", p_value_gender),
    ("Marital Status Chi-Square Statistic", chi2_marital),
    ("Marital Status P-Value", p_value_marital),
    ("Previous Education Chi-Square Statistic", chi2_prev_ed),
    ("Previous Education P-Value", p_value_prev_ed),
])
results_tests_5_to_7

{'Gender Chi-Square Statistic': 18.947781730572082,
 'Gender P-Value': 1.3434539080515035e-05,
 'Marital Status Chi-Square Statistic': 76.0543852268735,
 'Marital Status P-Value': 1.1922501823455327e-15,
 'Previous Education Chi-Square Statistic': 239.19525107943014,
 'Previous Education P-Value': 5.511096347201127e-48}