<a href="https://colab.research.google.com/github/MarrinXia/MSSP-607/blob/main/Week12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
from google.colab import drive
import pandas as pd
import scipy.stats as stats
import os

# Where the dataset lives on the mounted Drive.
# Adjust DATA_PATH if your copy of the CSV sits in a different folder.
DATA_PATH = "/content/drive/MyDrive/MSSP607/data/"
file_name = "Student Performance Data.csv"
full_path = os.path.join(DATA_PATH, file_name)

# Expose Google Drive inside the Colab runtime before the file is read.
drive.mount('/content/drive')

# Read the CSV into the DataFrame that the analyses operate on.
df = pd.read_csv(full_path)
print("Data loaded successfully.")

# --- Task 2: Males (Prep Completed) vs Others ---
banner = "=" * 40
print("\n" + banner)
print("Task 2: Males (Prep Completed) vs Others")
print(banner)

# Row mask for Group A: male students who completed the prep course.
is_male_prep = (
    df['gender'].eq('male')
    & df['test preparation course'].eq('completed')
)

# Group A takes the masked rows; Group B is the exact complement (everyone else).
group_male_prep = df.loc[is_male_prep, 'total score']
group_others = df.loc[~is_male_prep, 'total score']

# equal_var=False requests Welch's t-test, which does not assume equal variances.
t_stat_2, p_val_2 = stats.ttest_ind(
    group_male_prep,
    group_others,
    equal_var=False,
)

print(f"Mean Score (Male + Prep): {group_male_prep.mean():.2f}")
print(f"Mean Score (Others):      {group_others.mean():.2f}")
print(f"P-value: {p_val_2:.5f}")

# Report the outcome against the 0.05 significance threshold.
conclusion = (
    "Conclusion: The difference is statistically significant."
    if p_val_2 < 0.05
    else "Conclusion: No significant difference found."
)
print(conclusion)


# --- Task 3: Other Interesting Differences ---
def _report_score_gap(data, heading, column, value_a, value_b, label_a, label_b):
    """Compare 'total score' between two categories of `column` and print the result.

    Prints the heading, selects the rows where `column` equals `value_a` and
    `value_b`, runs a two-sample t-test with equal_var=False (Welch's test) on
    their 'total score' values, then prints both group means (two decimals)
    and the p-value (scientific notation, five decimals).

    Args:
        data: DataFrame containing `column` and a 'total score' column.
        heading: Text printed after a blank line to introduce the analysis.
        column: Name of the column whose values define the two groups.
        value_a: Value of `column` selecting the first group.
        value_b: Value of `column` selecting the second group.
        label_a: Display name of the first group in the "Mean (...)" line.
        label_b: Display name of the second group in the "Mean (...)" line.

    Returns:
        Tuple (scores_a, scores_b, t_stat, p_value) so callers can keep the
        intermediate results for further use.
    """
    print(f"\n{heading}")
    scores_a = data.loc[data[column] == value_a, 'total score']
    scores_b = data.loc[data[column] == value_b, 'total score']
    t_stat, p_value = stats.ttest_ind(scores_a, scores_b, equal_var=False)
    print(f"Mean ({label_a}): {scores_a.mean():.2f}")
    print(f"Mean ({label_b}): {scores_b.mean():.2f}")
    print(f"P-value: {p_value:.5e}")
    return scores_a, scores_b, t_stat, p_value


print("\n" + "="*40)
print("Task 3: Other Interesting Differences")
print("="*40)

# 3.1 Lunch Type
g_lunch_std, g_lunch_free, t_lunch, p_lunch = _report_score_gap(
    df, "[Analysis 1] Lunch Type (Standard vs Free/Reduced)",
    'lunch', 'standard', 'free/reduced', "Standard", "Free/Reduced",
)

# 3.2 Prep Course (Overall)
g_prep, g_noprep, t_prep, p_prep = _report_score_gap(
    df, "[Analysis 2] Test Prep Course (Completed vs None)",
    'test preparation course', 'completed', 'none', "Completed", "None",
)

# 3.3 Gender
g_female, g_male, t_gen, p_gen = _report_score_gap(
    df, "[Analysis 3] Gender (Female vs Male)",
    'gender', 'female', 'male', "Female", "Male",
)

Mounted at /content/drive
Data loaded successfully.

Task 2: Males (Prep Completed) vs Others
Mean Score (Male + Prep): 212.34
Mean Score (Others):      201.41
P-value: 0.00127
Conclusion: The difference is statistically significant.

Task 3: Other Interesting Differences

[Analysis 1] Lunch Type (Standard vs Free/Reduced)
Mean (Standard): 212.51
Mean (Free/Reduced): 186.60
P-value: 1.58297e-19

[Analysis 2] Test Prep Course (Completed vs None)
Mean (Completed): 218.01
Mean (None): 195.12
P-value: 4.42673e-17

[Analysis 3] Gender (Female vs Male)
Mean (Female): 208.71
Mean (Male): 197.51
P-value: 3.18620e-05
