**Correlation-Based Analysis**

Pearson correlation measures the strength and direction of the linear relationship between each variable and the exam score, making it a good first step for feature-importance analysis before applying predictive models.

In [None]:
import pandas as pd

In [None]:
# Load the student-performance workbook into a DataFrame.
# NOTE: the path is absolute; adjust it when the file lives elsewhere.
df = pd.read_excel(io="/content/StudentPerformanceFactors.xlsx")

In [None]:
# Ordinal / binary integer codes for the categorical columns that take part
# in the correlation analysis. Series.map leaves any value that is not a key
# of the mapping as NaN.
encoding_maps = {
    "Distance_from_Home": {"Near": 1, "Moderate": 2, "Far": 3},
    "Learning_Disabilities": {"No": 0, "Yes": 1},
    "Access_to_Resources": {"Low": 1, "Medium": 2, "High": 3},
    "Parental_Involvement": {"Low": 1, "Medium": 2, "High": 3},
}

# Add one "<column>_enc" column per encoded categorical variable.
for col, mapping in encoding_maps.items():
    df[f"{col}_enc"] = df[col].map(mapping)

# Columns correlated against the target: the natively numeric ones first,
# followed by the encoded columns in the order they were declared above.
numeric_cols = [
    "Hours_Studied",
    "Attendance",
    "Previous_Scores",
    "Tutoring_Sessions",
    "Sleep_Hours",
] + [f"{name}_enc" for name in encoding_maps]

# Pearson correlation (the corrwith default) of each column with Exam_Score.
feature_frame = df[numeric_cols]
exam_score = df["Exam_Score"]
correlations = feature_frame.corrwith(exam_score)

In [None]:
# Factors reducing performance: keep only the variables whose correlation
# with Exam_Score is below zero, ordered from the most negative upward.
is_negative = correlations.lt(0)
negative_factors = correlations.loc[is_negative].sort_values(ascending=True)

print("Factors reducing performance (Negative Correlation)")
# The extra newline reproduces the blank separator line after the listing.
print(negative_factors, end="\n\n")

In [None]:
# Q22: Top 5 strongest predictors
# Rank by magnitude only, so strong negative relationships count as much as
# strong positive ones; the sign is dropped from the reported values.
abs_correlations = correlations.abs()
ranked_by_strength = abs_correlations.sort_values(ascending=False)
top_5_predictors = ranked_by_strength.iloc[:5]

print("Top 5 strongest predictors (Absolute Correlation)")
print(top_5_predictors)