In [56]:
import pandas as pd
# Load the raw dataset; `data` is kept untouched so the original values stay inspectable.
data = pd.read_csv("data/StudentPerformanceFactors.csv")

# Fill gaps in three categorical columns with a fixed label per column.
# `fillna` returns a new frame, so no explicit copy or in-place mutation is needed
# and re-running this cell always yields the same `df`.
# NOTE(review): the fill labels are hard-coded (presumably each column's most
# common level) — confirm against the data.
df = data.fillna(
    {
        "Distance_from_Home": "Moderate",
        "Parental_Education_Level": "College",
        "Teacher_Quality": "Medium",
    }
)

In [57]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Categorical columns with no inherent order.
nominal_features = [
    "Extracurricular_Activities",
    "Internet_Access",
    "School_Type",
    "Learning_Disabilities",
    "Gender",
]

# Numeric columns.
numeric_features = [
    "Hours_Studied",
    "Attendance",
    "Sleep_Hours",
    "Previous_Scores",
    "Tutoring_Sessions",
    "Physical_Activity",
]

# Ordinal columns mapped to their levels, listed from lowest to highest rank.
# Keeping column and levels side by side guarantees the two derived lists
# below stay aligned position-for-position.
_low_medium_high = ["Low", "Medium", "High"]
_ordinal_levels = {
    "Parental_Involvement": _low_medium_high,
    "Access_to_Resources": _low_medium_high,
    "Motivation_Level": _low_medium_high,
    "Family_Income": _low_medium_high,
    "Teacher_Quality": _low_medium_high,
    "Peer_Influence": ["Negative", "Neutral", "Positive"],
    "Parental_Education_Level": ["High School", "College", "Postgraduate"],
    "Distance_from_Home": ["Near", "Moderate", "Far"],
}

# Column names, in mapping order.
categorical_ordinal = list(_ordinal_levels)

# One independent level list per column, in the same order as `categorical_ordinal`.
ordinal_categories = [list(levels) for levels in _ordinal_levels.values()]

# One transformer per feature group.
numeric_transformer = StandardScaler()
# handle_unknown="ignore": categories not seen during fit do not raise at transform time.
nominal_transformer = OneHotEncoder(handle_unknown="ignore")
# Explicit level lists fix the integer rank assigned to each ordinal label.
ordinal_transformer = OrdinalEncoder(
    categories=ordinal_categories,
)

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# (name, transformer, columns) triples: each transformer is applied only to its own column group.
_column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("nom", nominal_transformer, nominal_features),
    ("ord", ordinal_transformer, categorical_ordinal),
]
preprocessor = ColumnTransformer(transformers=_column_steps)

# End-to-end model: preprocessing followed by an ordinary least-squares regressor.
# The step is named "classifier" although the estimator is a regressor; the name
# is kept because it is the key used to look the step up on the pipeline.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LinearRegression()),
    ]
)

In [59]:
from sklearn.model_selection import train_test_split

# Target is the exam score; every other column is a candidate predictor.
y = df["Exam_Score"]
X = df.drop(columns=["Exam_Score"])

# Seeded 90/10 hold-out split so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.9,
    test_size=0.1,
    random_state=42,
)

# Fit on the training rows only, then predict the held-out rows.
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)

In [60]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Error metrics of the predictions against the held-out targets.
mae = mean_absolute_error(y_test, prediction)
mse = mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)  # same units as the target, unlike MSE
r2 = r2_score(y_test, prediction)

# Print one "label value" line per metric.
for _label, _value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R² Score:", r2),
):
    print(_label, _value)


Mean Absolute Error (MAE): 0.38729788275799215
Mean Squared Error (MSE): 1.9401266495082075
Root Mean Squared Error (RMSE): 1.3928842915002695
R² Score: 0.8557371641602192
