In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from IPython.display import display

import warnings 
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("Exam_Score_Prediction.csv")

In [3]:
df = df.drop("student_id", axis=1)

In [4]:
df["internet_access"] = df["internet_access"].map({"yes": 1, "no": 2})

In [5]:
df1 = df[df["gender"].isin(["male", "female"])]

In [6]:
df1["gender"] = df1["gender"].map({"male": 0, "female": 1})

In [7]:
X_df1 = df1.drop("exam_score",axis=1)
y_df1 = df1["exam_score"]

In [8]:
from sklearn.model_selection import train_test_split

X_train_df1, X_test_df1, y_train_df1, y_test_df1 = train_test_split(X_df1, y_df1, test_size=0.2, random_state=25)

X_train_df1.shape, X_test_df1.shape, y_train_df1.shape, y_test_df1.shape

((10619, 11), (2655, 11), (10619,), (2655,))

In [9]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [10]:
encoders = {}

encoders["sleep_quality"] = OrdinalEncoder(categories=[["poor", "average", "good"]])
X_train_df1["sleep_quality"] = encoders["sleep_quality"].fit_transform(X_train_df1[["sleep_quality"]])
X_test_df1["sleep_quality"] = encoders["sleep_quality"].transform(X_test_df1[["sleep_quality"]])

encoders["facility_rating"] = OrdinalEncoder(categories=[["low", "medium", "high"]])
X_train_df1["facility_rating"] = encoders["facility_rating"].fit_transform(X_train_df1[["facility_rating"]])
X_test_df1["facility_rating"] = encoders["facility_rating"].transform(X_test_df1[["facility_rating"]])

encoders["exam_difficulty"] = OrdinalEncoder(categories=[["easy", "moderate", "hard"]])
X_train_df1["exam_difficulty"] = encoders["exam_difficulty"].fit_transform(X_train_df1[["exam_difficulty"]])
X_test_df1["exam_difficulty"] = encoders["exam_difficulty"].transform(X_test_df1[["exam_difficulty"]])


encoders["course"] = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)
X_train_df1["course"] = encoders["course"].fit_transform(X_train_df1[["course"]])
X_test_df1["course"] = encoders["course"].transform(X_test_df1[["course"]])

encoders["study_method"] = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)
X_train_df1["study_method"] = encoders["study_method"].fit_transform(X_train_df1[["study_method"]])
X_test_df1["study_method"] = encoders["study_method"].transform(X_test_df1[["study_method"]])

In [11]:
numerical_columns = [col for col in X_train_df1 if X_train_df1[col].dtype != 'O']
scaler = StandardScaler()
X_train_df1[numerical_columns] = scaler.fit_transform(X_train_df1[numerical_columns])
X_test_df1[numerical_columns] = scaler.transform(X_test_df1[numerical_columns])

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [13]:
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [14]:
X_train_df1

Unnamed: 0,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty
13017,0.232212,1.010128,-0.407620,1.225681,1.615560,-0.420728,-0.289100,-0.000693,2.047600,0.003706,-1.279551
6685,-1.079183,-0.989974,-0.407620,1.542858,-1.339835,2.376835,-0.231162,-0.000693,2.047600,-1.226037,0.150868
10420,1.543607,1.010128,-0.407620,-0.729520,-0.264095,-0.420728,1.333159,-0.000693,-0.488377,-1.226037,0.150868
7654,-1.079183,1.010128,2.453264,-1.637601,0.215940,-0.420728,0.232340,1.225207,-0.488377,0.003706,-1.279551
5508,0.232212,1.010128,-0.407620,1.382097,-1.345619,2.376835,-1.563732,1.225207,-0.488377,-1.226037,0.150868
...,...,...,...,...,...,...,...,...,...,...,...
12767,1.106475,-0.989974,-0.407620,1.638445,1.684962,-0.420728,0.695843,1.225207,-0.488377,1.233449,-1.279551
4453,0.669344,-0.989974,2.453264,-0.881591,-0.397116,2.376835,0.348216,1.225207,-0.488377,1.233449,0.150868
15665,-1.516314,1.010128,-0.407620,1.156162,1.615560,-0.420728,-0.462913,1.225207,-0.488377,-1.226037,0.150868
9987,-0.642051,1.010128,-0.407620,-1.042352,0.823213,-0.420728,-0.926416,-0.000693,-0.488377,1.233449,0.150868


In [16]:
linear = LinearRegression()
linear.fit(X_train_df1, y_train_df1)
y_pred = linear.predict(X_test_df1)

score = r2_score(y_test_df1, y_pred)

print("R2 Score: ", score)

R2 Score:  0.7004442051113344


In [17]:
encoders

{'sleep_quality': OrdinalEncoder(categories=[['poor', 'average', 'good']]),
 'facility_rating': OrdinalEncoder(categories=[['low', 'medium', 'high']]),
 'exam_difficulty': OrdinalEncoder(categories=[['easy', 'moderate', 'hard']]),
 'course': OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False),
 'study_method': OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)}

In [18]:
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [19]:
import pickle

In [20]:
with open("ExamScorePrediction.pkl", "wb") as f:
    pickle.dump(
        {
            'model': linear,
            'encoders': encoders,
            'scaler': scaler
        }
        ,f)

In [21]:
pd.DataFrame(X_test_df1).to_csv("ExamScorePrediction-testdatascaled.csv", index=False)