In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from IPython.display import display

import warnings 
# NOTE(review): this silences every warning for the rest of the session,
# including any pandas / scikit-learn warnings raised by later cells.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw dataset; the path is relative to the current working directory.
df = pd.read_csv("Exam_Score_Prediction.csv")

In [3]:
# Remove the identifier column so it is not carried into the feature matrix.
df = df.drop(columns=["student_id"])

In [4]:
# Encode internet access numerically: "yes" -> 1, "no" -> 2.
# Series.map with a dict turns any value not listed here into NaN.
df["internet_access"] = df["internet_access"].map({"yes": 1, "no": 2})

In [5]:
# Keep only rows whose gender is "male" or "female"; every other value
# (including missing ones) is filtered out.
# .copy() makes the result an independent frame, so later column assignments
# on `df` do not write into a filtered selection of the original
# (the SettingWithCopy pattern).
df = df[df["gender"].isin(["male", "female"])].copy()

In [6]:
# Encode gender numerically: "male" -> 0, "female" -> 1.
df["gender"] = df["gender"].map({"male": 0, "female": 1})

In [7]:
# Separate the prediction target from the predictor columns.
y = df["exam_score"]
X = df.drop(columns=["exam_score"])

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=25,
)

# Show the shape of every split as a quick sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((10619, 11), (2655, 11), (10619,), (2655,))

In [9]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [10]:
# Fitted encoders are collected here, keyed by the column they were fitted on.
encoders = {}

# Nominal columns: one-hot encode. Each encoder is fitted on the training
# split only and then applied unchanged to the test split.
for col in ["course", "study_method"]:
    encoders[col] = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    train_encoded = encoders[col].fit_transform(X_train[[col]])
    test_encoded = encoders[col].transform(X_test[[col]])

    # Indicator column names come from the encoder, with spaces replaced by
    # underscores.
    new_cols = [
        name.replace(" ", "_")
        for name in encoders[col].get_feature_names_out([col])
    ]

    # Swap the raw column for its indicator columns in both splits.
    X_train = X_train.drop(columns=[col])
    X_train[new_cols] = train_encoded
    X_test = X_test.drop(columns=[col])
    X_test[new_cols] = test_encoded

# Remaining categorical columns: ordinal-encode using the explicit category
# order listed for each column.
for ordinal_col, levels in [
    ("sleep_quality", ["poor", "average", "good"]),
    ("facility_rating", ["low", "medium", "high"]),
    ("exam_difficulty", ["easy", "moderate", "hard"]),
]:
    encoders[ordinal_col] = OrdinalEncoder(categories=[levels])
    X_train[ordinal_col] = encoders[ordinal_col].fit_transform(X_train[[ordinal_col]])
    X_test[ordinal_col] = encoders[ordinal_col].transform(X_test[[ordinal_col]])

In [11]:
# Standardise the continuous features. The scaler is fitted on the training
# split only and the same fitted scaler is then applied to both splits.
scaling_cols = ["age", "study_hours", "class_attendance", "sleep_hours"]
scaler = StandardScaler().fit(X_train[scaling_cols])
X_train[scaling_cols] = scaler.transform(X_train[scaling_cols])
X_test[scaling_cols] = scaler.transform(X_test[scaling_cols])

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [13]:
# Display the fitted scaler's repr.
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [14]:
# Column names the scaler saw when it was fitted.
scaler.feature_names_in_

array(['age', 'study_hours', 'class_attendance', 'sleep_hours'],
      dtype=object)

In [15]:
# Inspect the training features after encoding and scaling.
X_train

Unnamed: 0,age,gender,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,facility_rating,exam_difficulty,course_b.com,...,course_b.tech,course_ba,course_bba,course_bca,course_diploma,study_method_coaching,study_method_group_study,study_method_mixed,study_method_online_videos,study_method_self-study
13017,0.232212,1,1.225681,1.615560,1,-0.289100,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6685,-1.079183,0,1.542858,-1.339835,2,-0.231162,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
10420,1.543607,1,-0.729520,-0.264095,1,1.333159,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7654,-1.079183,1,-1.637601,0.215940,1,0.232340,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5508,0.232212,1,1.382097,-1.345619,2,-1.563732,2.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12767,1.106475,0,1.638445,1.684962,1,0.695843,2.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4453,0.669344,0,-0.881591,-0.397116,2,0.348216,2.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15665,-1.516314,1,1.156162,1.615560,1,-0.462913,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
9987,-0.642051,1,-1.042352,0.823213,1,-0.926416,1.0,2.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [16]:
# Fit an ordinary least-squares model on the training split.
linear = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and report the coefficient of determination.
y_pred = linear.predict(X_test)
score = r2_score(y_true=y_test, y_pred=y_pred)
print("R2 Score: ", score)

R2 Score:  0.7299157107947192


In [17]:
# Display the dict of fitted encoders, keyed by column name.
encoders

{'course': OneHotEncoder(handle_unknown='ignore', sparse_output=False),
 'study_method': OneHotEncoder(handle_unknown='ignore', sparse_output=False),
 'sleep_quality': OrdinalEncoder(categories=[['poor', 'average', 'good']]),
 'facility_rating': OrdinalEncoder(categories=[['low', 'medium', 'high']]),
 'exam_difficulty': OrdinalEncoder(categories=[['easy', 'moderate', 'hard']])}

In [18]:
# Display the scaler's repr.
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [19]:
import pickle

In [20]:
# Persist the model together with the fitted encoders and scaler in a single
# pickle file.
with open("ExamScorePrediction.pkl", "wb") as f:
    pickle.dump({"model": linear, "encoders": encoders, "scaler": scaler}, f)

In [21]:
# Write the encoded and scaled test features to CSV; index=False leaves the
# row index out of the file.
X_test.to_csv("ExamScorePrediction-testdatascaled.csv", index=False)