In [32]:
import warnings
warnings.filterwarnings('ignore')

In [33]:
# Data loading
import pandas as pd

# Name of the column the models are trained to predict.
DATASET_TARGET = 'Exam_Score'
# Location of the CSV file holding the raw records (relative to the notebook).
DATASET_PATH = 'StudentPerformanceFactors.csv'

# Load the full CSV into a single DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)

df

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,25,69,High,Medium,No,7,76,Medium,Yes,1,High,Medium,Public,Positive,2,No,High School,Near,Female,68
6603,23,76,High,Medium,No,8,81,Medium,Yes,3,Low,High,Public,Positive,2,No,High School,Near,Female,69
6604,20,90,Medium,Low,Yes,6,65,Low,Yes,3,Low,Medium,Public,Negative,2,No,Postgraduate,Near,Female,68
6605,10,86,High,High,Yes,6,91,High,Yes,2,Low,Medium,Private,Positive,3,No,High School,Far,Female,68


In [34]:
# Classify every column of `df` by how it will be encoded later on.
#
# Produces:
#   categorical_vars - every text column (superset of the three lists below)
#   binary_vars      - text columns with exactly 2 observed labels
#   ordinal_vars     - text columns with at most 3 (but not 2) observed labels
#   nominal_vars     - text columns with more than 3 observed labels
#   continuous_vars  - every remaining (non-text) column, target included
#   df_cols_dtype    - one summary row per column (name, type, labels)
categorical_vars = []
continuous_vars = []
binary_vars = []
ordinal_vars = []
nominal_vars = []
dtype = []

for col in df.columns:
    column = df[col]
    unique = None

    # Text columns are stored as `object` by default, but may also use a
    # dedicated string dtype depending on the pandas version/configuration.
    is_text = (pd.api.types.is_object_dtype(column)
               or pd.api.types.is_string_dtype(column))

    if is_text:
        categorical_vars.append(col)
        # Missing values are not a category: drop them so that the reported
        # labels and their count always agree with each other.
        unique = column.dropna().unique().tolist()
        unique_count = len(unique)

        if unique_count == 2:
            binary_vars.append(col)
            cols_type = 'Binary'
        elif unique_count <= 3:
            ordinal_vars.append(col)
            cols_type = 'Ordinal'
        else:
            # Previously these columns received no type at all and silently
            # fell out of every encoder list; track them explicitly.
            nominal_vars.append(col)
            cols_type = 'Nominal'
    else:
        # Continuous variables
        continuous_vars.append(col)
        cols_type = 'Continuous'

    dtype.append({
        "Name": col,
        "Type": cols_type,
        "Unique value": unique
    })

df_cols_dtype = pd.DataFrame(dtype)

df_cols_dtype

Unnamed: 0,Name,Type,Unique value
0,Hours_Studied,Continuous,
1,Attendance,Continuous,
2,Parental_Involvement,Ordinal,"[Low, Medium, High]"
3,Access_to_Resources,Ordinal,"[High, Medium, Low]"
4,Extracurricular_Activities,Binary,"[No, Yes]"
5,Sleep_Hours,Continuous,
6,Previous_Scores,Continuous,
7,Motivation_Level,Ordinal,"[Low, Medium, High]"
8,Internet_Access,Binary,"[Yes, No]"
9,Tutoring_Sessions,Continuous,


In [35]:
# Discard rows containing any missing value, then rows that exactly repeat an
# earlier one; the cleaned frame stays bound to the same `df` name.
df = df.dropna().drop_duplicates()

In [36]:
# Cap the target: any score recorded above 100 is overwritten with 100.
above_cap = df[DATASET_TARGET] > 100
df.loc[above_cap, DATASET_TARGET] = 100

In [37]:
# Target vector: the exam score column on its own.
y = df[DATASET_TARGET]
# Feature matrix: every column except the target.
X = df.drop(DATASET_TARGET, axis=1)

display(X, y)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,25,69,High,Medium,No,7,76,Medium,Yes,1,High,Medium,Public,Positive,2,No,High School,Near,Female
6603,23,76,High,Medium,No,8,81,Medium,Yes,3,Low,High,Public,Positive,2,No,High School,Near,Female
6604,20,90,Medium,Low,Yes,6,65,Low,Yes,3,Low,Medium,Public,Negative,2,No,Postgraduate,Near,Female
6605,10,86,High,High,Yes,6,91,High,Yes,2,Low,Medium,Private,Positive,3,No,High School,Far,Female


0       67
1       61
2       74
3       71
4       70
        ..
6602    68
6603    69
6604    68
6605    68
6606    64
Name: Exam_Score, Length: 6378, dtype: int64

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import yeojohnson
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

class SkewKurtTransformer(BaseEstimator, TransformerMixin):
    """Pick, per column, the transform that best normalises its distribution.

    For every selected column ``fit`` scores three candidates -- the raw data,
    a (shifted) natural log and a Yeo-Johnson power transform -- with

        weight_skew * |skew| + weight_kurt * |kurtosis - kurt_target|

    and remembers the lowest-scoring one. ``transform`` then re-applies the
    remembered transform with the parameters learned during ``fit``.

    Parameters
    ----------
    columns : list or None
        Columns to consider. ``None`` selects every numeric column of the
        frame passed to ``fit``.
    skew_threshold : float
        Columns whose absolute skewness is already below this value are left
        untouched and no candidate is evaluated.
    kurt_target : float
        Reference kurtosis on the Pearson scale (3 for a normal distribution).
    weight_skew, weight_kurt : float
        Relative importance of the skewness and kurtosis terms in the score.

    Attributes
    ----------
    columns_ : list
        Columns actually examined during ``fit``.
    methods_ : dict
        Column -> ``"none"``, ``"log"`` or ``"yeojohnson"``.
    params_ : dict
        Column -> parameters of the chosen method (``{"shift": ...}`` for the
        log, ``{"lambda": ...}`` for Yeo-Johnson, ``{}`` otherwise).
    stats_ : dict
        Column -> ``{"skew": ..., "kurt": ...}`` of the winning candidate,
        with kurtosis expressed on the Pearson scale.
    """

    def __init__(self, columns=None, skew_threshold=0.5, kurt_target=3.0,
                 weight_skew=1.0, weight_kurt=0.3):
        self.columns = columns
        self.skew_threshold = skew_threshold
        self.kurt_target = kurt_target
        self.weight_skew = weight_skew
        self.weight_kurt = weight_kurt

    def _score(self, series):
        """Return ``(score, skew, kurtosis)`` for one candidate series.

        Missing values are ignored. Series with fewer than three remaining
        values cannot be scored and yield ``(inf, inf, inf)``.
        """
        s = series.dropna()
        if len(s) < 3:
            return np.inf, np.inf, np.inf
        skew = s.skew()
        # pandas reports *excess* (Fisher) kurtosis, for which a normal
        # distribution scores 0. kurt_target is expressed on the Pearson scale
        # (normal == 3), so shift by 3 before comparing the two.
        kurt = s.kurtosis() + 3.0
        return (self.weight_skew * abs(skew)
                + self.weight_kurt * abs(kurt - self.kurt_target)), skew, kurt

    def fit(self, X, y=None):
        """Choose and store the best transform for every selected column.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        import warnings

        X = pd.DataFrame(X).copy()

        # choose columns
        if self.columns is None:
            self.columns_ = X.select_dtypes(include="number").columns.tolist()
        else:
            self.columns_ = list(self.columns)

        self.methods_ = {}
        self.params_ = {}
        self.stats_ = {}

        for col in self.columns_:
            data = X[col].astype(float)

            # baseline: the untransformed column
            best_score, best_skew, best_kurt = self._score(data)
            best_method = "none"
            best_params = {}

            # Only columns that are noticeably skewed get candidate transforms.
            if abs(best_skew) >= self.skew_threshold:
                # candidate 1: log, shifted so the smallest value maps to 1
                # whenever the column contains non-positive values
                d = data.copy()
                shift = 0.0
                if d.min() <= 0:
                    shift = abs(d.min()) + 1.0
                    d = d + shift
                d_log = np.log(d)
                score, s, k = self._score(d_log)
                if score < best_score:
                    best_score, best_skew, best_kurt = score, s, k
                    best_method = "log"
                    best_params = {"shift": shift}

                # candidate 2: yeo-johnson, fitted on the non-missing values
                d_nonnull = data.dropna()
                if len(d_nonnull) > 0:
                    try:
                        yj_vals, lam = yeojohnson(d_nonnull)
                        d_yj = pd.Series(
                            yj_vals, index=d_nonnull.index
                        ).reindex(data.index)
                        score, s, k = self._score(d_yj)
                        if score < best_score:
                            best_score, best_skew, best_kurt = score, s, k
                            best_method = "yeojohnson"
                            best_params = {"lambda": lam}
                    except Exception as exc:
                        # Best effort: a failing candidate is skipped, but the
                        # failure is reported instead of being swallowed.
                        warnings.warn(
                            f"Yeo-Johnson candidate skipped for column "
                            f"{col!r}: {exc}",
                            RuntimeWarning,
                        )

            # store winner for this column
            self.methods_[col] = best_method
            self.params_[col] = best_params
            self.stats_[col] = {"skew": best_skew, "kurt": best_kurt}

        return self

    def transform(self, X):
        """Apply the transforms chosen in ``fit``; returns a new DataFrame.

        Columns whose chosen method is ``"none"`` are returned unchanged.
        """
        check_is_fitted(self, ["methods_", "params_", "columns_"])
        X = pd.DataFrame(X).copy()

        for col in self.columns_:
            method = self.methods_.get(col, "none")
            if method == "none":
                continue

            data = X[col].astype(float)

            if method == "log":
                shift = self.params_[col].get("shift", 0.0)
                # NOTE(review): values smaller than anything seen in fit can
                # make `data + shift` non-positive, so the log yields NaN or
                # -inf; confirm whether such inputs can occur.
                X[col] = np.log(data + shift)

            elif method == "yeojohnson":
                lam = self.params_[col]["lambda"]
                # re-use the lambda estimated in fit rather than re-fitting
                X[col] = yeojohnson(data, lmbda=lam)

        return X

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.decomposition import SparsePCA

# Numeric feature columns; the target is excluded so it is never scaled or
# fed back in as a feature.
cont_cols = [c for c in continuous_vars if c != DATASET_TARGET]

# Known low-to-high orderings for the three-level text columns. Without an
# explicit order OrdinalEncoder sorts labels alphabetically (for example
# "High" < "Low" < "Medium"), which scrambles the ranking the encoding is
# supposed to preserve.
ORDINAL_LEVELS = (
    ["Low", "Medium", "High"],
    ["Negative", "Neutral", "Positive"],
    ["High School", "College", "Postgraduate"],
    ["Near", "Moderate", "Far"],
)


def _ordered_levels(column):
    """Return the category order to use for one ordinal column.

    The first known ordering that covers every observed label wins; columns
    with unrecognised labels fall back to sorted order, which is what
    OrdinalEncoder would pick on its own.
    """
    observed = set(column.dropna().unique())
    for levels in ORDINAL_LEVELS:
        if observed <= set(levels):
            return list(levels)
    return sorted(observed)


# One category list per ordinal column, in the same order as `ordinal_vars`.
ordinal_categories = [_ordered_levels(X[col]) for col in ordinal_vars]

# Linear models: reduce skew/kurtosis first, then standardise.
continuous_transformer_linear = Pipeline(steps=[
    ("skew_kurt", SkewKurtTransformer(columns=cont_cols,
                                      skew_threshold=0.5,
                                      weight_skew=1.0,
                                      weight_kurt=0.3)),
    ("scaler", StandardScaler())
])

# Non-linear models: standardisation only.
continuous_transformer_non_linear = Pipeline(steps=[
    ("scaler", StandardScaler())
])

onehot_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# The step keeps its historical name "scaler" so existing parameter paths
# (e.g. `scaler__...`) stay valid, even though it holds an encoder.
ordinal_transformer = Pipeline(steps=[
    ("scaler", OrdinalEncoder(categories=ordinal_categories or "auto"))
])


def _make_preprocessor(continuous_transformer):
    """Build the ColumnTransformer shared by both model families.

    Only the treatment of the continuous columns differs between them;
    columns listed in none of the three groups are dropped.
    """
    return ColumnTransformer(
        transformers=[
            ("cont", continuous_transformer, cont_cols),
            ("onehot", onehot_transformer, binary_vars),
            ("ordinal", ordinal_transformer, ordinal_vars),
        ],
        remainder="drop",
    )


preprocessor_linear = _make_preprocessor(continuous_transformer_linear)

preprocessor_non_linear = _make_preprocessor(continuous_transformer_non_linear)

pca = SparsePCA(
        n_components=10,
        alpha=1.0,
        random_state=42,
        max_iter=1000,
        n_jobs=-1,
    )

In [47]:
from sklearn.model_selection import train_test_split
# Split first, with a fixed seed, so the reported metrics are reproducible
# and the held-out rows never influence any fitted preprocessing statistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn scaling/encoding parameters from the training rows only, then apply
# them unchanged to both splits.
preprocessor_non_linear.fit(X_train)
X_train_processed = preprocessor_non_linear.transform(X_train)
X_test_processed = preprocessor_non_linear.transform(X_test)

# Kept so any code still expecting `X_processed` keeps working (no cell
# visible here reads it); it is now produced by the train-fitted preprocessor.
X_processed = preprocessor_non_linear.transform(X)

In [None]:
from catboost import CatBoostRegressor, Pool

# Hyper-parameters of the gradient-boosted regressor.
catboost_params = {
    "loss_function": "RMSE",
    "depth": 6,
    "learning_rate": 0.1,
    "iterations": 500,
}
catboost = CatBoostRegressor(**catboost_params)

# The raw feature frame is passed in as-is; the text columns collected in
# `categorical_vars` are declared as categorical features.
pool_train = Pool(
    label=y_train,
    data=X_train,
    cat_features=categorical_vars,
)
catboost.fit(pool_train)


0:	learn: 3.7104817	total: 212ms	remaining: 1m 45s
1:	learn: 3.5475065	total: 278ms	remaining: 1m 9s
2:	learn: 3.4132759	total: 348ms	remaining: 57.7s
3:	learn: 3.3175770	total: 405ms	remaining: 50.3s
4:	learn: 3.2213100	total: 478ms	remaining: 47.3s
5:	learn: 3.1347234	total: 591ms	remaining: 48.7s
6:	learn: 3.0497024	total: 662ms	remaining: 46.6s
7:	learn: 2.9874399	total: 748ms	remaining: 46s
8:	learn: 2.9112387	total: 819ms	remaining: 44.7s
9:	learn: 2.8491975	total: 892ms	remaining: 43.7s
10:	learn: 2.8004900	total: 964ms	remaining: 42.8s
11:	learn: 2.7457648	total: 1.1s	remaining: 44.9s
12:	learn: 2.7041625	total: 1.19s	remaining: 44.4s
13:	learn: 2.6643886	total: 1.32s	remaining: 45.8s
14:	learn: 2.6343705	total: 1.4s	remaining: 45.4s
15:	learn: 2.6012593	total: 1.48s	remaining: 44.9s
16:	learn: 2.5688691	total: 1.65s	remaining: 46.9s
17:	learn: 2.5418407	total: 1.75s	remaining: 47s
18:	learn: 2.5157816	total: 1.88s	remaining: 47.5s
19:	learn: 2.4906267	total: 1.96s	remaining: 4

<catboost.core.CatBoostRegressor at 0x191a2cac410>

In [45]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
def evaluate_model(true, predicted):
    """Summarise regression quality for one set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        and coefficient of determination, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is computed once and only used to derive the RMSE.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [46]:
# Score the fitted model on the held-out split; prints the (MAE, RMSE, R2)
# tuple returned by evaluate_model.
test_predictions = catboost.predict(X_test)
print(evaluate_model(y_test, test_predictions))

(0.6428001555165848, np.float64(2.1650109260678243), 0.7086440964778449)
