In [62]:
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [63]:
PROJECT_DIR = Path().resolve()

In [64]:
def categorical_gview(data: pd.DataFrame):
    Path("Graphic_var_view/Categorical").mkdir(parents=True, exist_ok=True)
    cat_vars = data.select_dtypes(include='object')
    for c in cat_vars:
        sns.countplot(data=data, x=c)
        plt.xticks(rotation=0)
        plt.show()
        plt.savefig(f"{PROJECT_DIR}/Graphic_var_view/Categorical/{c}.png", bbox_inches="tight")


In [65]:
def numeric_gview(data: pd.DataFrame):
    Path("Graphic_var_view/Numerical").mkdir(parents=True, exist_ok=True)
    num_vars = data.select_dtypes(include='number')
    for c in num_vars:
        sns.boxplot(data=data, y=c)
        plt.xticks(rotation=0)
        plt.show()
        plt.savefig(f"{PROJECT_DIR}/Graphic_var_view/Numerical/{c}.png", bbox_inches="tight")

In [66]:
df_train = pd.read_csv("train.csv")

In [67]:
df_test = pd.read_csv('test.csv')

In [68]:
df_train['study_sleep_coef'] = df_train['study_hours']/df_train['sleep_hours']

In [69]:
df_train.select_dtypes(include='number').corr()

Unnamed: 0,id,age,study_hours,class_attendance,sleep_hours,exam_score,study_sleep_coef
id,1.0,-0.000581,0.000346,0.000677,0.001416,0.000372,-0.00015
age,-0.000581,1.0,0.007545,0.005628,0.005864,0.010472,0.005523
study_hours,0.000346,0.007545,1.0,0.087617,0.042491,0.762267,0.81741
class_attendance,0.000677,0.005628,0.087617,1.0,0.029263,0.360954,0.387952
sleep_hours,0.001416,0.005864,0.042491,0.029263,1.0,0.16741,-0.318714
exam_score,0.000372,0.010472,0.762267,0.360954,0.16741,1.0,0.669976
study_sleep_coef,-0.00015,0.005523,0.81741,0.387952,-0.318714,0.669976,1.0


In [70]:
X = df_train.drop('exam_score', axis=1)
y = df_train['exam_score']

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40028922)

In [72]:
cat_features = X_train.select_dtypes(include='object').columns.tolist()

In [73]:
model = CatBoostRegressor(
    iterations=500, 
    learning_rate=0.05,
    depth=7,
    loss_function="RMSE",
    verbose=100)

In [74]:
model.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_test, y_test), use_best_model=True)

0:	learn: 18.2892206	test: 18.2925366	best: 18.2925366 (0)	total: 535ms	remaining: 4m 27s
100:	learn: 8.8731382	test: 8.8411838	best: 8.8411838 (100)	total: 33s	remaining: 2m 10s
200:	learn: 8.8380467	test: 8.8112321	best: 8.8112321 (200)	total: 59.2s	remaining: 1m 28s
300:	learn: 8.8210852	test: 8.7987573	best: 8.7987573 (300)	total: 1m 25s	remaining: 56.6s
400:	learn: 8.8040680	test: 8.7863648	best: 8.7863648 (400)	total: 1m 54s	remaining: 28.3s
499:	learn: 8.7923219	test: 8.7791919	best: 8.7791919 (499)	total: 2m 22s	remaining: 0us

bestTest = 8.779191932
bestIteration = 499



<catboost.core.CatBoostRegressor at 0x1bce1b7ed70>

In [75]:
y_pred = model.predict(X_test)

**BestTest sem class_attendance = 8.778815002**

**BestTest com class_attendance = 8.7791919**


In [76]:
df_train.head(1)

Unnamed: 0,id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score,study_sleep_coef
0,0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.3,1.594914


In [77]:
df_train.columns

Index(['id', 'age', 'gender', 'course', 'study_hours', 'class_attendance',
       'internet_access', 'sleep_hours', 'sleep_quality', 'study_method',
       'facility_rating', 'exam_difficulty', 'exam_score', 'study_sleep_coef'],
      dtype='object')

In [2]:
import torch

print(torch.__version__)
print(torch.cuda.is_available())

2.9.1+cpu
False
