In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [22]:
def feature_changer(df):
    df['study_sleeping_range'] = (df['study_hours'] * df['sleep_hours']/8)
    df['sleep_quality_ord'] = df['sleep_quality'].map({
        'poor': 1,
        'average': 2,
        'good': 3})
    
    df['facility_ord'] = df['facility_rating'].map({
        'low': 1,
        'medium': 2,
        'high': 3
        })
    df['exam_difficulty_order'] = df['exam_difficulty'].map({
        'easy': 1,
        'moderate': 2,
        'hard': 3
    })
        
    return df


def loadmodel():
    model = CatBoostRegressor()
    model.load_model("catboost_best.cbm")
    params = model.get_params()
    return model, params


def get_dfs(df):
    X_full = df.drop(['exam_score', 'id'], axis=1)
    y_full = df['exam_score']
    cat_vars = X_full.select_dtypes(exclude="number").columns.tolist()
    return X_full, y_full, cat_vars


In [5]:
def start_model(X, y, cat_features):
    final_model = CatBoostRegressor(**params)
    final_model.fit(X, y, verbose=100, cat_features=cat_vars)
    return final_model

In [8]:
model, params = loadmodel()

In [9]:
df = pd.read_csv('train.csv')

In [10]:
df = feature_changer(df)

In [11]:
X_full, y_full, cat_vars = get_dfs(df)

In [12]:
model = start_model(X_full, y_full, cat_vars)

0:	learn: 18.4130546	total: 406ms	remaining: 21m 19s
100:	learn: 8.9298642	total: 31.6s	remaining: 15m 54s
200:	learn: 8.8499538	total: 58.1s	remaining: 14m 14s
300:	learn: 8.8342752	total: 1m 24s	remaining: 13m 20s
400:	learn: 8.8214335	total: 1m 51s	remaining: 12m 45s
500:	learn: 8.8126066	total: 2m 16s	remaining: 12m 5s
600:	learn: 8.8060792	total: 2m 41s	remaining: 11m 26s
700:	learn: 8.7997754	total: 3m 6s	remaining: 10m 54s
800:	learn: 8.7929899	total: 3m 34s	remaining: 10m 30s
900:	learn: 8.7874900	total: 4m	remaining: 10m
1000:	learn: 8.7821567	total: 4m 25s	remaining: 9m 32s
1100:	learn: 8.7771832	total: 4m 52s	remaining: 9m 5s
1200:	learn: 8.7724271	total: 5m 18s	remaining: 8m 39s
1300:	learn: 8.7680490	total: 5m 44s	remaining: 8m 11s
1400:	learn: 8.7636548	total: 6m 11s	remaining: 7m 45s
1500:	learn: 8.7592221	total: 6m 38s	remaining: 7m 19s
1600:	learn: 8.7550998	total: 7m 5s	remaining: 6m 53s
1700:	learn: 8.7511542	total: 7m 32s	remaining: 6m 27s
1800:	learn: 8.7477617	tot

In [13]:
# Modelo com 1100 iterations, study_hours com sleep quality order, facility order e exam_difficulty_ord RMSE = 8.777288


In [14]:
X_full['exam_difficulty'].value_counts()

exam_difficulty
moderate    353982
easy        176540
hard         99478
Name: count, dtype: int64

In [15]:
fi = model.get_feature_importance()
pd.Series(fi, index=X_full.columns).sort_values(ascending=False)

study_hours              30.836326
class_attendance         21.778427
study_sleeping_range     21.217599
sleep_quality_ord         9.779471
study_method              8.332998
facility_ord              6.417626
sleep_hours               0.830388
sleep_quality             0.243882
facility_rating           0.221533
course                    0.129051
age                       0.122264
exam_difficulty           0.033472
gender                    0.025092
internet_access           0.019912
exam_difficulty_order     0.011958
dtype: float64

In [18]:
model.save_model(fname='model_new_features.cbm')

In [19]:
feature_changer

<function __main__.feature_changer(df)>

In [20]:
import inspect

In [21]:
inspect.getsource(feature_changer)

"def feature_changer(df):\n    df['study_sleeping_range'] = (df['study_hours'] * df['sleep_hours']/8)\n    df['sleep_quality_ord'] = df['sleep_quality'].map({\n        'poor': 1,\n        'average': 2,\n        'good': 3})\n\n    df['facility_ord'] = df['facility_rating'].map({\n        'low': 1,\n        'medium': 2,\n        'high': 3\n        })\n    df['exam_difficulty_order'] = df['exam_difficulty'].map({\n        'easy': 1,\n        'moderate': 2,\n        'hard': 3\n    })\n\n    return df\n"