In [17]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from catboost import CatBoostClassifier

# 1. Carrega os dados
df = pd.read_csv('../docs/dados.csv')

# 2. Separa X e y
X = df.drop(['Unnamed: 0', 'notas'], axis=1)
y = df['notas'].astype(int)

# 3. Lista de colunas categóricas
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# 4. Define o modelo
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    loss_function='MultiClass',    # classificador 5‑classes
    eval_metric='Accuracy',        # ou 'WKappa'
    cat_features=cat_cols,
    random_seed=42,
    verbose=100
)

# 5. Validação cruzada estratificada
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    model, X, y,
    cv=cv,
    scoring='neg_mean_absolute_error'
)

print("MAE médio:", -scores.mean())

0:	learn: 0.4125000	total: 58.8ms	remaining: 58.8s
100:	learn: 0.4437500	total: 1.41s	remaining: 12.5s
200:	learn: 0.6975000	total: 2.69s	remaining: 10.7s
300:	learn: 0.9325000	total: 3.95s	remaining: 9.16s
400:	learn: 0.9875000	total: 5.23s	remaining: 7.81s
500:	learn: 0.9962500	total: 6.51s	remaining: 6.48s
600:	learn: 0.9987500	total: 7.75s	remaining: 5.15s
700:	learn: 1.0000000	total: 8.96s	remaining: 3.82s
800:	learn: 1.0000000	total: 10.2s	remaining: 2.54s
900:	learn: 1.0000000	total: 11.5s	remaining: 1.26s
999:	learn: 1.0000000	total: 12.7s	remaining: 0us
0:	learn: 0.4125000	total: 776us	remaining: 776ms
100:	learn: 0.4662500	total: 1.2s	remaining: 10.7s
200:	learn: 0.7250000	total: 2.47s	remaining: 9.81s
300:	learn: 0.9325000	total: 3.72s	remaining: 8.63s
400:	learn: 0.9912500	total: 4.97s	remaining: 7.43s
500:	learn: 1.0000000	total: 6.22s	remaining: 6.2s
600:	learn: 1.0000000	total: 7.47s	remaining: 4.96s
700:	learn: 1.0000000	total: 8.75s	remaining: 3.73s
800:	learn: 1.00000

In [20]:
from sklearn.model_selection import KFold, cross_val_score
from catboost import CatBoostRegressor
import pandas as pd

df = pd.read_csv('../docs/dados.csv')
X = df.drop(['Unnamed: 0', 'notas'], axis=1)
y = df['notas']  # mantém float ou int, ambos funcionam

cat_cols = X.select_dtypes(include=['object']).columns.tolist()

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    loss_function='MAE',       # otimiza diretamente o MAE
    cat_features=cat_cols,
    random_seed=42,
    verbose=100
)

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    model, X, y,
    cv=cv,
    scoring='neg_mean_absolute_error'
)

print("MAE médio (regressor):", -scores.mean())

0:	learn: 0.8514367	total: 13.9ms	remaining: 13.9s
100:	learn: 0.7447874	total: 347ms	remaining: 3.09s
200:	learn: 0.6223942	total: 692ms	remaining: 2.75s
300:	learn: 0.5272827	total: 1.03s	remaining: 2.38s
400:	learn: 0.4705013	total: 1.35s	remaining: 2.01s
500:	learn: 0.4262035	total: 1.68s	remaining: 1.67s
600:	learn: 0.3892909	total: 2.02s	remaining: 1.34s
700:	learn: 0.3601218	total: 2.34s	remaining: 1000ms
800:	learn: 0.3366896	total: 2.67s	remaining: 664ms
900:	learn: 0.3176017	total: 3.01s	remaining: 331ms
999:	learn: 0.2978284	total: 3.34s	remaining: 0us
0:	learn: 0.8840617	total: 3.21ms	remaining: 3.21s
100:	learn: 0.7754473	total: 276ms	remaining: 2.45s
200:	learn: 0.6834782	total: 589ms	remaining: 2.34s
300:	learn: 0.6030060	total: 915ms	remaining: 2.12s
400:	learn: 0.5307996	total: 1.24s	remaining: 1.86s
500:	learn: 0.4864610	total: 1.58s	remaining: 1.57s
600:	learn: 0.4493609	total: 1.9s	remaining: 1.26s
700:	learn: 0.4233076	total: 2.23s	remaining: 952ms
800:	learn: 0.39

## Modelo Classifier apresentou um MAE menor o tornando mais atrativo

In [21]:
# Testando hiperparametros

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from catboost import CatBoostClassifier

# 1. Carrega os dados
df = pd.read_csv('../docs/dados.csv')

# 2. Separa X e y
X = df.drop(['Unnamed: 0', 'notas'], axis=1)
y = df['notas'].astype(int)

# 3. Lista de colunas categóricas
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# 1) Defina o modelo base
model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='Accuracy',
    cat_features=cat_cols,
    random_seed=42,
    verbose=0
)

# 2) Escolha o espaço de busca
param_distribs = {
    'iterations':       [200, 500, 1000],
    'learning_rate':    [0.01, 0.03, 0.05, 0.1],
    'depth':            [4, 6, 8, 10],
    'l2_leaf_reg':      [1, 3, 5, 7, 9],
    'border_count':     [32, 64, 128]
}

# 3) Configure o RandomizedSearchCV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distribs,
    n_iter=30,                   # quantos conjuntos testar
    scoring='neg_mean_absolute_error',
    cv=cv,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# 4) Rode a busca
search.fit(X, y)

# 5) Veja os melhores parâmetros e o score
print("Melhores params:", search.best_params_)
print("Melhor MAE:", -search.best_score_)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END border_count=64, depth=4, iterations=1000, l2_leaf_reg=5, learning_rate=0.05; total time=  29.2s
[CV] END border_count=64, depth=4, iterations=1000, l2_leaf_reg=5, learning_rate=0.05; total time=  29.8s
[CV] END border_count=64, depth=4, iterations=1000, l2_leaf_reg=5, learning_rate=0.05; total time=  30.2s
[CV] END border_count=64, depth=4, iterations=1000, l2_leaf_reg=5, learning_rate=0.05; total time=  28.6s
[CV] END border_count=64, depth=4, iterations=1000, l2_leaf_reg=5, learning_rate=0.05; total time=  29.0s
[CV] END border_count=32, depth=4, iterations=1000, l2_leaf_reg=7, learning_rate=0.05; total time=  28.7s
[CV] END border_count=64, depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.01; total time= 1.0min
[CV] END border_count=64, depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.01; total time= 1.0min
[CV] END border_count=64, depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.01; tot



[CV] END border_count=32, depth=8, iterations=500, l2_leaf_reg=3, learning_rate=0.03; total time= 1.6min
[CV] END border_count=32, depth=8, iterations=500, l2_leaf_reg=3, learning_rate=0.03; total time= 1.6min
[CV] END border_count=64, depth=10, iterations=500, l2_leaf_reg=7, learning_rate=0.03; total time= 3.9min
[CV] END border_count=32, depth=8, iterations=500, l2_leaf_reg=3, learning_rate=0.03; total time= 1.6min
[CV] END border_count=64, depth=10, iterations=500, l2_leaf_reg=7, learning_rate=0.03; total time= 3.9min
[CV] END border_count=64, depth=10, iterations=500, l2_leaf_reg=7, learning_rate=0.03; total time= 3.9min
[CV] END border_count=64, depth=10, iterations=500, l2_leaf_reg=7, learning_rate=0.03; total time= 3.7min
[CV] END border_count=64, depth=10, iterations=500, l2_leaf_reg=7, learning_rate=0.03; total time= 3.7min
[CV] END border_count=64, depth=4, iterations=1000, l2_leaf_reg=7, learning_rate=0.01; total time=  32.5s
[CV] END border_count=64, depth=4, iterations=100

KeyboardInterrupt: 