## Imports

In [1]:
!pip list | grep -E "pandas|numpy|scikit-learn|joblib"

geopandas                                1.1.1
joblib                                   1.5.2
numpy                                    2.0.2
pandas                                   2.2.2
pandas-datareader                        0.10.0
pandas-gbq                               0.30.0
pandas-stubs                             2.2.2.240909
scikit-learn                             1.6.1
sklearn-pandas                           2.2.0


In [23]:
!pip install xgboost



In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [54]:
import xgboost as xgb

## Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
path = "/content/drive/MyDrive/Projetos"
df = pd.read_csv(f'{path}/data/raw/train.csv')

In [5]:
df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [6]:
df.shape

(200000, 202)

In [7]:
df = df.drop('ID_code', axis=1)

In [8]:
all(df.isnull().sum()) #verifica se tem valores nulos

False

In [9]:
X, y = df.drop('target', axis=1), df['target']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [36]:
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

import os
processed_data_path = f'{path}/data/processed'
os.makedirs(processed_data_path, exist_ok=True)

df_train.to_csv(f'{processed_data_path}/train_split.csv', index=False)
df_test.to_csv(f'{processed_data_path}/test_split.csv', index=False)

print(f'train_split.csv saved to {processed_data_path}/train_split.csv')
print(f'test_split.csv saved to {processed_data_path}/test_split.csv')

train_split.csv saved to /content/drive/MyDrive/Projetos/data/processed/train_split.csv
test_split.csv saved to /content/drive/MyDrive/Projetos/data/processed/test_split.csv


## Redução de dimensionalidade

In [16]:
pca = PCA(n_components=0.95)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

X_reduced = pca.fit_transform(X_scaled)

In [17]:
X_reduced.shape

(160000, 190)

In [18]:
componentes = pca.components_
componentes.shape

(190, 200)

In [19]:
joblib.dump(scaler, f'{path}/models/scaler.pkl')
joblib.dump(pca, f'{path}/models/pca.pkl')

['/content/drive/MyDrive/Projetos/models/pca.pkl']

## Treinamento

In [48]:
clf = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=20,
    device='cuda',
    tree_method='hist',
    random_state=0
).fit(X_reduced, y_train)

### Predição

In [49]:
X_test_scaled = scaler.transform(X_test)
X_test_reduced = pca.transform(X_test_scaled)
predictions = clf.predict(X_test_reduced)

In [50]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

In [51]:
matrix = confusion_matrix(y_test, predictions)
report = classification_report(y_test, predictions)
print('Confusion Matrix:')
print(matrix)
print('\nClassification Report:')
print(report)

Confusion Matrix:
[[35472   514]
 [ 2933  1081]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     35986
           1       0.68      0.27      0.39      4014

    accuracy                           0.91     40000
   macro avg       0.80      0.63      0.67     40000
weighted avg       0.90      0.91      0.90     40000



In [52]:
accuracy = accuracy_score(y_test, predictions)

roc_auc = roc_auc_score(y_test, predictions)

print(f'Accuracy: {accuracy:.2f}')
print(f'ROC AUC Score: {roc_auc:.2f}')

Accuracy: 0.91
ROC AUC Score: 0.63


In [53]:
joblib.dump(clf, f'{path}/models/xgb_model.pkl') # exporta o modelo

['/content/drive/MyDrive/Projetos/models/xgb_model.pkl']