In [18]:
import pandas as pd
from utils.categorical_encoders import CategoricalEncoders

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [15]:
# 1. Read the dataset (semicolon-separated CSV, decoded as Latin-1)
DATASET_PATH = 'data/dataset.csv'
dataset = pd.read_csv(DATASET_PATH, sep=';', encoding='latin-1')

# Class balance of the target column: absolute counts, then proportions.
print(
    dataset['Target'].value_counts(),
    dataset['Target'].value_counts(normalize=True),
    sep='\n',
)

Target
0    8137
1     363
Name: count, dtype: int64
Target
0    0.957294
1    0.042706
Name: proportion, dtype: float64


In [16]:
# 2. Get the binary and categorical columns
categorical = CategoricalEncoders(dataset=dataset)
binary_columns, categorical_columns = categorical.get_binary_categorical_columns()

# 3. Apply a categorical encoder.
#    `methods` holds the candidate encoder names; index 0 ('LabelEncoder') is the one used.
methods = [
    'LabelEncoder',
    'OneHotEncoder',
    'OrdinalEncoder',
    'FrequencyEncoder',
    'BinaryEncoder',
    'BackwardDifferenceEncoder',
]
data_encoded = categorical.provider(
    binary_columns,
    categorical_columns,
    method=methods[0],
)

# 4. Split the dataset into train and test.
#    y is kept as a one-column DataFrame; the split is stratified on it so both
#    partitions keep the same class ratio.
X = data_encoded.drop(columns=['Target'])
y = data_encoded[["Target"]]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Per-split class counts: train first, then test.
print(
    y_train['Target'].value_counts(),
    y_test['Target'].value_counts(),
    sep='\n',
)

Target
0    6103
1     272
Name: count, dtype: int64
Target
0    2034
1      91
Name: count, dtype: int64


In [20]:
# 5. Train a model and report ROC AUC on the train and test splits
# NOTE(review): project-local import kept in this cell so the cell stays
# self-contained; consider moving it to the notebook's imports cell.
from utils.base_models import BaseModels

base_models = BaseModels()
name_models = ['logistic_regression', 'decision_tree', 'random_forest',
               'gradient_boosting', 'svm', 'knn', 'naive_bayes', 'mlp',
               'lgbm','catboost', 'xgboost']

# Pick the model name once so the estimator that is trained and the label that
# is printed below can never drift apart when the index is changed.
model_name = name_models[0]
model = base_models.provider(model_name)

# Fit on the 1-D target column rather than the one-column DataFrame: a column
# vector `y` makes scikit-learn emit a DataConversionWarning (hidden here only
# because warnings are globally silenced) before flattening it internally.
model.fit(X_train, y_train['Target'])

# Column 1 of predict_proba is taken as the score of the positive class
# (Target == 1), assuming the scikit-learn convention of sorted class labels.
predict_train = model.predict_proba(X_train)[:, 1]
predict_test = model.predict_proba(X_test)[:, 1]


print("auc on training in {} data : {:.3f}"
      .format(model_name, roc_auc_score(y_train['Target'], predict_train)))
print("auc on testing in {}  data : {:.3f}"
      .format(model_name, roc_auc_score(y_test['Target'], predict_test)))

auc on training in LogisticRegression data : 0.726
auc on testing in LogisticRegression  data : 0.711
