# Imports de bibliotecas

In [1]:
import kagglehub
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

# Dataset binario do Kaggle

In [2]:
path = kagglehub.dataset_download("rohitudageri/credit-card-details")

# Features e labels
df_features = pd.read_csv(os.path.join(path, "Credit_card.csv"))
df_labels   = pd.read_csv(os.path.join(path, "Credit_card_label.csv"))

# Juntar pelo Ind_ID
df = df_features.merge(df_labels, on="Ind_ID", how="inner")

print("Formato do dataset:", df.shape)
print("Colunas:", df.columns.tolist())
print("Distribuição da label:\n", df["label"].value_counts(normalize=True))

Formato do dataset: (1548, 19)
Colunas: ['Ind_ID', 'GENDER', 'Car_Owner', 'Propert_Owner', 'CHILDREN', 'Annual_income', 'Type_Income', 'EDUCATION', 'Marital_status', 'Housing_type', 'Birthday_count', 'Employed_days', 'Mobile_phone', 'Work_Phone', 'Phone', 'EMAIL_ID', 'Type_Occupation', 'Family_Members', 'label']
Distribuição da label:
 label
0    0.886951
1    0.113049
Name: proportion, dtype: float64


# Pre processamento de dados

In [3]:
X = df.drop(columns=["Ind_ID", "label"])
y = df["label"].values

# Codificar variáveis categóricas
for col in X.select_dtypes(include="object").columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Normalizar variáveis numéricas
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split treino/teste
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Definindo Metricas

In [4]:
def f1_metric(y_true, y_pred):
    y_true = K.cast(y_true, "float32")
    y_pred_bin = K.round(y_pred)

    tp = K.sum(K.cast(y_true * y_pred_bin, "float32"))
    fp = K.sum(K.cast((1 - y_true) * y_pred_bin, "float32"))
    fn = K.sum(K.cast(y_true * (1 - y_pred_bin), "float32"))

    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())
    f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1


# Construcao do modelo

In [5]:
model = Sequential()
model.add(Dense(1, activation="sigmoid", input_shape=(X_train.shape[1],)))

model.compile(
    optimizer=Adam(),
    loss="binary_crossentropy",
    metrics=["accuracy", f1_metric]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


# Treinamento

In [6]:
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=10,
    validation_split=0.2,
    class_weight={0:1, 1:10},
    verbose=0
)

# Teste do Modelo

In [7]:
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Acurácia no teste:", acc)
print("F1 no teste:", f1)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Acurácia no teste: 0.8870967741935484
F1 no teste: 0.0


#Conclusoes

Como o F1 deu zero, isso significa que o modelo nao conseguiu prever a minoria dos labels - isso acontece pq o modelo esta desbalanceado, para resolver isso teriamos que rebalancear o dataset.

Eu adicionei o parametro class weight do keras para ele dar mais valor para a classe minoritaria - mas mesmo assim nao teve resultado - aqui o necessario seria rebalancear o dataset.