Importando bibliotecas

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import chi2_contingency, spearmanr
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [45]:
df = pd.read_csv("D:\\github\\SIGAMA-data-analysis\\data\\dataset_normalizado_anonimizado.csv")

Chi-Square Test

In [46]:
df.columns = df.columns.str.strip().str.lower()

# Criar variável binária de aprovação
df["approved"] = (df["status"] == "APROVADO").astype(int)

In [47]:
contingency = pd.crosstab(df["tipo_de_conta"], df["approved"])

chi2, p, dof, expected = chi2_contingency(contingency)

print("Chi-square:", chi2)
print("p-value:", p)

Chi-square: 9.123288150594922
p-value: 0.0025237593125928787


Spearman Correlation

In [48]:
df = df.sort_values(["cpf_hash", "ano", "mês", "dia"])

df["attempt_number"] = df.groupby("cpf_hash").cumcount() + 1

df.head()

Unnamed: 0,status,tipo_de_conta,ano,mês,dia,cpf_hash,approved,attempt_number
1420,REPROVADO,Pessoa Física,2025,Dezembro,19,00225a26400ca5b3947a50f9484809f0935b28d319371e...,0,1
422,APROVADO,Pessoa Física,2025,Novembro,11,003b6930bfaeeedb089bd7c0cee082b7f18de35d448ffb...,1,1
1359,APROVADO,Pessoa Física,2025,Dezembro,10,004a67a521ac8b9c58ed216bd940fddc674cd2782f055f...,1,1
1496,REPROVADO,Pessoa Física,2025,Agosto,12,006794d3a86baa884268a3f5fa6375685dd85c7e375555...,0,1
329,APROVADO,Pessoa Física,2025,Outubro,15,006794d3a86baa884268a3f5fa6375685dd85c7e375555...,1,2


In [49]:
corr_day, p_day = spearmanr(df["dia"], df["approved"])
corr_attempt, p_attempt = spearmanr(df["attempt_number"], df["approved"])

print("Spearman Day vs Approval:", corr_day, p_day)
print("Spearman Attempt vs Approval:", corr_attempt, p_attempt)

Spearman Day vs Approval: 0.013453087783753146 0.564880457304036
Spearman Attempt vs Approval: -0.0679804273371657 0.0035927574888419593


Logistic Regression

In [50]:
X = df[["tipo_de_conta", "dia", "mês", "attempt_number"]]
y = df["approved"]

X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.47      0.12      0.19       225
           1       0.60      0.91      0.72       325

    accuracy                           0.59       550
   macro avg       0.54      0.51      0.45       550
weighted avg       0.55      0.59      0.50       550

