In [None]:
import re
import polars as pl
import pathlib as p
from collections import Counter
from pathlib import Path
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [2]:
# Ajuste o caminho se necessário
train_base_path = p.Path("data/parquet_files/train/train_base.parquet")

df = pl.read_parquet(train_base_path)

In [3]:
# Ver o número de linhas e colunas
print("Tamanho da tabela:", df.shape)

# Ver os nomes das primeiras 20 colunas e seus tipos
for nome, tipo in zip(df.columns[:20], df.dtypes[:20]):
    print(f"{nome:40} → {tipo}")

Tamanho da tabela: (1526659, 5)
case_id                                  → Int64
date_decision                            → String
MONTH                                    → Int64
WEEK_NUM                                 → Int64
target                                   → Int64


In [4]:
#Converter date_decision para data
df = df.with_columns(
    pl.col("date_decision")
      .str.strptime(pl.Date, "%Y-%m-%d", strict=False)   # converte texto → data
      .alias("date_decision")
)

# Conferir rapidamente
print(df["date_decision"].head(5))
print("Tipo da coluna agora:", df["date_decision"].dtype)

shape: (5,)
Series: 'date_decision' [date]
[
	2019-01-03
	2019-01-03
	2019-01-04
	2019-01-03
	2019-01-04
]
Tipo da coluna agora: Date


In [5]:
#Ver quantos calotes existem
dist_target = df.select(pl.col("target").value_counts())
print(dist_target)
#O que observar: quantos 0 (pagou) e quantos 1 (inadimplente).
#Isso nos dirá se temos poucas linhas da classe “calote” (classe 1).

shape: (2, 1)
┌─────────────┐
│ target      │
│ ---         │
│ struct[2]   │
╞═════════════╡
│ {1,47994}   │
│ {0,1478665} │
└─────────────┘


In [6]:
# Conta quantos valores vazios existem em cada coluna
faltantes = df.null_count()
print(faltantes)

shape: (1, 5)
┌─────────┬───────────────┬───────┬──────────┬────────┐
│ case_id ┆ date_decision ┆ MONTH ┆ WEEK_NUM ┆ target │
│ ---     ┆ ---           ┆ ---   ┆ ---      ┆ ---    │
│ u32     ┆ u32           ┆ u32   ┆ u32      ┆ u32    │
╞═════════╪═══════════════╪═══════╪══════════╪════════╡
│ 0       ┆ 0             ┆ 0     ┆ 0        ┆ 0      │
└─────────┴───────────────┴───────┴──────────┴────────┘


In [7]:
# 1) Abrir o arquivo de depósitos
deposit_path = Path("data/parquet_files/train/train_deposit_1.parquet")
dep = pl.read_parquet(deposit_path)

# 2) Espiar tamanho e primeiros dados
print("Depósitos – shape:", dep.shape)
print(dep.head(5))

Depósitos – shape: (145086, 5)
shape: (5, 5)
┌─────────┬─────────────┬──────────────────────┬────────────┬──────────────────┐
│ case_id ┆ amount_416A ┆ contractenddate_991D ┆ num_group1 ┆ openingdate_313D │
│ ---     ┆ ---         ┆ ---                  ┆ ---        ┆ ---              │
│ i64     ┆ f64         ┆ str                  ┆ i64        ┆ str              │
╞═════════╪═════════════╪══════════════════════╪════════════╪══════════════════╡
│ 225     ┆ 0.0         ┆ null                 ┆ 0          ┆ 2016-08-16       │
│ 331     ┆ 260.374     ┆ 2018-03-18           ┆ 0          ┆ 2015-03-19       │
│ 358     ┆ 0.0         ┆ null                 ┆ 0          ┆ 2014-09-02       │
│ 390     ┆ 203.602     ┆ 2017-09-30           ┆ 1          ┆ 2015-10-01       │
│ 390     ┆ 223.68001   ┆ null                 ┆ 2          ┆ 2016-06-08       │
└─────────┴─────────────┴──────────────────────┴────────────┴──────────────────┘


In [8]:
# 3) Contar nulos por coluna
nulls = dep.null_count()
# Transforma em formato longo
nulls = nulls.melt(variable_name="coluna", value_name="qtd_nulos")

# Ordena pelas colunas com mais nulos
nulls = nulls.sort("qtd_nulos", descending=True)

# Mostra as 10 com mais nulos
print(nulls.head(10))


shape: (5, 2)
┌──────────────────────┬───────────┐
│ coluna               ┆ qtd_nulos │
│ ---                  ┆ ---       │
│ str                  ┆ u32       │
╞══════════════════════╪═══════════╡
│ contractenddate_991D ┆ 79682     │
│ case_id              ┆ 0         │
│ amount_416A          ┆ 0         │
│ num_group1           ┆ 0         │
│ openingdate_313D     ┆ 0         │
└──────────────────────┴───────────┘


  nulls = nulls.melt(variable_name="coluna", value_name="qtd_nulos")


In [9]:
# 2) Converter datas de texto/inteiro para Date
date_cols = ["openingdate_313D", "contractenddate_991D"]
for col in date_cols:
    dep = dep.with_columns(
        pl.col(col)
          .str.strptime(pl.Date, "%Y-%m-%d", strict=False)  # ajuste formato se precisar
          .alias(col)
    )

In [10]:
# 3) Tratar nulos: flag + preenchimento provisório
dep = dep.with_columns([
    pl.col("contractenddate_991D").is_null().cast(pl.Int8).alias("dep_active_flag"),
    pl.when(pl.col("contractenddate_991D").is_null())
      .then(pl.col("openingdate_313D"))   # assume ainda ativo – usa data de abertura
      .otherwise(pl.col("contractenddate_991D"))
      .alias("contractenddate_991D")
])

In [11]:
# 4) Feature: duração do contrato (dias)
dep = dep.with_columns(
    (pl.col("contractenddate_991D") - pl.col("openingdate_313D"))
      .dt.total_days()
      .alias("dep_contract_duration")
)

In [12]:
# 5) Agregação por cliente
dep_agg = dep.group_by("case_id").agg([
    pl.col("amount_416A").mean().alias("dep_amt_mean"),
    pl.col("amount_416A").max().alias("dep_amt_max"),
    pl.count().alias("dep_ops_cnt"),
    pl.col("dep_active_flag").max().alias("dep_has_active"),
    pl.col("dep_contract_duration").mean().alias("dep_dur_mean")
])


(Deprecated in version 0.20.5)
  pl.count().alias("dep_ops_cnt"),


In [13]:
df = df.join(dep_agg, on="case_id", how="left")

In [14]:
print("Shape final da base principal:", df.shape)
print(df.select(["dep_amt_mean", "dep_has_active"]).head(30))

Shape final da base principal: (1526659, 10)
shape: (30, 2)
┌──────────────┬────────────────┐
│ dep_amt_mean ┆ dep_has_active │
│ ---          ┆ ---            │
│ f64          ┆ i8             │
╞══════════════╪════════════════╡
│ null         ┆ null           │
│ null         ┆ null           │
│ null         ┆ null           │
│ null         ┆ null           │
│ null         ┆ null           │
│ …            ┆ …              │
│ null         ┆ null           │
│ null         ┆ null           │
│ null         ┆ null           │
│ null         ┆ null           │
│ null         ┆ null           │
└──────────────┴────────────────┘


In [15]:
# 1. Deixar só colunas numéricas e remover o target
num_cols = [
    col for col, dtype in zip(df.columns, df.dtypes)
    if dtype in (pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16,
                 pl.UInt32, pl.UInt64, pl.Float32, pl.Float64)
    and col != "target"
]

# 2. Converter df Polars -> pandas (LightGBM aceita pandas DataFrame)
X = df.select(num_cols).to_pandas()
y = df["target"].to_pandas()


In [16]:
week = df["WEEK_NUM"].to_pandas()

train_mask = week <= 430
valid_mask = week > 430

X_train, X_valid = X[train_mask], X[valid_mask]
y_train, y_valid = y[train_mask], y[valid_mask]

print("Treino:", X_train.shape, "Validação:", X_valid.shape)

Treino: (1526659, 8) Validação: (0, 8)


In [24]:
print("Menor WEEK_NUM :", df['WEEK_NUM'].min())
print("Maior WEEK_NUM :", df['WEEK_NUM'].max())

Menor WEEK_NUM : 0
Maior WEEK_NUM : 91


In [25]:
from sklearn.model_selection import train_test_split

# Convertendo df Polars → pandas
X = df.select(num_cols).to_pandas()
y = df["target"].to_pandas()

# Separando 80% treino / 20% validação de forma estratificada (mantém a proporção da classe 1)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Treino:", X_train.shape, "Validação:", X_valid.shape)


Treino: (1221327, 8) Validação: (305332, 8)


In [27]:
# 1. Conjuntos LightGBM
train_set = lgb.Dataset(X_train, y_train)
valid_set = lgb.Dataset(X_valid, y_valid, reference=train_set)

# 2. Parâmetros básicos
params = dict(
    objective="binary",
    learning_rate=0.05,
    num_leaves=64,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    metric="auc",
    verbose=-1,
    # lidar com classe 1 rara → aplicar peso maior
    is_unbalance=True           # opcional, mas ajuda quando target=1 é escasso
)

# 3. Treinamento com early stopping
model = lgb.train(
    params,
    train_set,
    valid_sets=[valid_set],
    num_boost_round=500,
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)

# 4. Avaliação rápida
pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
auc  = roc_auc_score(y_valid, pred_valid)
gini = 2 * auc - 1
print(f"\nAUC  : {auc:.4f}")
print(f"Gini : {gini:.4f}")


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[453]	valid_0's auc: 0.578928

AUC  : 0.5789
Gini : 0.1579
