In [2]:
import pandas as pd
import requests, json

url = "https://raw.githubusercontent.com/H12-25-L-Equipo-70/ChurnInsight/refs/heads/Data/dataset_empresas_fintech_v2.7.json"
# Alternative form of the same URL (sometimes "cleaner"):
# url = "https://raw.githubusercontent.com/H12-25-L-Equipo-70/ChurnInsight/Data/dataset_empresas_fintech_v2.7.json"

# Download the dataset; raise_for_status() turns any HTTP error into an exception.
r = requests.get(url, timeout=60)
r.raise_for_status()

# First try to decode the body as a single JSON document (list or dict).
# If that fails, fall back to NDJSON: one JSON document per non-blank line.
try:
    data = r.json()
except ValueError:
    data = list(map(json.loads, filter(str.strip, r.text.splitlines())))

# A dict payload may wrap the records under a common key; unwrap the first
# of "data" / "records" / "items" that actually holds a list.
if isinstance(data, dict):
    for k in ("data", "records", "items"):
        if isinstance(data.get(k), list):
            data = data[k]
            break

df = pd.DataFrame(data)
print("Shape:", df.shape)
df.head()


Shape: (2080, 35)


Unnamed: 0,CUIT,Nombre_Empresa,Tipo_Sociedad,Sector,Provincia,Año_Fundación,Empleados,Periodo_Fiscal,Ingresos,Gastos,...,Total_Login_Dia,Transferencias,Pagos,Creditos,Inversiones,Servicios_Utilizados,Churn,Churn_Date,Telefono,Direccion
0,20748123114,Godoy Tech,S.A.,Tecnología,Río Negro,1999,219,2022-Q1,83320313.96,45830772.92,...,123,False,True,True,False,2,False,,543516408435,Pasaje Corrientes 7169
1,20748123114,Godoy Tech,S.A.,Tecnología,Río Negro,1999,219,2022-Q2,80886622.67,52069210.99,...,24,False,True,True,True,3,False,,543516408435,Pasaje Corrientes 7169
2,20748123114,Godoy Tech,S.A.,Tecnología,Río Negro,1999,219,2022-Q3,76065908.53,48230650.87,...,425,False,True,True,True,3,False,,543516408435,Pasaje Corrientes 7169
3,20748123114,Godoy Tech,S.A.,Tecnología,Río Negro,1999,219,2022-Q4,84048459.33,52531435.05,...,576,True,True,False,True,3,False,,543516408435,Pasaje Corrientes 7169
4,20748123114,Godoy Tech,S.A.,Tecnología,Río Negro,1999,219,2023-Q1,78163714.15,50194170.93,...,348,True,False,True,True,3,True,2023-03-31,543516408435,Pasaje Corrientes 7169


In [3]:
import re
import numpy as np

# 1) Normalize Churn to 0/1: the raw frame shows it as True/False, and
#    astype(int) turns that into integers so it can be shifted into a target later.
df["Churn"] = df["Churn"].astype(int)

# 2) Convert Periodo_Fiscal labels such as "2022-Q4" into a numeric index
def fiscal_to_index(s: str):
    """Turn a fiscal period label "YYYY-Qn" into an ordinal quarter index.

    The value is coerced with ``str()`` before matching, so non-string
    inputs (e.g. None or NaN) are accepted and simply fail to match.
    Surrounding whitespace is tolerated.

    Returns:
        ``year * 4 + quarter`` as an int, so consecutive quarters differ by
        exactly 1 (including across a year boundary), or ``np.nan`` when the
        label does not match the expected pattern.
    """
    parsed = re.match(r"^\s*(\d{4})-Q([1-4])\s*$", str(s))
    if parsed is None:
        return np.nan
    year, quarter = map(int, parsed.groups())
    return year * 4 + quarter

# Map every fiscal period label to its numeric index. Labels that do not
# parse come back as NaN, and those rows are dropped.
df["Periodo_Index"] = df["Periodo_Fiscal"].apply(fiscal_to_index)
df = df.dropna(subset=["Periodo_Index"]).copy()
# After the dropna every index is a whole number; store it as an integer so
# the dtype does not depend on whether any label failed to parse.
df["Periodo_Index"] = df["Periodo_Index"].astype(int)

# 3) Sort by company and time
df = df.sort_values(["CUIT", "Periodo_Index"]).copy()

# 4) Recommended target: churn in the following quarter.
# shift(-1) on its own returns the next *available* row of the company, so a
# missing quarter would silently label a row with the churn flag of a later
# period. Keep the label only when the next row is exactly one quarter ahead
# (consecutive quarters differ by 1 in Periodo_Index); otherwise leave it as
# NaN so the row is dropped together with each company's last period.
by_company = df.groupby("CUIT")
next_period = by_company["Periodo_Index"].shift(-1)
next_churn = by_company["Churn"].shift(-1)
df["Churn_Next_Quarter"] = next_churn.where(next_period - df["Periodo_Index"] == 1)
df = df.dropna(subset=["Churn_Next_Quarter"]).copy()
df["Churn_Next_Quarter"] = df["Churn_Next_Quarter"].astype(int)

# 5) Convert boolean usage flags to 0/1 (if any exist)
bool_cols = [c for c in df.columns if df[c].dtype == "bool"]
for c in bool_cols:
    df[c] = df[c].astype(int)

print("OK. Shape:", df.shape)
df[["CUIT", "Periodo_Fiscal", "Churn", "Churn_Next_Quarter"]].head(10)


OK. Shape: (1950, 37)


Unnamed: 0,CUIT,Periodo_Fiscal,Churn,Churn_Next_Quarter
1328,20101575005,2022-Q1,0,0
1329,20101575005,2022-Q2,0,0
1330,20101575005,2022-Q3,0,0
1331,20101575005,2022-Q4,0,0
1332,20101575005,2023-Q1,0,0
1333,20101575005,2023-Q2,0,0
1334,20101575005,2023-Q3,0,0
1335,20101575005,2023-Q4,0,0
1336,20101575005,2024-Q1,0,0
1337,20101575005,2024-Q2,0,0


In [5]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib, os, json

# Columns kept out of the feature matrix:
DROP = [
    "Churn", "Churn_Next_Quarter",
    "Churn_Date",       # leakage
    "Telefono", "Direccion", "Nombre_Empresa",  # PII / overfit
    "CUIT",             # only used for grouping / splitting
    "Periodo_Fiscal"    # Periodo_Index is used instead
]

# Company identifier per row, used later to split by company.
groups = df["CUIT"]
# Target: churn flag of the following quarter, as integers.
y = df["Churn_Next_Quarter"].astype(int)
# Features: everything except the excluded columns; names in DROP that are
# absent from the frame are skipped rather than raising.
X = df.drop(columns=DROP, errors="ignore").copy()

# Split the features by dtype: numeric columns vs. everything else,
# which is treated as categorical.
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = list(filter(lambda name: name not in num_cols, X.columns))

# Numeric branch: fill gaps with the median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch.
preprocess = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

# Full model: preprocessing followed by a class-weighted logistic regression.
pipe = Pipeline([
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=2000, class_weight="balanced")),
])

# Split by company (avoids leakage): rows sharing a CUIT stay on the same
# side of the train/test split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

# Fit on the training companies and predict on the held-out ones.
pred = pipe.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, pred))
print("Confusion matrix:\n", confusion_matrix(y_test, pred))

# Export the fitted pipeline plus a description of the features it expects.
os.makedirs("artifacts", exist_ok=True)
joblib.dump(pipe, "artifacts/churn_pipeline.joblib")

# ensure_ascii=False writes non-ASCII column names (e.g. accented letters)
# verbatim, so the file encoding must be pinned. Without encoding="utf-8",
# open() uses the platform locale and the JSON may not be readable as UTF-8.
with open("artifacts/features.json", "w", encoding="utf-8") as f:
    json.dump({
        "features": X.columns.tolist(),
        "numeric": num_cols,
        "categorical": cat_cols,
        "target": "Churn_Next_Quarter"
    }, f, indent=2, ensure_ascii=False)

print("✅ Exportado: artifacts/churn_pipeline.joblib + artifacts/features.json")


              precision    recall  f1-score   support

           0       0.93      0.64      0.76       356
           1       0.11      0.47      0.18        34

    accuracy                           0.63       390
   macro avg       0.52      0.56      0.47       390
weighted avg       0.86      0.63      0.71       390

Confusion matrix:
 [[229 127]
 [ 18  16]]
✅ Exportado: artifacts/churn_pipeline.joblib + artifacts/features.json
