# ABP Módulo 6 p

Nombre: Marco Neira

In [1]:
# Importar librerias
import argparse
import json
import joblib
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report)
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# Link de datos
https://archive.ics.uci.edu/dataset/27/credit+approval

In [3]:
# 1) Load crx.data
# The file is read without a header row, "?" is treated as a missing value,
# and every column is read as text.
df = pd.read_csv(
    "crx.data",
    dtype=str,
    na_values=["?"],
    header=None,
)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0,u,g,w,v,1.25,t,t,01,f,g,00202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,06,f,g,00043,560,+
2,a,24.50,0.5,u,g,q,h,1.5,t,f,0,f,g,00280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,05,t,g,00100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,00120,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,00260,0,-
686,a,22.67,0.75,u,g,c,v,2,f,t,02,t,g,00200,394,-
687,a,25.25,13.5,y,p,ff,ff,2,f,t,01,t,g,00200,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,00280,750,-


In [4]:
# Name the columns as in UCI: A1..A15 followed by the target column "class"
df.columns = [*(f"A{idx}" for idx in range(1, 16)), "class"]

In [5]:
# Encode the target: '+' -> 1 (approved), '-' -> 0 (rejected)
encoded_target = df["class"].map({"+": 1, "-": 0})
df["class"] = encoded_target.astype(int)

In [7]:
# 2) Typing: try to convert columns to a numeric dtype
def to_numeric_if_possible(s: pd.Series) -> pd.Series:
    """Return ``s`` converted to a numeric dtype, or ``s`` itself if it cannot be parsed.

    Args:
        s: Column to convert.

    Returns:
        The result of ``pd.to_numeric(s, errors="raise")`` when the conversion
        succeeds; otherwise the original Series object, unchanged, so it stays
        an object (categorical) column.
    """
    try:
        return pd.to_numeric(s, errors="raise")
    except (ValueError, TypeError):
        # Only a parse failure means "not numeric"; any other error should surface.
        return s

# Convert every column except the last one ('class')
feature_columns = df.columns[:-1]
for column_name in feature_columns:
    df[column_name] = to_numeric_if_possible(df[column_name])

In [8]:
# Split the frame into features (X) and target (y)
X = df.drop(columns=["class"])
y = df["class"]

In [9]:
# Detect the final column types: the listed numeric dtypes vs. everything else
numeric_dtypes = ["int64", "float64", "int32", "float32"]
numeric_features = list(X.select_dtypes(include=numeric_dtypes).columns)
categorical_features = list(X.select_dtypes(exclude=numeric_dtypes).columns)

print("Numéricas:", numeric_features)
print("Categóricas:", categorical_features)

Numéricas: ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']
Categóricas: ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']


In [10]:
# 3) Preprocessing
# Numeric columns: median imputation followed by standard scaling.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipe = Pipeline(steps=numeric_steps)

# Categorical columns: most-frequent imputation followed by one-hot encoding.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipe = Pipeline(steps=categorical_steps)

# Route each group of columns through its own pipeline; columns in neither
# list are dropped.
column_branches = [
    ("num", num_pipe, numeric_features),
    ("cat", cat_pipe, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches, remainder="drop")


In [11]:
# 4) Model
logreg_params = {
    "max_iter": 1000,
    "class_weight": "balanced",  # helps if the classes are imbalanced
    "n_jobs": None,
}
clf = LogisticRegression(**logreg_params)

# Full pipeline: preprocessing followed by the classifier
pipeline = Pipeline(steps=[("prep", preprocessor), ("clf", clf)])

In [12]:
# 5) Train/Test + CV
# Hold out 20% of the rows as a test set, stratified on y, with a fixed seed
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def cv_mean(scoring):
    """Return the mean cross-validated score of `pipeline` on the training split.

    `scoring` is passed straight to `cross_val_score`, which uses the `cv`
    splitter defined above.
    """
    fold_scores = cross_val_score(
        pipeline, X_train, y_train, scoring=scoring, cv=cv, n_jobs=-1
    )
    return fold_scores.mean()

# One entry per metric, stored under the key "<metric>_cv"
cv_results = {
    f"{metric}_cv": float(cv_mean(metric))
    for metric in ("accuracy", "precision", "recall", "f1", "roc_auc")
}

# Train on the training split and evaluate on the test split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# ROC AUC needs scores for class 1. It is skipped only when the pipeline does
# not expose predict_proba; any other failure is a real error and is allowed
# to surface instead of being reported as a silent "roc_auc: None".
if hasattr(pipeline, "predict_proba"):
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_proba)
else:
    y_proba, roc_auc = None, None

metrics_test = {
    "accuracy": float(accuracy_score(y_test, y_pred)),
    "precision": float(precision_score(y_test, y_pred)),
    "recall": float(recall_score(y_test, y_pred)),
    "f1": float(f1_score(y_test, y_pred)),
    "roc_auc": (float(roc_auc) if roc_auc is not None else None),
}

# Cross-validation means computed on the training split
print("\n=== CV (train) ===")
for metric_name, score in cv_results.items():
    print(f"{metric_name}: {score:.4f}")

# Test-split metrics; a metric stored as None is printed as None
print("\n=== Test ===")
for metric_name, score in metrics_test.items():
    shown = score if score is None else round(score, 4)
    print(f"{metric_name}: {shown}")

print("\nMatriz de confusión (test):")
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

print("\nReporte (test):")
test_report = classification_report(y_test, y_pred, digits=4)
print(test_report)


=== CV (train) ===
accuracy_cv: 0.8587
precision_cv: 0.8233
recall_cv: 0.8698
f1_cv: 0.8456
roc_auc_cv: 0.9142

=== Test ===
accuracy: 0.8696
precision: 0.8308
recall: 0.8852
f1: 0.8571
roc_auc: 0.9602

Matriz de confusión (test):
[[66 11]
 [ 7 54]]

Reporte (test):
              precision    recall  f1-score   support

           0     0.9041    0.8571    0.8800        77
           1     0.8308    0.8852    0.8571        61

    accuracy                         0.8696       138
   macro avg     0.8674    0.8712    0.8686       138
weighted avg     0.8717    0.8696    0.8699       138



In [13]:
# 6) Save the model and the input schema
joblib.dump(pipeline, "model_crx.joblib")

# Column lists describing the inputs the model was trained on
schema = dict(
    target="class",
    numeric_features=numeric_features,
    categorical_features=categorical_features,
    expected_columns=list(X.columns),   # A1..A15
)
with open("schema_crx.json", mode="w", encoding="utf-8") as schema_file:
    schema_file.write(json.dumps(schema, ensure_ascii=False, indent=2))

print("\nModelo guardado en: model_crx.joblib")
print("Esquema guardado en: schema_crx.json")


Modelo guardado en: model_crx.joblib
Esquema guardado en: schema_crx.json


In [14]:
# Simulated API
# Load the trained model.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model_path = "model_crx.joblib"
pipeline = joblib.load(model_path)

# Function that simulates the /predict endpoint
def fake_api_predict(payload: dict):
    """Score a single record the way a ``/predict`` endpoint would.

    Args:
        payload: Mapping of column name to value for one record (stands in
            for the JSON body of a request).

    Returns:
        A dict with ``"ok"`` (always ``True``), ``"predictions"`` (the output
        of ``pipeline.predict`` as a list) and ``"probabilities"`` (the second
        column of ``pipeline.predict_proba`` as a list, or ``None`` when the
        loaded model does not expose ``predict_proba``).
    """
    # One-row frame: each payload key becomes a column (simulates the JSON input).
    record = pd.DataFrame([payload])
    pred = pipeline.predict(record).tolist()
    # Probabilities are optional; omit them only when the model cannot produce
    # them. Real scoring errors propagate instead of being reported as ok=True.
    if hasattr(pipeline, "predict_proba"):
        proba = pipeline.predict_proba(record)[:, 1].tolist()
    else:
        proba = None
    return {
        "ok": True,
        "predictions": pred,
        "probabilities": proba
    }

# ===== Example of a simulated request =====
# One value for each of the columns A1..A15
payload = dict(
    A1="b", A2=30.83, A3=0.0, A4="u", A5="g",
    A6="w", A7="v", A8=1.25, A9="t", A10="t",
    A11=1, A12="f", A13="g", A14="00202", A15=0,
)

response = fake_api_predict(payload)
print(response)

{'ok': True, 'predictions': [1], 'probabilities': [0.8390354601894723]}
