In [32]:
# -----------------------------
# Imports
# -----------------------------
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np



In [33]:
# -----------------------------
# SPRINT 1
# -----------------------------

In [34]:
# Move into the project root so the relative paths used by the following
# cells ("data/", "submissions/", "scripts/") resolve correctly.
# The folder defaults to the original location but can be overridden with the
# MP_ML_DIR environment variable, so the notebook is not tied to one machine.
PROJECT_DIR = os.environ.get("MP_ML_DIR", "C:/MP_ML")
os.chdir(PROJECT_DIR)
print("Directorio actual:", os.getcwd())

Directorio actual: C:\MP_ML


In [35]:
# Step 2: explore the columns and get a first look at the dataset.
# Both CSV paths are relative to the current working directory.
train_path, test_path = "data/train.csv", "data/test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Print the training column index, then show the first rows as the cell output.
print(train.columns)
train.head()

Index(['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
       'x11', 'target'],
      dtype='object')


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,target
0,1.600058,-0.255107,4.539854,-1.328546,1.427454,2.334626,1.62733,1.593896,0.982328,0.036516,-1.019118,-1.436765,1
1,2.435731,-2.776778,-5.613316,1.258602,-0.526397,-0.756025,1.627382,3.04333,0.975805,-3.93725,-0.894762,5.067277,1
2,4.622693,-2.302317,3.153156,-2.652171,0.057744,0.171464,2.205094,2.796084,4.3502,0.730463,1.059515,-2.035405,1
3,-0.250656,-1.709246,1.81881,-1.60155,-1.129601,-1.111861,-2.383055,-1.867726,0.142418,1.828026,1.036688,-1.556186,0
4,2.945809,-0.630331,-3.624051,0.709297,0.077099,-4.380572,-0.938039,0.010142,0.834341,0.504602,0.868038,-0.970574,0


In [36]:
# Step 3: build the baseline (initial model).

# Features / target. Replace "target" below if the label column has another name.
y = train["target"]
X = train.drop(columns=["target"])

# Hold out 20% of the rows for validation; the split is stratified on the
# label and seeded so it is reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Standardise the features: the scaler is fitted on the training part only
# and then reused, unchanged, on the validation part.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Baseline classifier: logistic regression with the iteration cap raised to 1000.
clf = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Per-class report on the validation split.
y_val_pred = clf.predict(X_val_scaled)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       124
           1       0.78      0.84      0.81        37
           2       0.79      0.58      0.67        19

    accuracy                           0.87       180
   macro avg       0.82      0.78      0.80       180
weighted avg       0.87      0.87      0.87       180



In [37]:
# Step 4: predict on test.csv and save the submission.

# Select the test features by the training column names (the same approach the
# Sprint 2 prediction cell uses) instead of dropping "id": this keeps the
# column set and order identical to what the scaler was fitted on, even if the
# test file carries extra columns.
X_test = test[X_train.columns]  # still a DataFrame
X_test_scaled = scaler.transform(X_test)

y_test_pred = clf.predict(X_test_scaled)

submission = pd.DataFrame({
    "id": test["id"],
    "pred": y_test_pred
})

# Create the output folder if it does not exist yet
# (`os` is already imported in the notebook's import cell).
os.makedirs("submissions", exist_ok=True)

# Write the CSV, then read it back to check what was actually stored.
submission.to_csv("submissions/equipo1.csv", index=False)
pd.read_csv("submissions/equipo1.csv").head()


Unnamed: 0,id,pred
0,0,0
1,1,1
2,2,2
3,3,2
4,4,0


In [38]:
# Step 5: evaluate the submission and record it on the leaderboard.
# Runs the evaluation script on the Sprint 1 CSV under the team name "equipo1".
!python scripts/evaluate.py submissions/equipo1.csv --team equipo1

=== RESULTADOS (TEST) ===
Accuracy     : 0.873
F1 (macro)   : 0.814
F1 (weighted): 0.871
F1 (micro)   : 0.873

Classification report:
               precision    recall  f1-score   support

           0      0.902     0.937     0.919       207
           1      0.830     0.721     0.772        61
           2      0.750     0.750     0.750        32

    accuracy                          0.873       300
   macro avg      0.828     0.803     0.814       300
weighted avg      0.871     0.873     0.871       300

Confusion matrix:
 [[194   8   5]
 [ 14  44   3]
 [  7   1  24]]

Leaderboard actualizado en leaderboard.csv


In [39]:
# -----------------------------
# SPRINT 2
# -----------------------------

In [40]:
# Step 1: set up the transformations and the pipeline.

# Every column of the training features goes through the numeric branch.
num_features = list(X_train.columns)

# Numeric branch: degree-2 polynomial expansion (no bias column), then
# standardisation of the expanded features.
numeric_transformer = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
])

# Route the listed columns through that branch.
preprocessor = ColumnTransformer([('num', numeric_transformer, num_features)])

# Full model: preprocessing followed by logistic regression.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, C=1.0)),
])

In [41]:
# Step 2: 5-fold cross-validation.
# Stratified, shuffled folds with a fixed seed; each fold is scored with macro-F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train, scoring='f1_macro', cv=cv)

# Report the per-fold scores and their mean.
print("Macro-F1 CV scores:", scores)
mean_f1 = scores.mean()
print("Media Macro-F1:", mean_f1)


Macro-F1 CV scores: [0.83719334 0.83378071 0.85649432 0.86461251 0.86883001]
Media Macro-F1: 0.8521821790994473


In [42]:
# Step 3: fit the pipeline and check it on the held-out validation split.
# NOTE: the fit uses X_train / y_train (the training part of the split),
# not the full `train` frame.
pipeline.fit(X_train, y_train)

# Local validation: per-class report on X_val.
y_val_pred = pipeline.predict(X_val)
val_report = classification_report(y_val, y_val_pred)
print(val_report)

              precision    recall  f1-score   support

           0       0.95      0.98      0.96       124
           1       0.84      0.86      0.85        37
           2       0.87      0.68      0.76        19

    accuracy                           0.92       180
   macro avg       0.89      0.84      0.86       180
weighted avg       0.92      0.92      0.92       180



In [43]:
# Step 4: predict on test and save the submission.

# Feed the pipeline exactly the columns it was fitted on (this leaves "id" out).
y_test_pred = pipeline.predict(test[X_train.columns])

submission = pd.DataFrame({
    "id": test["id"],
    "pred": y_test_pred
})

# Make sure the output folder exists, so this cell does not depend on an
# earlier cell having created it.
os.makedirs("submissions", exist_ok=True)

submission.to_csv("submissions/equipo1_cv.csv", index=False)
submission.head()


Unnamed: 0,id,pred
0,0,0
1,1,1
2,2,2
3,3,2
4,4,0


In [44]:
# Step 5: evaluate the submission and update the leaderboard.
# Runs the evaluation script on the Sprint 2 CSV under the team name "equipo1".
!python scripts/evaluate.py submissions/equipo1_cv.csv --team equipo1

=== RESULTADOS (TEST) ===
Accuracy     : 0.910
F1 (macro)   : 0.863
F1 (weighted): 0.909
F1 (micro)   : 0.910

Classification report:
               precision    recall  f1-score   support

           0      0.926     0.961     0.943       207
           1      0.907     0.803     0.852        61
           2      0.806     0.781     0.794        32

    accuracy                          0.910       300
   macro avg      0.880     0.849     0.863       300
weighted avg      0.909     0.910     0.909       300

Confusion matrix:
 [[199   2   6]
 [ 12  49   0]
 [  4   3  25]]

Leaderboard actualizado en leaderboard.csv
