In [None]:
# 📌 Data preprocessing for churn prediction

from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the raw dataset.
df = pd.read_csv("../dataset.csv")

# 1. Drop columns that are not used as features.
# NOTE(review): errors='ignore' silently skips any name that is absent from
# the CSV header, so a differently spelled/cased identifier column would
# survive and be one-hot encoded in step 4 -- confirm both names match.
df.drop(["customerID", "Churn Reason"], axis=1, inplace=True, errors='ignore')

# 2. Coerce "Total Charges" to numeric; unparseable entries become NaN.
df["Total Charges"] = pd.to_numeric(df["Total Charges"], errors="coerce")

# Choose the stratified train/test partition *before* fitting any statistic,
# so the imputer median and the scaler mean/std below are learned from the
# training rows only and nothing about the test rows leaks into them.
# Row positions stay valid for the whole script: no step adds, removes or
# reorders rows.
train_pos, test_pos = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=df["Churn Value"],
)

# 3. Impute missing "Total Charges" with the training-set median.
imputer = SimpleImputer(strategy="median")
imputer.fit(df.iloc[train_pos][["Total Charges"]])
df["Total Charges"] = imputer.transform(df[["Total Charges"]]).ravel()

# 4. One-hot encode every remaining object-dtype column (first level dropped).
# The encoder is fitted on all rows: it only learns the category vocabulary,
# and doing so avoids unknown-category errors on the test rows.
# NOTE(review): any textual column derived from the target that is still in
# df at this point becomes a feature -- confirm none remains.
cat_cols = df.select_dtypes(include=["object"]).columns
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
    encoder = OneHotEncoder(drop="first", sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the legacy `sparse` keyword.
    encoder = OneHotEncoder(drop="first", sparse=False)
encoded_data = pd.DataFrame(
    encoder.fit_transform(df[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
)

# Both frames get a fresh 0..n-1 index so the column-wise concat aligns rows.
df = df.drop(cat_cols, axis=1).reset_index(drop=True)
df = pd.concat([df, encoded_data], axis=1)

# 5. Standardize the numeric columns using training-set statistics.
num_cols = ["Tenure Months", "Monthly Charges", "Total Charges"]
scaler = StandardScaler()
scaler.fit(df.iloc[train_pos][num_cols])
df[num_cols] = scaler.transform(df[num_cols])

# 6. Separate predictors from the target.
X = df.drop(columns=["Churn Value"])
y = df["Churn Value"]

# 7. Materialize the train/test sets from the partition chosen above.
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# 8. Balance the classes with SMOTE -- training data only; the test set keeps
# its original class distribution.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Save the preprocessed sets, creating the output directory if it is missing
# (to_csv does not create parent directories itself).
output_dir = Path("../data")
output_dir.mkdir(parents=True, exist_ok=True)
X_train_balanced.to_csv(output_dir / "X_train.csv", index=False)
X_test.to_csv(output_dir / "X_test.csv", index=False)
y_train_balanced.to_csv(output_dir / "y_train.csv", index=False)
y_test.to_csv(output_dir / "y_test.csv", index=False)

print("âœ… Preprocesamiento completado y datos guardados.")