In [18]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import os
from sklearn.pipeline import Pipeline
import numpy as np

In [19]:
FORESIGHT_DIRECTORY = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_INTERIM_DIRECTORY = os.path.join(FORESIGHT_DIRECTORY, "data", "interim")
DATA_PROCESSED_DIRECTORY = os.path.join(FORESIGHT_DIRECTORY, "data", "processed")

# Nombre de archivo
DATA_FILENAME = "combat_results.csv"

# Rutas completas
FILE_PATH = os.path.join(DATA_INTERIM_DIRECTORY, DATA_FILENAME)

In [20]:
df = pd.read_csv(FILE_PATH)

In [21]:
y = df['difficulty']

In [22]:
df, X_test, y_train, y_test = train_test_split(df, y, test_size=.15, random_state=42, stratify=y)

In [23]:
column_names_with_nulls = df.columns[df.isnull().any()].tolist()

# Diccionario global para guardar scalers por columna y nivel
_scalers_by_level = {}

def preprocess_df(df, is_train=True):
    df = df.copy()
    all_cols = df.columns.tolist()
    players = [all_cols[i:i+10] for i in range(0, 100, 10)]

    # Transformadores por índice de columna (excepto el índice 0)
    transformers = {
        1: lambda: MinMaxScaler(),                         # log(1 + x) + MinMax
        2: lambda: MinMaxScaler(),
        3: lambda: StandardScaler(),
        4: lambda: StandardScaler(),
        5: lambda: StandardScaler(),
        6: lambda: RobustScaler(),
        7: lambda: MinMaxScaler(),
        8: lambda: RobustScaler()
    }

    for player_cols in players:
        level_col = player_cols[1]

        for i in range(1, 9):  # del índice 1 al 8
            attr_col = player_cols[i]

            df[attr_col] = df[attr_col].astype(float)
            valid_mask = df[attr_col].notna() & df[level_col].notna()
            if not valid_mask.any():
                continue

            for level in df.loc[valid_mask, level_col].unique():
                level_mask = (df[level_col] == level) & df[attr_col].notna()
                values = df.loc[level_mask, [attr_col]].copy()

                if i == 1:  # log(1 + x) para level
                    values[attr_col] = np.log1p(values[attr_col])

                key = f"{attr_col}_lvl{level}"
                scaler = _scalers_by_level.get(key)

                if is_train:
                    scaler = transformers[i]()
                    df.loc[level_mask, attr_col] = scaler.fit_transform(values)
                    _scalers_by_level[key] = scaler
                elif scaler:
                    df.loc[level_mask, attr_col] = scaler.transform(values)
    

    # ---------- Imputación ----------
    cat_cols = [col for col in column_names_with_nulls if df[col].dtype == 'object']
    num_cols = [col for col in column_names_with_nulls if df[col].dtype != 'object']

    for col in cat_cols:
        df[col] = df[col].fillna('None')

    df[num_cols] = df[num_cols].fillna(-1)


    df["winner"] = df["winner"].map({"party": 1, "monsters": 0})
    cols_to_drop = ["party_hp_ratio", "not_conscious_players_ratio"] + [df.columns[i] for i in range(70, 141, 10)]

    df = df.drop(columns=cols_to_drop)
    cols_to_encode_idx = [0, 10, 20, 30, 40, 50, 60]
    cols_to_encode = [df.columns[i] for i in cols_to_encode_idx]

    # Aplicar One-Hot Encoding solo a esas columnas
    df = pd.get_dummies(df, columns=cols_to_encode, drop_first=False)
    return df

In [24]:
df_train = preprocess_df(df, is_train=True)

In [25]:
X_test =  preprocess_df(X_test, is_train=False)

In [26]:
df_train

Unnamed: 0,pc1_level,pc1_hp_max,pc1_ac,pc1_STR,pc1_DEX,pc1_CON,pc1_INT,pc1_WIS,pc1_CHA,pc2_level,...,pc5_class_FighterStr,pc5_class_None,pc6_class_Barbarian,pc6_class_Bard,pc6_class_FighterStr,pc6_class_None,pc7_class_Barbarian,pc7_class_Bard,pc7_class_FighterStr,pc7_class_None
379198,0.0,0.064103,-0.440123,-2.109383,-0.469217,-1.0,0.166667,1.000000,3,0.0,...,False,True,False,False,False,True,False,False,False,True
68354,0.0,0.102564,2.210057,0.059114,1.771458,0.5,0.166667,1.000000,0,0.0,...,False,False,False,False,True,False,False,True,False,False
379083,0.0,0.153846,1.149985,0.059114,1.024566,0.0,0.333333,0.000000,0,0.0,...,False,True,False,False,False,True,False,False,False,True
386446,0.0,0.064103,0.089913,-1.025135,0.277675,-1.5,0.166667,0.000000,4,0.0,...,False,False,False,False,False,True,False,False,False,True
131826,0.0,0.205128,0.619949,-1.025135,1.024566,0.0,0.666667,1.333333,4,0.0,...,True,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177206,0.0,0.025641,1.149985,-2.109383,1.771458,-0.5,0.500000,0.000000,3,0.0,...,False,True,False,False,False,True,False,False,False,True
87950,0.0,0.089744,0.089913,0.059114,-0.469217,0.0,0.333333,0.666667,1,-1.0,...,False,True,False,False,False,True,False,False,False,True
137368,0.0,0.371795,-1.500196,1.143362,-1.216109,0.0,0.000000,0.333333,-1,0.0,...,False,True,False,False,False,True,False,False,False,True
81341,0.0,0.153846,1.680021,0.059114,1.024566,0.5,0.333333,0.333333,2,0.0,...,False,False,True,False,False,False,False,False,False,True


In [27]:
df_train.to_csv(os.path.join(DATA_PROCESSED_DIRECTORY, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(DATA_PROCESSED_DIRECTORY, 'X_test.csv'), index=False)