In [54]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import os
from sklearn.pipeline import Pipeline
import numpy as np

In [55]:
FORESIGHT_DIRECTORY = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_INTERIM_DIRECTORY = os.path.join(FORESIGHT_DIRECTORY, "data", "interim")
DATA_PROCESSED_DIRECTORY = os.path.join(FORESIGHT_DIRECTORY, "data", "processed")

# Nombre de archivo
DATA_FILENAME = "combat_results.csv"

# Rutas completas
FILE_PATH = os.path.join(DATA_INTERIM_DIRECTORY, DATA_FILENAME)

In [56]:
df = pd.read_csv(FILE_PATH)

In [57]:
y = df['difficulty']

In [58]:
df, X_test, y_train, y_test = train_test_split(df, y, test_size=.15, random_state=42, stratify=y)

In [59]:
column_names_with_nulls = df.columns[df.isnull().any()].tolist()

# Diccionario global para guardar scalers por columna y nivel
_scalers_by_level = {}

def preprocess_df(df, is_train=True):
    df = df.copy()
    all_cols = df.columns.tolist()
    players = [all_cols[i:i+10] for i in range(0, 100, 10)]

    # Transformadores por índice de columna (excepto el índice 0)
    transformers = {
        1: lambda: MinMaxScaler(),                         # log(1 + x) + MinMax
        2: lambda: MinMaxScaler(),
        3: lambda: StandardScaler(),
        4: lambda: StandardScaler(),
        5: lambda: StandardScaler(),
        6: lambda: RobustScaler(),
        7: lambda: MinMaxScaler(),
        8: lambda: RobustScaler()
    }

    for player_cols in players:
        level_col = player_cols[1]

        for i in range(1, 9):  # del índice 1 al 8
            attr_col = player_cols[i]

            df[attr_col] = df[attr_col].astype(float)
            valid_mask = df[attr_col].notna() & df[level_col].notna()
            if not valid_mask.any():
                continue

            for level in df.loc[valid_mask, level_col].unique():
                level_mask = (df[level_col] == level) & df[attr_col].notna()
                values = df.loc[level_mask, [attr_col]].copy()

                if i == 1:  # log(1 + x) para level
                    values[attr_col] = np.log1p(values[attr_col])

                key = f"{attr_col}_lvl{level}"
                scaler = _scalers_by_level.get(key)

                if is_train:
                    scaler = transformers[i]()
                    df.loc[level_mask, attr_col] = scaler.fit_transform(values)
                    _scalers_by_level[key] = scaler
                elif scaler:
                    df.loc[level_mask, attr_col] = scaler.transform(values)
    

    # ---------- Imputación ----------
    cat_cols = [col for col in column_names_with_nulls if df[col].dtype == 'object']
    num_cols = [col for col in column_names_with_nulls if df[col].dtype != 'object']

    for col in cat_cols:
        df[col] = df[col].fillna('None')

    df[num_cols] = df[num_cols].fillna(-1)


    df["winner"] = df["winner"].map({"party": 1, "monsters": 0})
    cols_to_drop = ["party_hp_ratio", "not_conscious_players_ratio", "pc2_level", "pc3_level", "pc4_level", "pc5_level", "pc6_level", "pc7_level", ] \
        + [df.columns[i] for i in range(70, 141, 10)]

    df = df.drop(columns=cols_to_drop)
    cols_to_encode = ["pc1_class", "pc2_class", "pc3_class", "pc4_class", "pc5_class", "pc6_class", "pc7_class", ]
    df = df.drop(columns=cols_to_encode)
    # Aplicar One-Hot Encoding solo a esas columnas
    # df = pd.get_dummies(df, columns=cols_to_encode, drop_first=False).astype('int')
    borrar = []
    for i in range(7):
        for atr in ["DEX", "INT", "WIS", "CHA"]:
            borrar.append("pc" + str(i+1) + "_" + atr)
    df = df.drop(columns=borrar)
    return df

In [60]:
df_train = preprocess_df(df, is_train=True)

In [61]:
X_test =  preprocess_df(X_test, is_train=False)

In [62]:
df_train

Unnamed: 0,pc1_level,pc1_hp_max,pc1_ac,pc1_STR,pc1_CON,pc2_hp_max,pc2_ac,pc2_STR,pc2_CON,pc3_hp_max,...,monster7_hp_max,monster7_ac,monster7_STR,monster7_DEX,monster7_CON,monster7_INT,monster7_WIS,monster7_CHA,difficulty,num_players
153931,0.0,0.454545,2.210830,1.144709,1.0,-1.000000,-1.000000,-1.000000,-1.0,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,9,1
88453,0.0,0.012987,0.619598,-2.117773,-1.0,-1.000000,-1.000000,-1.000000,-1.0,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,9,1
276100,0.0,0.077922,0.089188,0.057215,0.5,0.064935,-1.499361,0.061659,0.0,0.092105,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,9,3
10329,0.0,0.142857,-0.971633,0.057215,0.5,0.064935,-0.968612,-2.111121,-0.5,0.052632,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,6,7
197051,0.0,0.285714,0.089188,0.600962,0.0,0.311688,-0.968612,0.061659,0.5,0.263158,...,68.0,14.0,3.0,3.0,3.0,-2.0,1.0,-1.0,7,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224866,0.0,0.233766,0.619598,0.600962,0.5,0.168831,-0.437862,0.061659,0.0,0.171053,...,45.0,14.0,2.0,2.0,3.0,0.0,1.0,2.0,9,7
83277,0.0,0.246753,0.619598,0.600962,0.5,0.103896,-0.437862,0.061659,0.0,0.197368,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,9,4
101385,0.0,0.493506,-0.441222,1.144709,0.5,0.220779,1.154386,-1.024731,-0.5,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,2
294935,0.0,0.246753,1.150009,-0.486532,-1.0,0.350649,0.623637,0.061659,-0.5,0.605263,...,52.0,17.0,3.0,0.0,2.0,1.0,0.0,2.0,6,5


In [63]:
df_train.to_csv(os.path.join(DATA_PROCESSED_DIRECTORY, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(DATA_PROCESSED_DIRECTORY, 'X_test.csv'), index=False)

In [64]:
X_test.shape

(45000, 94)