## Data Cleaning

In [None]:
import polars as pl
import sklearn
from sklearn.model_selection import train_test_split
import os
import random

In [None]:
# Project root: the parent of the current working directory.
FORESIGHT_DIRECTORY = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Raw inputs and intermediate outputs both live under <root>/data.
_DATA_ROOT = os.path.join(FORESIGHT_DIRECTORY, "data")
DATA_RAW_DIRECTORY = os.path.join(_DATA_ROOT, "raw")
DATA_INTERIM_DIRECTORY = os.path.join(_DATA_ROOT, "interim")

# File name of the raw dataset.
DATA_FILENAME = "num_monsters_3_combat_results.csv"

# Full path to the raw dataset.
FILE_PATH = os.path.join(DATA_RAW_DIRECTORY, DATA_FILENAME)

In [None]:
# Load the raw CSV, then keep every column except the first one.
df = pl.read_csv(FILE_PATH)
df = df.select(df.columns[1:])

## Creating the `num_players` Column

In this section, we will create a new column called `num_players`. This column will capture the number of heroes present in each party based on the hero class columns (`pc2_class`, `pc3_class`, ..., `pc7_class`). The logic behind this is straightforward:

- If `pc2_class` is `"-"`, then only one hero is present, and `num_players` will be set to 1.
- If `pc2_class` is valid but `pc3_class` is `"-"`, then there are two heroes, and `num_players` will be 2.
- Similarly, we continue this pattern up to `pc7_class`. If none of these columns contain `"-"`, then `num_players` will be 7.

This derived column will provide us with a clear measure of the party's composition, which is critical for subsequent analysis and modeling of encounter difficulty.

In [None]:
# Derive "num_players": the first class column (pc2_class .. pc7_class) that
# holds "-" fixes the party size at the slot just before it; when none of
# them holds "-", the party size is 7.
# A chained when/then evaluates its branches in order and takes the first
# match, which is the same first-hit logic as nesting each test in the
# previous test's otherwise().
party_size = pl.when(pl.col("pc2_class") == "-").then(1)
for size in range(2, 7):
    party_size = party_size.when(pl.col(f"pc{size + 1}_class") == "-").then(size)

df = df.with_columns(party_size.otherwise(7).alias("num_players"))

## Treating 0 and '-' as Null Values

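In this section, we will mark placeholder values as missing (null). The code below inspects a marker column at every tenth position (zero-based indices 10, 20, ..., 130): wherever that column holds `"-"`, the value is replaced with null, and the nine columns that follow it are set to null for the same row, which also clears any placeholder values such as `0` stored there. A `0` that appears outside such a block is left unchanged. This reclassification helps ensure that our subsequent analyses and visualizations work with accurate representations of the data, as these placeholders indicate the absence of meaningful data.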

By converting `0` and `"-"` to null, we can more easily filter, impute, or exclude these values during our exploratory data analysis and modeling processes.

In [None]:
# Zero-based positions of the marker columns: every tenth column, 10 through 130.
indices_to_check = list(range(10, 131, 10))
total_cols = df.width  # column count, used to clamp the window after the last marker

# with_columns below only overwrites existing columns, so the column names
# and their order stay fixed for the whole loop and can be read once.
column_names = df.columns

for marker_idx in indices_to_check:
    marker = column_names[marker_idx]

    # Step 1: turn the '-' placeholder in the marker column itself into null.
    df = df.with_columns(
        pl.when(pl.col(marker) == "-")
        .then(pl.lit(None))
        .otherwise(pl.col(marker))
        .alias(marker)
    )

    # Step 2: for every row whose marker is now null, blank out the columns
    # that follow it (nine of them, or fewer if the frame ends sooner).
    marker_missing = pl.col(marker).is_null()
    followers = column_names[marker_idx + 1 : min(marker_idx + 10, total_cols)]
    if followers:
        df = df.with_columns(
            [
                pl.when(marker_missing)
                .then(pl.lit(None))
                .otherwise(pl.col(name))
                .alias(name)
                for name in followers
            ]
        )

## Removing Columns with All Null Values

In this section, we will eliminate any columns that contain only null values. Removing these columns helps clean the dataset by discarding features that do not provide any useful information for our analysis or modeling.

In [None]:
# A column is kept only if its null tally is strictly below the row count,
# i.e. it has at least one non-null value.
total_rows = df.height

# null_count() gives a one-row frame; row(0) is that row as a tuple with one
# tally per column, in the same order as df.columns.
null_counts = df.null_count().row(0)

columns_to_keep = [
    name
    for name, n_missing in zip(df.columns, null_counts)
    if n_missing < total_rows
]

# Project the frame down to the surviving columns.
df = df.select(columns_to_keep)

Finally, we save the cleaned DataFrame as `combat_results.csv` in the interim data directory.

In [None]:
# Make sure the interim directory exists before writing, so the export does
# not fail when the folder is missing (e.g. on a fresh checkout); this is a
# no-op when the directory is already there.
os.makedirs(DATA_INTERIM_DIRECTORY, exist_ok=True)

# Persist the cleaned frame for the next pipeline stage.
df.write_csv(os.path.join(DATA_INTERIM_DIRECTORY, "combat_results.csv"))