### Anti-preprocessing: deliberately corrupting the dataset

In [3]:
import pandas as pd
import numpy as np
import random
import string

def corrupt_dataset(df, typo_ratio=0.001, zero_ratio=0.002, outlier_ratio=0.0005, missing_ratio=0.001, random_state=42):
    """
    Return a corrupted copy of a dataset with typos, zeros, outliers and missing values.

    Each ratio is multiplied by the total number of cells (rows * columns) to
    obtain how many rows a corruption step touches. Every step draws distinct
    rows, so each count is capped at the number of rows.

    Args:
        df (pd.DataFrame): Original clean dataset. It is not modified.
            NOTE(review): cell access uses ``.at`` with sampled index labels,
            so a unique index is assumed -- confirm for your data.
        typo_ratio (float): Proportion of typos (a letter replacing one
            character of a number). Only the "FP1-F7" and "FT10-T8" columns
            are affected.
        zero_ratio (float): Proportion used to size the sample of rows with
            ``time`` between 600 and 601 (inclusive); within that sample each
            non-protected cell becomes 0 with probability 0.02.
        outlier_ratio (float): Proportion of extreme values (+/-1e6) added.
        missing_ratio (float): Proportion of missing values introduced.
        random_state (int): Seed applied to the global ``numpy.random`` and
            ``random`` generators and to the pandas row sample.

    Returns:
        pd.DataFrame: Corrupted dataset. A column that received at least one
        typo has object dtype, because the typo is stored as a string.
    """

    np.random.seed(random_state)
    random.seed(random_state)

    # Columns that identify or label a sample and must never be corrupted.
    protected_columns = ["time", "file", "label"]

    df_corrupt = df.copy()
    total_rows, total_cols = df_corrupt.shape
    total_values = total_rows * total_cols

    # Convert proportions into absolute numbers. Rows are sampled without
    # replacement, so asking for more rows than exist would raise.
    num_typos = min(int(total_values * typo_ratio), total_rows)
    num_zeros = int(total_values * zero_ratio)  # capped below by the size of the time window
    num_outliers = min(int(total_values * outlier_ratio), total_rows)
    num_missing = min(int(total_values * missing_ratio), total_rows)

    # 1. Introduce Typos (Only in "FP1-F7" and "FT10-T8")
    typo_columns = ["FP1-F7", "FT10-T8"]
    typo_rows = np.random.choice(df_corrupt.index, size=num_typos, replace=False)

    for row in typo_rows:
        col = random.choice(typo_columns)
        if col not in df_corrupt.columns:
            continue

        original_value = str(df_corrupt.at[row, col])
        # Only corrupt cells that currently hold a finite number; this also
        # accepts negative values and scientific notation.
        try:
            is_number = bool(np.isfinite(float(original_value)))
        except ValueError:
            is_number = False
        if not is_number:
            continue

        typo_index = random.randint(0, len(original_value) - 1)
        random_letter = random.choice(string.ascii_letters)
        corrupted_value = original_value[:typo_index] + random_letter + original_value[typo_index + 1:]

        # A value containing a letter cannot live in a numeric column, so the
        # column is switched to object dtype and the typo is kept as text.
        if df_corrupt[col].dtype != object:
            df_corrupt[col] = df_corrupt[col].astype(object)
        df_corrupt.at[row, col] = corrupted_value

    # 2. Replace Some Values with 0 (Only where 600 <= time <= 601)
    if "time" in df_corrupt.columns:
        zero_mask = (df_corrupt["time"] >= 600) & (df_corrupt["time"] <= 601)
        sample_size = min(num_zeros, int(zero_mask.sum()))
        zero_indices = df_corrupt[zero_mask].sample(n=sample_size, replace=False, random_state=random_state).index
        for col in df_corrupt.columns:
            if col in protected_columns:
                continue
            # Only some of the sampled cells become 0 (about 2% per column).
            hit = np.random.rand(len(zero_indices)) < 0.02
            df_corrupt.loc[zero_indices, col] = np.where(hit, 0, df_corrupt.loc[zero_indices, col])

    # 3. Introduce Outliers (Randomly across dataset)
    outlier_rows = np.random.choice(df_corrupt.index, size=num_outliers, replace=False)

    for row in outlier_rows:
        col = random.choice(df_corrupt.columns)
        if col not in protected_columns:  # Avoid corrupting key columns
            df_corrupt.at[row, col] = random.choice([1e6, -1e6])  # Large positive or negative

    # 4. Introduce Missing Values Randomly
    missing_rows = np.random.choice(df_corrupt.index, size=num_missing, replace=False)
    eligible_columns = df_corrupt.columns[~df_corrupt.columns.isin(protected_columns)]

    # With nothing but protected columns there is no cell that may be blanked.
    if len(eligible_columns) > 0:
        missing_cols = np.random.choice(eligible_columns, size=num_missing)
        for row, col in zip(missing_rows, missing_cols):
            df_corrupt.at[row, col] = np.nan

    return df_corrupt

# Load the source dataset that will be corrupted.
df = pd.read_csv("../data/converted_df.csv")

# Build the corrupted copy; the ratios are passed explicitly for this run.
df_corrupted = corrupt_dataset(
    df=df,
    typo_ratio=0.001,
    zero_ratio=0.02,
    outlier_ratio=0.001,
    missing_ratio=0.0001,
    random_state=42,
)

# Save the corrupted dataset to a new file, leaving out the index column.
df_corrupted.to_csv("../data/CHB_MIT.csv", index=False)