In [1]:

import pandas as pd
import numpy as np

file_path = "/Users/keisipani/Desktop/BscAI/ML/ML/thyroidDF.csv"
df = pd.read_csv(file_path)

groups = df.groupby("target")



In [2]:


def add_numeric_noise_by_group(df, group_col, numeric_cols, noise_factor=0.1):
    """
    Adds Gaussian noise to numeric columns while preserving group-based distributions.
    
    - noise_factor: Determines the percentage of the standard deviation used as noise.
    - Each value will randomly ADD or SUBTRACT a sampled portion of the standard deviation.
    """
    noisy_dfs = []
    
    for _, group in df.groupby(group_col):
        group_noisy = group.copy()
        
        for col in numeric_cols:
            # Calculate standard deviation for this group/column
            std_dev = group[col].std()
            
            # Generate random signs (-1 or 1) for each row
            signs = np.random.choice([-1, 1], size=len(group))
            
            # Generate noise with the same standard deviation but randomly add or subtract
            noise = signs * np.random.normal(0, noise_factor * std_dev, size=len(group))
            
            # Apply noise
            group_noisy[col] += noise
        
        noisy_dfs.append(group_noisy)
    
    return pd.concat(noisy_dfs, ignore_index=True)


In [3]:
def add_boolean_noise(data, columns, flip_prob=0.05):
    """
    Flip boolean values in specified columns with a fixed probability.
    """
    noisy_data = data.copy()
    for col in columns:
        mask = np.random.rand(len(data)) < flip_prob
        noisy_data.loc[mask, col] = ~noisy_data.loc[mask, col]
    return noisy_data

In [4]:
def instance_based_augmentation(df, copies, diagnosis_col, numeric_cols, bool_cols, flip_prob=0.05):
    """
    Create multiple copies of the data, each with instance-based noise added.
    """
    augmented_list = []
    for i in range(copies):
        # For each copy, add numeric and boolean noise.
        noisy_numeric = add_numeric_noise_by_group(df, diagnosis_col, numeric_cols)
        noisy_full = add_boolean_noise(noisy_numeric, bool_cols, flip_prob=flip_prob)
        noisy_full["copy_id"] = i  # (Optional) add an identifier for the copy
        augmented_list.append(noisy_full)
    return pd.concat(augmented_list, ignore_index=True)

numeric_columns = ["age", "TSH", "T3", "TT4", "T4U", "FTI"]

bool_columns = [
    "on_thyroxine", "query_on_thyroxine", "on_antithyroid_meds", "sick", 
    "pregnant", "thyroid_surgery", "I131_treatment", "query_hypothyroid", 
    "query_hyperthyroid", "lithium", "goitre", "tumor", "hypopituitary", "psych"
]

for col in bool_columns:
    df[col] = df[col].map({"t": True, "f": False})


In [5]:
def clean_data(df, numeric_cols):
    """
    Cleans the dataset by:
    - Removing outliers in numeric features
    - Handling NaN values
    - Ensuring logical consistency
    """
    
    # Define valid ranges based on domain knowledge
    valid_ranges = {
        "age": (0, 120),
        "TSH": (0.01, 10),
        "T3": (0.5, 5),
        "TT4": (50, 200),
        "T4U": (0.5, 2),
        "FTI": (40, 150),
    }

    # Clip numeric values within valid ranges
    for col in numeric_cols:
        if col in valid_ranges:
            df[col] = np.clip(df[col], valid_ranges[col][0], valid_ranges[col][1])

    # Drop rows where critical values are missing
    df = df.dropna(subset=["age", "TSH", "T3", "TT4"])  # Add other critical columns as needed

    # Ensure logical consistency (Example: Pregnancy & Sex)
    df.loc[df["pregnant"] == True, "sex"] = "F"  # Pregnant patients must be female

    return df

# Apply cleaning BEFORE augmentation
df_cleaned = clean_data(df, numeric_columns)


In [6]:
df_augmented = instance_based_augmentation(df, copies=10, diagnosis_col="target",
                                           numeric_cols=numeric_columns,
                                           bool_cols=bool_columns,
                                           flip_prob=0.05)
print("Instance-based augmentation done, shape:", df_augmented.shape)

Instance-based augmentation done, shape: (91720, 32)


In [7]:
def adjust_boolean_proportions(aug_df, orig_df, bool_cols):
    """
    Adjust each boolean column in the augmented dataframe to have the same global
    proportion of True values as in the original dataset.
    """
    adj_df = aug_df.copy()
    n_total = len(adj_df)
    for col in bool_cols:
        # Compute original and augmented proportions
        orig_prop = orig_df[col].mean()  # Since booleans are 0/1, mean equals proportion of True
        current_prop = adj_df[col].mean()
        desired_count = int(round(orig_prop * n_total))
        current_count = int(adj_df[col].sum())
        difference = desired_count - current_count

        # If we need to add True values:
        if difference > 0:
            # Identify indices where the value is False; randomly choose some to flip to True.
            false_indices = adj_df.index[~adj_df[col]]
            if len(false_indices) >= difference:
                indices_to_flip = np.random.choice(false_indices, size=difference, replace=False)
                adj_df.loc[indices_to_flip, col] = True
        # If we need to remove True values:
        elif difference < 0:
            true_indices = adj_df.index[adj_df[col]]
            if len(true_indices) >= abs(difference):
                indices_to_flip = np.random.choice(true_indices, size=abs(difference), replace=False)
                adj_df.loc[indices_to_flip, col] = False
    return adj_df

# Adjust boolean proportions in the augmented dataset.
# df_augmented_adjusted = adjust_boolean_proportions(df_augmented, df, bool_columns)
# df_augmented_adjusted


In [8]:
def adjust_numeric_variance(aug_df, orig_df, numeric_cols):
    """
    Adjust each numeric column in the augmented dataframe so that its global variance
    matches that of the original dataset.
    """
    adj_df = aug_df.copy()
    for col in numeric_cols:
        orig_mean = orig_df[col].mean()
        orig_var = orig_df[col].var()
        aug_mean = adj_df[col].mean()
        aug_var = adj_df[col].var()
        if aug_var > 0:
            scale = np.sqrt(orig_var / aug_var)
            # Adjust augmented values while keeping the overall mean anchored at orig_mean.
            adj_df[col] = orig_mean + (adj_df[col] - aug_mean) * scale
        else:
            print(f"Warning: Zero variance in augmented column {col}")
    return adj_df

# Example usage:

# df_numeric_noisy = adjust_numeric_variance(df_augmented, df, numeric_columns)
# df_numeric_noisy 

In [9]:
# First, adjust boolean proportions
df_augmented_adjusted = adjust_boolean_proportions(df_augmented, df, bool_columns)

# Then, adjust numeric variance on the already adjusted dataset
df_final_augmented = adjust_numeric_variance(df_augmented_adjusted, df, numeric_columns)

# Save to CSV
df_final_augmented.to_csv("aug3.csv", index=False)
