In [1]:
import pandas as pd
import numpy as np
from random import random

# Load the source dataset from "thyroidDF.csv" (path is relative to the
# current working directory) into a DataFrame used as the augmentation base.
df = pd.read_csv("thyroidDF.csv")

# Function to add noise to numerical values
def add_noise(value, noise_level=0.1):
    """Return *value* perturbed by zero-mean Gaussian noise, rounded to 2 decimals.

    The noise standard deviation is proportional to the magnitude of the
    value: ``abs(noise_level * value)``.

    Args:
        value: A numeric scalar. Missing values (as detected by
            ``pd.isnull``) and strings are returned unchanged.
        noise_level: Relative standard deviation of the noise
            (0.1 means 10% of the value's magnitude).

    Returns:
        The noisy value rounded to two decimal places, or *value* itself
        when it is missing or a string.
    """
    # Ensure only numerical values are altered.
    if pd.isnull(value) or isinstance(value, str):
        return value
    # np.random.normal rejects a negative scale, so use the magnitude;
    # otherwise any negative input would raise ValueError.
    scale = abs(noise_level * value)
    return round(value + np.random.normal(0, scale), 2)

# Function to randomly flip binary categorical values
def flip_binary(value, prob=0.05):
    """Swap a "t"/"f" flag to its opposite with probability *prob*.

    Any value other than the strings "t" and "f" is returned untouched,
    and no random number is drawn for it.
    """
    if value not in ("t", "f"):
        return value

    should_flip = random() < prob
    if not should_flip:
        return value

    if value == "f":
        return "t"
    return "f"

# Function to introduce missing values only in specific columns
def introduce_missing(value, prob=0.02):
    """Return ``None`` with probability *prob*; otherwise return *value* as is."""
    drop = random() < prob
    if drop:
        return None
    return value

# Columns to modify
# Columns that receive Gaussian noise via add_noise.
numeric_cols = ["TSH", "T3", "TT4", "T4U", "FTI"]
# "t"/"f" flag columns that are randomly flipped via flip_binary.
binary_cols = ["on_thyroxine", "query_on_thyroxine", "on_antithyroid_meds",
               "sick", "pregnant", "thyroid_surgery", "I131_treatment",
               "query_hypothyroid", "query_hyperthyroid", "lithium",
               "goitre", "tumor", "hypopituitary", "psych"]
# Columns in which values are randomly replaced with None via
# introduce_missing. "T4U" and "FTI" also appear in numeric_cols, so they
# get noise first and may then be blanked.
missing_value_cols = ["sex", "T4U", "FTI"]

# Build 10 perturbed copies of the dataset.
# Each entry pairs a group of columns with the element-wise transform applied
# to it; the order here (noise, flips, missing values) is the order of
# application within every copy.
transforms = [
    (numeric_cols, lambda x: add_noise(x, 0.1)),
    (binary_cols, lambda x: flip_binary(x, 0.05)),
    (missing_value_cols, lambda x: introduce_missing(x, 0.02)),
]

augmented_data = []
for _ in range(10):
    temp_df = df.copy()

    for columns, transform in transforms:
        for col in columns:
            # Silently skip columns that are absent from the dataset.
            if col not in temp_df.columns:
                continue
            temp_df[col] = temp_df[col].apply(transform)

    augmented_data.append(temp_df)

# Stack every perturbed copy into one DataFrame with a fresh 0..N-1 index.
final_augmented_df = pd.concat(augmented_data, ignore_index=True)

# Preview the first rows of the result.
print(final_augmented_df.head())

# Persist the augmented dataset, leaving the index out of the file.
final_augmented_df.to_csv("augmented_thyroid_data_unclean.csv", index=False)



   age sex on_thyroxine query_on_thyroxine on_antithyroid_meds sick pregnant  \
0   29   F            t                  t                   f    f        f   
1   29   F            f                  f                   f    f        f   
2   41   F            f                  f                   f    f        f   
3   36   F            f                  t                   f    f        t   
4   32   F            f                  f                   f    f        f   

  thyroid_surgery I131_treatment query_hypothyroid  ...     TT4 T4U_measured  \
0               f              f                 t  ...     NaN            f   
1               f              t                 f  ...  154.14            f   
2               f              f                 f  ...     NaN            f   
3               f              f                 f  ...     NaN            f   
4               f              f                 f  ...     NaN            f   

  T4U FTI_measured FTI TBG_measured   