In [4]:
import pandas as pd
import numpy as np
from random import random

# Read the source CSV into the DataFrame that the augmentation steps start from
df = pd.read_csv(filepath_or_buffer="thyroid_cleaned.csv")

def add_noise(value, noise_level=0.1):
    """Return *value* perturbed by zero-mean Gaussian noise, rounded to 2 decimals.

    The noise standard deviation is ``abs(noise_level * value)``, so the
    perturbation scales with the magnitude of the input and a value of zero
    is returned as zero.

    Args:
        value: A numeric scalar. Missing values (as judged by ``pd.isnull``)
            and strings are returned unchanged.
        noise_level: Noise standard deviation expressed as a fraction of the
            magnitude of ``value``.

    Returns:
        The noisy value rounded to two decimal places, or ``value`` itself
        when it is missing or a string.
    """
    # Only numerical values are altered; missing entries and strings pass through.
    if pd.isnull(value) or isinstance(value, str):
        return value
    # np.random.normal rejects a negative scale, so derive the standard
    # deviation from the magnitude; otherwise negative inputs would raise.
    scale = abs(noise_level * value)
    return round(value + np.random.normal(0, scale), 2)

def flip_binary(value, prob=0.05):
    """Swap a "t"/"f" flag with probability *prob*.

    Any value other than the strings "t" and "f" is returned unchanged, and
    no random number is drawn for it.
    """
    if value not in ("t", "f"):
        return value
    should_flip = random() < prob
    if should_flip:
        return "f" if value == "t" else "t"
    return value

def introduce_missing(value, prob=0.02):
    """Return None with probability *prob*; otherwise return *value* as is."""
    if random() < prob:
        return None
    return value

# Build the augmented dataset on a copy so the loaded frame `df` is not modified
augmented_df = df.copy()

# Numeric columns: perturb every listed column that is present (noise level 0.1)
numeric_cols = ["TSH", "T3", "TT4", "T4U", "FTI"]
for col in [name for name in numeric_cols if name in augmented_df.columns]:
    augmented_df[col] = augmented_df[col].apply(add_noise, noise_level=0.1)

# Binary "t"/"f" columns: flip every listed column that is present (probability 0.05)
binary_cols = [
    "on_thyroxine",
    "query_on_thyroxine",
    "on_antithyroid_meds",
    "sick",
    "pregnant",
    "thyroid_surgery",
    "I131_treatment",
    "query_hypothyroid",
    "query_hyperthyroid",
    "lithium",
    "goitre",
    "tumor",
    "hypopituitary",
    "psych",
]
for col in [name for name in binary_cols if name in augmented_df.columns]:
    augmented_df[col] = augmented_df[col].apply(flip_binary, prob=0.05)

# Every column: blank out individual cells at random (probability 0.02 per cell)
for col in augmented_df.columns:
    augmented_df[col] = augmented_df[col].apply(introduce_missing, prob=0.02)

# Show the first rows of the augmented data
print(augmented_df.head())

# Write the augmented dataset to a new CSV file, without the index column
augmented_df.to_csv("augmented_thyroid_data.csv", index=False)


    age sex on_thyroxine query_on_thyroxine on_antithyroid_meds sick pregnant  \
0  29.0   F            f                  f                   f    f        f   
1  29.0   F            f                  f                   f    f        f   
2  41.0   F            f                  f                   f    f        f   
3  36.0   F            f                  f                   f    f        f   
4  32.0   F            f                  f                   f    f        f   

  thyroid_surgery I131_treatment query_hypothyroid  ...    T3 TT4_measured  \
0               f              f                 t  ...   NaN            f   
1               f              f                 f  ...  1.84            t   
2               f              f                 f  ...   NaN            f   
3               f              f                 f  ...   NaN            f   
4               f              f                 f  ...   NaN            f   

      TT4 T4U_measured T4U FTI_measured FTI 