In [1]:
import pandas as pd
import numpy as np

# Read the input CSV into the base frame that every augmented copy starts from
file_path = "/mnt/data/thyroid_cleaned.csv"
df = pd.read_csv(file_path)

# Columns that receive Gaussian noise
numerical_cols = ["TSH", "T3", "TT4", "T4U", "FTI"]
# Columns in which values are randomly replaced by NaN
missing_value_cols = ["sex", "T4U", "FTI"]

# Per-column noise scale: one tenth of that column's standard deviation
noise_levels = 0.1 * df[numerical_cols].std()

# Clipping bounds per column, remembered together with the frame they came from
_bounds_cache = {"frame": None, "by_column": {}}


def _column_bounds(col_name):
    """Return ``(min, max)`` of ``df[col_name]``, computed once per column.

    The cached bounds are discarded whenever the module-level ``df`` is rebound
    to a different object. In-place edits of ``df`` are not detected; this
    script never mutates ``df`` (it only works on copies).
    """
    if _bounds_cache["frame"] is not df:
        _bounds_cache["frame"] = df
        _bounds_cache["by_column"] = {}
    by_column = _bounds_cache["by_column"]
    if col_name not in by_column:
        column = df[col_name]
        by_column[col_name] = (column.min(), column.max())
    return by_column[col_name]


def add_noise(value, col_name):
    """Return ``value`` with Gaussian noise added, clipped to the observed range.

    Args:
        value: A single cell value taken from column ``col_name``.
        col_name: Column name, used to look up the noise scale in the
            module-level ``noise_levels`` and the clipping bounds in ``df``.

    Returns:
        ``value`` unchanged when it is null. Otherwise ``value`` plus one draw
        from a normal distribution with mean 0 and standard deviation
        ``noise_levels[col_name]``, clipped to the minimum and maximum of
        ``df[col_name]``.
    """
    if pd.isnull(value):
        return value  # Missing values stay missing
    # Bounds come from a cache: rescanning the whole column for every single
    # value made the augmentation quadratic in the number of rows.
    lower, upper = _column_bounds(col_name)
    noise = np.random.normal(0, noise_levels[col_name])
    return np.clip(value + noise, lower, upper)

# Randomly replace a single value with NaN; the caller decides which columns
def introduce_missing(value, prob=0.02):
    """Return NaN with probability ``prob``, otherwise ``value`` unchanged.

    Args:
        value: The cell value to keep or blank out.
        prob: Chance of returning NaN, compared against one uniform draw
            from ``[0, 1)``.

    Returns:
        ``np.nan`` when the draw is below ``prob``; ``value`` otherwise.
    """
    draw = np.random.random()
    if draw < prob:
        return np.nan
    return value

# Number of noisy copies of the dataset to generate
NUM_COPIES = 10
# Probability that a value in a missing-value column is replaced by NaN
MISSING_PROB = 0.02

# Generate 10x more data: every pass produces one perturbed copy of df.
# The result holds only the perturbed copies; the original rows are not added.
augmented_data = []
for _ in range(NUM_COPIES):
    temp_df = df.copy()

    # Apply noise to numerical columns. The column name is passed to add_noise
    # through `args` instead of being captured by a lambda defined inside the
    # loop, which avoids the late-binding closure pitfall.
    for col in numerical_cols:
        temp_df[col] = temp_df[col].apply(add_noise, args=(col,))

    # Introduce missing values only in specific columns
    for col in missing_value_cols:
        temp_df[col] = temp_df[col].apply(introduce_missing, prob=MISSING_PROB)

    augmented_data.append(temp_df)

# Combine all augmented datasets under a fresh consecutive index
augmented_df = pd.concat(augmented_data, ignore_index=True)

# Save to CSV without writing the index column
output_file = "/mnt/data/augmented_thyroid_data_optimized.csv"
augmented_df.to_csv(output_file, index=False)

print(f"Augmented dataset saved to {output_file}")


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/thyroid_cleaned.csv'