# Data Augmentation

In [24]:
from imblearn.over_sampling import SMOTENC
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [25]:
file_path = "data/CRFs.csv"
data = pd.read_csv(file_path)
data = data.replace("n/a", np.nan).dropna()

In [26]:
def add_noise(series, noise_level):
    return series + np.random.normal(0, noise_level, size=series.shape)

In [27]:
data['SBP'] = add_noise(data['SBP'], 5)
data['DBP'] = add_noise(data['DBP'], 3)
data['Weight'] = add_noise(data['Weight'], 1.5)
data['Height'] = add_noise(data['Height'], 2)

In [28]:
data['SBP'] = np.clip(data['SBP'], 90, 200)
data['DBP'] = np.clip(data['DBP'], 50, 120)
data['Weight'] = np.clip(data['Weight'], 30, 200)
data['Height'] = np.clip(data['Height'], 140, 200)

In [29]:
X = data[['Age', 'Weight', 'Height', 'SBP', 'DBP', 'BMI', 'Smoker', 'Gender']]
y = data['Vascular event']
smote = SMOTENC(sampling_strategy={'myocardial infarction': 100, 'stroke': 100, 'syncope': 100}, random_state=42, k_neighbors=2, categorical_features=['Gender', 'Smoker'])
X_resampled, y_resampled = smote.fit_resample(X, y)
augmented_data = pd.DataFrame(X_resampled, columns=X.columns)
augmented_data['Vascular event'] = y_resampled

In [30]:
print("Balanced class distribution:")
print(augmented_data['Vascular event'].value_counts())

Balanced class distribution:
Vascular event
none                     104
myocardial infarction    100
stroke                   100
syncope                  100
Name: count, dtype: int64


In [34]:
num_stroke_samples = 50
stroke_samples = data.sample(num_stroke_samples).copy()
stroke_samples['Vascular event'] = 'stroke'
stroke_samples['SBP'] = np.random.uniform(140, 180, size=num_stroke_samples)  # High SBP
stroke_samples['DBP'] = np.random.uniform(80, 120, size=num_stroke_samples)  # High DBP
stroke_samples['BMI'] = np.random.uniform(25, 35, size=num_stroke_samples)  # Overweight range
stroke_samples['Age'] = np.random.uniform(60, 90, size=num_stroke_samples)  # Older age group
stroke_samples['Smoker'] = np.random.choice(data['Smoker'].unique(), size=num_stroke_samples)  # Random smoker status
stroke_samples['Gender'] = np.random.choice(data['Gender'].unique(), size=num_stroke_samples)  # Random gender


In [35]:
data = pd.concat([data, stroke_samples])

In [None]:
# Interpolation to create new synthetic samples
interpolated_samples = []
for _ in range(50):
    sample1 = data.sample(1).iloc[0]
    sample2 = data.sample(1).iloc[0]

    # Interpolate numerical values
    numeric_features = ['Age', 'Weight', 'Height', 'SBP', 'DBP', 'BMI']
    interpolated_sample = sample1.copy()
    interpolated_sample[numeric_features] = (sample1[numeric_features] + sample2[numeric_features]) / 2

    # Select categorical values randomly
    categorical_features = ['Smoker', 'Gender', 'Vascular event']
    for col in categorical_features:
        interpolated_sample[col] = np.random.choice([sample1[col], sample2[col]])

    interpolated_samples.append(interpolated_sample)

# Convert to DataFrame and concatenate
interpolated_df = pd.DataFrame(interpolated_samples, columns=data.columns)
data = pd.concat([data, interpolated_df])


TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [None]:
data['Vascular event'].value_counts().plot(kind='bar', title="Class Distribution")
plt.show()

In [None]:
data.to_csv("augmented_dataset.csv", index=False)
print("Augmented dataset saved to 'augmented_dataset.csv'")