# PCA Augmentation

In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the combined mutation / CNV table.
file_path = "../data/combined_mutation_CNV.csv"
data = pd.read_csv(file_path)

# Separate features and target variable. "Sample" is excluded from the
# features as well (presumably a per-row identifier -- TODO confirm).
X = data.drop(columns=["Subtype", "Sample"])
y = data["Subtype"]

# Standardize the features so every column contributes on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fraction of the total variance the retained components must explain.
VARIANCE_THRESHOLD = 0.99
# Seed forwarded to PCA; it only matters if scikit-learn selects a randomized
# SVD solver, in which case it keeps the projection identical across re-runs.
RANDOM_STATE = 42

# Fit a full PCA once, purely to inspect how the variance accumulates.
pca_full = PCA()
pca_full.fit(X_scaled)
explained_variance_ratio = pca_full.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Smallest number of leading components whose cumulative explained variance
# reaches the threshold. searchsorted(side="left") returns the first index with
# cumulative >= threshold; the clamp covers the case where rounding keeps the
# final cumulative value just below the threshold.
n_components_needed = min(
    int(np.searchsorted(cumulative_explained_variance, VARIANCE_THRESHOLD, side="left")) + 1,
    len(cumulative_explained_variance),
)
# Legacy alias: the historical name says "90" although the threshold is 99%.
n_components_90 = n_components_needed
print(
    f"Number of components needed to explain {VARIANCE_THRESHOLD:.0%} of the variance: "
    f"{n_components_needed}"
)

# Refit PCA keeping only the selected components; this is the model used for
# projecting the data and, later, for mapping synthetic samples back.
pca = PCA(n_components=n_components_needed, random_state=RANDOM_STATE)
X_pca = pca.fit_transform(X_scaled)

# Generate synthetic samples by perturbing the principal components
def generate_synthetic_samples(X_pca, num_samples=1000, noise_scale=0.1, random_state=None):
    """Create synthetic samples by adding Gaussian noise to randomly chosen rows.

    Each synthetic sample is a copy of one row of ``X_pca`` (picked uniformly
    at random, with replacement) plus independent zero-mean Gaussian noise on
    every coordinate.

    Parameters
    ----------
    X_pca : array-like
        Source samples, one per row (e.g. PCA-projected data).
    num_samples : int, default 1000
        Number of synthetic samples to generate.
    noise_scale : float, default 0.1
        Standard deviation of the Gaussian perturbation.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. When ``None`` the global
        ``np.random`` state is used, so ``np.random.seed`` still controls the
        output.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_samples, *X_pca.shape[1:])``. An empty request
        returns a correctly shaped empty array rather than a 1-D ``(0,)`` one,
        so it can be passed straight to ``inverse_transform``.

    Raises
    ------
    ValueError
        If ``num_samples`` is negative, or if ``X_pca`` has no rows while
        samples are requested.
    """
    X_pca = np.asarray(X_pca)
    if num_samples < 0:
        raise ValueError("num_samples must be non-negative")
    if num_samples == 0:
        return np.empty((0,) + X_pca.shape[1:], dtype=float)
    if X_pca.shape[0] == 0:
        raise ValueError("X_pca must contain at least one sample to perturb")

    # The np.random module and RandomState expose the same randint/normal API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Draw every source row and every perturbation in one vectorized call.
    source_rows = rng.randint(0, X_pca.shape[0], size=num_samples)
    noise = rng.normal(0.0, noise_scale, size=(num_samples,) + X_pca.shape[1:])

    # Fancy indexing copies the selected rows, so X_pca itself is never modified.
    return X_pca[source_rows] + noise


# Seed NumPy's global generator (the one generate_synthetic_samples draws from
# by default) so re-running the notebook reproduces the same augmented file.
np.random.seed(42)

NUM_SYNTHETIC_SAMPLES = 1000

# Every synthetic row is a perturbed copy of one real sample, so it must carry
# that sample's subtype. Generating per subtype guarantees this; assigning
# subtypes at random would decouple the labels from the features.
# The multinomial draw splits the total across subtypes in proportion to their
# frequency, which matches picking source rows uniformly from the whole table.
class_counts = y.value_counts()
samples_per_class = np.random.multinomial(
    NUM_SYNTHETIC_SAMPLES, (class_counts / class_counts.sum()).to_numpy()
)

synthetic_pca_parts = []
synthetic_labels = []
for subtype, n_new in zip(class_counts.index, samples_per_class):
    if n_new == 0:
        continue  # nothing to generate for this subtype
    # Boolean mask is positional: y and X_pca share the row order of `data`.
    class_rows = X_pca[(y == subtype).to_numpy()]
    synthetic_pca_parts.append(
        generate_synthetic_samples(class_rows, num_samples=int(n_new))
    )
    synthetic_labels.extend([subtype] * int(n_new))

synthetic_samples_pca = np.vstack(synthetic_pca_parts)

# Inverse transform the synthetic samples back to the original feature space:
# first undo the PCA projection, then undo the standardization.
synthetic_samples = scaler.inverse_transform(
    pca.inverse_transform(synthetic_samples_pca)
)

# Create a DataFrame for the synthetic samples, labelled with the subtype of
# the real samples they were derived from.
synthetic_df = pd.DataFrame(synthetic_samples, columns=X.columns)
synthetic_df["Subtype"] = synthetic_labels

# Combine the synthetic samples with the original dataset.
# NOTE(review): `data` has a "Sample" column that `synthetic_df` lacks, so the
# synthetic rows end up with a missing "Sample" value after concatenation.
augmented_data = pd.concat([data, synthetic_df], ignore_index=True)

# Save the augmented dataset to a new CSV file
augmented_data.to_csv("../data/augmented_combined_mutation_CNV.csv", index=False)


Number of components needed to explain 99% of the variance: 37


# SMOTE Augmentation

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE

In [None]:
# Re-read the combined mutation / CNV table and discard the "Sample" column
# in a single chained expression.
file_path = "../data/combined_mutation_CNV.csv"
data = pd.read_csv(file_path).drop(columns=["Sample"])

# SMOTE Function to generate additional data
def apply_smote(data, target_column, n_samples=50):
    """Oversample the rarest class of ``data`` with SMOTE.

    Only the class with the fewest rows is touched; it is grown until it has
    ``n_samples`` rows in total. All other classes are left as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns and the target column.
    target_column : str
        Name of the class-label column.
    n_samples : int, default 50
        Desired total number of rows for the minority class after resampling
        (a target size, not the number of rows to add).

    Returns
    -------
    pandas.DataFrame
        Feature columns followed by ``target_column``. If the minority class
        already has at least ``n_samples`` rows, the input rows are returned
        unchanged in that same column layout with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the minority class has fewer than two rows, because SMOTE
        interpolates between a sample and its same-class neighbours.
    """
    X = data.drop(columns=[target_column])
    y = data[target_column]

    class_counts = y.value_counts()
    minority_class = class_counts.idxmin()
    minority_count = int(class_counts.min())

    # Nothing to do: SMOTE rejects a target below the current class size, so
    # treat "already large enough" as a no-op instead of an error.
    if minority_count >= n_samples:
        return pd.concat([X, y], axis=1).reset_index(drop=True)

    if minority_count < 2:
        raise ValueError(
            f"Class {minority_class!r} has only {minority_count} sample(s); "
            "SMOTE needs at least 2 samples of a class to interpolate."
        )

    # SMOTE's default of 5 neighbours requires at least 6 minority rows; shrink
    # the neighbourhood for smaller classes so they can still be oversampled.
    k_neighbors = min(5, minority_count - 1)

    smote = SMOTE(
        sampling_strategy={minority_class: n_samples},
        k_neighbors=k_neighbors,
        random_state=42,
    )
    X_res, y_res = smote.fit_resample(X, y)
    return pd.concat([X_res, y_res], axis=1)

# Oversample the table loaded in this cell (`data`, with "Sample" already
# dropped). The PCA section's `augmented_data` is not suitable here: it still
# carries the "Sample" column, which is missing for its synthetic rows.
# n_samples is the target size of the minority class, not the number of rows
# added. Each call grows only the currently rarest subtype, so calling twice
# brings the two rarest subtypes up to 50 rows each.
balanced_data = apply_smote(data, 'Subtype', n_samples=50)
balanced_data = apply_smote(balanced_data, 'Subtype', n_samples=50)

balanced_data
