In [None]:
# Auto-reload imported modules before each cell runs, so edits to project code
# (e.g. the apopfail package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import umap.plot
from apopfail.model import clean

# umap.plot needs the plotting extras of umap-learn: pip install 'umap-learn[plot]'
from umap import UMAP

# Seaborn "ticks" style for every figure in this notebook.
sns.set_style("ticks")

# Ask scikit-learn transformers to return pandas objects instead of NumPy arrays.
sklearn.set_config(transform_output="pandas")

# Seed NumPy's global RNG so code that draws from it is repeatable across runs.
np.random.seed(0)

In [None]:
# Feature matrix, stored as parquet.
X = pd.read_parquet(
    "../data/train_set_p53mutant.parquet",
)

# Labels: skip the file's first row, use the first column as the index and name
# the remaining column "target".
y = pd.read_csv(
    "../data/train_labels_p53mutant.csv",
    names=["target"],
    skiprows=1,
    index_col=0,
)

In [None]:
# Rebinds X and y to the output of clean(); the raw frames loaded in the previous
# cell are no longer reachable afterwards.
# NOTE(review): re-running this cell feeds already-cleaned data back into
# clean() — confirm that clean() is idempotent, or re-run the loading cell first.
X, y = clean(X, y)

In [None]:
# Relative frequency of each target value after cleaning.
y.value_counts(normalize=True)

In [None]:
from apopfail.model import get_pipeline

# Display what get_pipeline() builds; the next cell creates its own instance to fit.
get_pipeline()

In [None]:
# Build the pipeline and fit it on the cleaned data; X_processed is the
# transformed feature matrix that the resampling experiments below operate on.
pipe = get_pipeline()
X_processed = pipe.fit_transform(X, y)

In [None]:
# Pull the fitted "reducer" step out of the pipeline.
# NOTE(review): the variable name and the attributes read in this notebook
# (n_components_, singular_values_, explained_variance_ratio_) suggest a
# PCA-like estimator — confirm against get_pipeline().
pca = pipe.named_steps["reducer"]

# Presumably the number of components the fitted reducer kept.
pca.n_components_

In [None]:
# Scree-style plot: squared singular value of each component, in the order the
# fitted reducer stores them. Axis labels and a title are set so the figure can
# be read on its own; the trailing ";" hides the return value's repr.
sns.lineplot(pca.singular_values_**2).set(
    xlabel="Component index",
    ylabel="Squared singular value",
    title="Squared singular values of the fitted reducer",
);

In [None]:
# Cumulative share of variance explained as components are added. The two
# horizontal reference lines mark the 95% and 99% thresholds and are labelled in
# the legend so the colours do not have to be looked up in the code.
g = sns.lineplot(pca.explained_variance_ratio_.cumsum())
g.axhline(0.95, color="red", label="95% of variance")
g.axhline(0.99, color="green", label="99% of variance")
g.set(
    xlabel="Component index",
    ylabel="Cumulative explained variance ratio",
    title="Cumulative explained variance",
)
g.legend();

In [None]:
# choosing the sampling strategy

# value_counts() sorts by descending frequency, so position 1 is the second most
# common target value — the minority class when the target is binary.
current_ratio = y.value_counts(normalize=True).iloc[1]
current_ratio

In [None]:
# Target class balance handed to the resamplers as `sampling_strategy`.
# NOTE(review): imbalanced-learn reads a float sampling_strategy as
# minority / majority sample counts, whereas current_ratio is computed as
# minority / total — confirm the two are being compared on the same scale.
desired_ratio = 0.1

In [None]:
def print_ratios(y):
    """Print the class counts and the share of the least frequent class.

    Parameters
    ----------
    y : pandas.Series or pandas.DataFrame
        Target labels; any object exposing pandas' ``value_counts``.

    Notes
    -----
    The minority share is the minimum of the normalised counts. For a binary
    target this equals the second entry of the (descending) value counts, but
    it stays correct with more than two classes and does not raise when only
    one class is present.
    """
    print(y.value_counts())
    # The smallest normalised count is the minority share, however many classes exist.
    print(y.value_counts(normalize=True).min())

In [None]:
# Baseline class balance before any resampling.
print_ratios(y)

In [None]:
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import (
    EditedNearestNeighbours,
    RandomUnderSampler,
    TomekLinks,
)
from sklearn.neighbors import NearestNeighbors

# Neighbour search shared by the over-samplers. Parallelism is configured here
# (n_jobs=-1) and reaches the samplers through this estimator.
# NOTE(review): when an estimator object (rather than an int) is passed,
# imbalanced-learn appears to use it as given, so n_neighbors=10 may include the
# query sample itself — confirm this is the intended neighbourhood size.
k_neighbors = NearestNeighbors(n_neighbors=10, n_jobs=-1)

# A float sampling_strategy is read by imbalanced-learn as the desired
# minority / majority sample-count ratio after resampling.
smote = SMOTE(random_state=0, sampling_strategy=desired_ratio, k_neighbors=k_neighbors)
tomek = TomekLinks(n_jobs=-1)
enn = EditedNearestNeighbours(n_jobs=-1)

# Registry of the resampling strategies to compare, keyed by display name.
samplers = {
    "SMOTE": smote,
    # ADASYN's own `n_jobs` argument is deprecated since imbalanced-learn 0.10;
    # the documented replacement is a neighbours estimator that carries n_jobs,
    # which `k_neighbors` already does.
    "ADASYN": ADASYN(
        random_state=0,
        sampling_strategy=desired_ratio,
        n_neighbors=k_neighbors,
    ),
    "RandomUnderSampler": RandomUnderSampler(
        random_state=0, sampling_strategy=desired_ratio
    ),
    "TomekLinks": tomek,
    "EditedNearestNeighbors": enn,
    # The combined samplers reuse the SMOTE / cleaning configurations above.
    "SMOTEENN": SMOTEENN(smote=smote, enn=enn),
    "SMOTETomek": SMOTETomek(smote=smote, tomek=tomek),
}

In [None]:
# Lay the panels out on a near-square grid, one panel per sampler.
num_samplers = len(samplers)
cols = math.ceil(math.sqrt(num_samplers))
rows = math.ceil(num_samplers / cols)

# squeeze=False always returns a 2-D array of axes, so flatten() also works when
# the grid collapses to a single panel (plt.subplots would otherwise return a
# bare Axes object for a 1x1 grid).
fig, axs = plt.subplots(rows, cols, figsize=(15, 10), squeeze=False)
axs = axs.flatten()

for i, (name, sampler) in enumerate(samplers.items()):
    print("Using sampler", name)
    # Resample the already-transformed features and report the new class balance.
    X_resampled, y_resampled = sampler.fit_resample(X_processed, y)
    print_ratios(y_resampled)
    # 2-D UMAP embedding of the resampled data, coloured by label.
    # NOTE(review): no random_state is passed to UMAP, so the embedding may
    # differ between runs — confirm whether that matters for this comparison.
    mapper = UMAP(n_components=2).fit(X_resampled)
    ax = axs[i]
    umap.plot.points(mapper, labels=y_resampled, ax=ax)
    ax.set_title(name)

# Remove the grid cells that received no sampler. Slicing by the sampler count
# avoids depending on the loop variable surviving the loop above.
for unused_ax in axs[num_samplers:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
# Assumes the ../output directory already exists.
plt.savefig("../output/sampling_strategies.svg", bbox_inches="tight")
plt.show()