# Data splitting

This notebook generates the train-test splits, both with and without the SMOTE oversampling technique applied to the training set.

In [1]:
import os
import numpy as np
from collections import Counter
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [2]:
import warnings
warnings.filterwarnings(action='ignore', category=pd.errors.PerformanceWarning)

# Load dataset

In [3]:
# Read every fingerprint table (tab-separated) in one pass; the order of the
# targets on the left matches the order of the paths on the right.
(
    ecfp4_dataframe,
    rdkit_dataframe,
    maccs_dataframe,
    mhfp6_dataframe,
    erg_dataframe,
    chemphys_dataframe,
) = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "../data/fingerprints/ecfp4.tsv",
        "../data/fingerprints/rdkit.tsv",
        "../data/fingerprints/maccs.tsv",
        "../data/fingerprints/mhfp6.tsv",
        "../data/fingerprints/erg.tsv",
        "../data/fingerprints/chemphys.tsv",
    )
)

In [4]:
# Remove rows that contain +inf or -inf in any column (prominent in the BPI column).
# `axis=1` is passed by keyword: the positional form `.any(1)` is deprecated in
# pandas 1.x and is rejected by pandas 2.x.
has_inf = chemphys_dataframe.isin([np.inf, -np.inf]).any(axis=1)
chemphys_dataframe = chemphys_dataframe[~has_inf]
chemphys_dataframe.shape

  chemphys_dataframe = chemphys_dataframe[~chemphys_dataframe.isin([np.inf, -np.inf]).any(1)]


(107695, 37)

# Split the dataset into train-test

The split ratio chosen for this purpose was 80-20 (train-test). Since the labels are handled as integers during resampling, we convert the classes into integers as follows:
* gram-negative - 1
* gram-positive - 2
* acid-fast - 3
* unselective - 4

Since a high imbalance between the classes exists, we apply the SMOTE technique to rebalance the classes in the training split only; the test split keeps its original class distribution.

In [5]:
# Create the output directory for the split files; exist_ok=True makes this a
# no-op when the directory already exists, so the cell can be re-run safely.
os.makedirs('../data/splits', exist_ok=True)

In [6]:
def smote_base_sampling(df: pd.DataFrame, name: str) -> None:
    """Write a stratified 80/20 train-test split of ``df`` plus a SMOTE-balanced training set.

    Three CSV files are written to ``../data/splits``:

    * ``{name}_train.csv``       - original training split
    * ``{name}_test.csv``        - original test split
    * ``{name}_smote_train.csv`` - training split after SMOTE oversampling

    The input frame is not modified, so the function can be called repeatedly
    on the same DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Table with a ``cmp_id`` column (dropped before splitting), a ``label``
        column holding one of ``gram-negative``, ``gram-positive``,
        ``acid-fast`` or ``unselective``, and the feature columns.
    name : str
        Dataset name used in the log messages and as the output file prefix.

    Raises
    ------
    KeyError
        If ``df`` has no ``cmp_id`` or ``label`` column.
    ValueError
        If ``label`` contains a value outside the four known classes.
    """
    # Single source of truth for the label encoding; the reverse map is derived
    # from it so the two can never drift apart.
    label_to_int = {
        'gram-negative': 1,
        'gram-positive': 2,
        'acid-fast': 3,
        'unselective': 4
    }
    int_to_label = {code: label for label, code in label_to_int.items()}

    print(f'\n Processing {name} dataset')

    # Drop non-relevant columns. A new frame is created instead of dropping in
    # place, so the caller's DataFrame keeps its `cmp_id` column and re-running
    # the notebook cell does not fail with a KeyError.
    features = df.drop(columns=['cmp_id'])

    # Fail early with a clear message: unknown labels would otherwise be mapped
    # to NaN and only surface as an obscure error during resampling.
    is_known = features['label'].isin(list(label_to_int))
    if not is_known.all():
        unknown_labels = features.loc[~is_known, 'label'].unique().tolist()
        raise ValueError(
            f"Unknown label(s) in {name} dataset: {unknown_labels}"
        )

    # Split the data into training and testing sets
    train, test = train_test_split(
        features,
        test_size=0.2,  # 80% training and 20% testing
        random_state=42,  # to ensure that the split is always the same
        shuffle=True,
        stratify=features['label']  # keep the label distribution equal in both splits
    )

    # Saving the original splits
    train.to_csv(f'../data/splits/{name}_train.csv', index=False)
    test.to_csv(f'../data/splits/{name}_test.csv', index=False)

    print("Original dataset shape %s" % Counter(train['label']))

    # Separate features and integer-encoded labels for SMOTE. The encoding is
    # computed into a new Series rather than assigned back onto `train`, which
    # avoids writing into a frame derived from another one
    # (SettingWithCopyWarning).
    X_train = train.drop(columns=['label'])
    y_train = train['label'].map(label_to_int)

    # Apply SMOTE to the training data only
    sm = SMOTE(random_state=42)
    smote_sampled_train, smote_sampled_labels = sm.fit_resample(X_train, y_train)

    # Map the labels back to their original values
    smote_sampled_train['label'] = smote_sampled_labels
    smote_sampled_train['label'] = smote_sampled_train['label'].map(int_to_label)

    print("SMOTE dataset shape %s" % Counter(smote_sampled_train['label']))

    # Saving the SMOTE splits
    smote_sampled_train.to_csv(f'../data/splits/{name}_smote_train.csv', index=False)

In [7]:
# Fingerprint name -> loaded table; insertion order is the processing order.
fingerprint_datasets = {
    'chemphys': chemphys_dataframe,
    'erg': erg_dataframe,
    'ecfp4': ecfp4_dataframe,
    'rdkit': rdkit_dataframe,
    'maccs': maccs_dataframe,
    'mhfp6': mhfp6_dataframe,
}

# Generate the original and SMOTE splits for each fingerprint in turn.
for fingerprint_name, df in tqdm(fingerprint_datasets.items()):
    smote_base_sampling(df, fingerprint_name)

  0%|          | 0/6 [00:00<?, ?it/s]


 Processing chemphys dataset
Original dataset shape Counter({'gram-negative': 43974, 'gram-positive': 23187, 'acid-fast': 12771, 'unselective': 6224})
SMOTE dataset shape Counter({'gram-negative': 43974, 'gram-positive': 43974, 'unselective': 43974, 'acid-fast': 43974})


 17%|█▋        | 1/6 [00:02<00:13,  2.75s/it]


 Processing erg dataset
Original dataset shape Counter({'gram-negative': 21376, 'gram-positive': 11436, 'acid-fast': 6376, 'unselective': 3108})
SMOTE dataset shape Counter({'gram-negative': 21376, 'acid-fast': 21376, 'gram-positive': 21376, 'unselective': 21376})


 33%|███▎      | 2/6 [00:13<00:29,  7.34s/it]


 Processing ecfp4 dataset
Original dataset shape Counter({'gram-negative': 22198, 'gram-positive': 11611, 'acid-fast': 6386, 'unselective': 3114})
SMOTE dataset shape Counter({'gram-negative': 22198, 'unselective': 22198, 'acid-fast': 22198, 'gram-positive': 22198})


 50%|█████     | 3/6 [00:32<00:37, 12.54s/it]


 Processing rdkit dataset
Original dataset shape Counter({'gram-negative': 22198, 'gram-positive': 11611, 'acid-fast': 6386, 'unselective': 3114})
SMOTE dataset shape Counter({'gram-negative': 22198, 'unselective': 22198, 'acid-fast': 22198, 'gram-positive': 22198})


 67%|██████▋   | 4/6 [00:52<00:31, 15.52s/it]


 Processing maccs dataset
Original dataset shape Counter({'gram-negative': 22198, 'gram-positive': 11611, 'acid-fast': 6386, 'unselective': 3114})
SMOTE dataset shape Counter({'gram-negative': 22198, 'unselective': 22198, 'acid-fast': 22198, 'gram-positive': 22198})


 83%|████████▎ | 5/6 [00:56<00:11, 11.54s/it]


 Processing mhfp6 dataset
Original dataset shape Counter({'gram-negative': 22198, 'gram-positive': 11611, 'acid-fast': 6386, 'unselective': 3114})
SMOTE dataset shape Counter({'gram-negative': 22198, 'unselective': 22198, 'acid-fast': 22198, 'gram-positive': 22198})


100%|██████████| 6/6 [02:09<00:00, 21.54s/it]
