In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [3]:
data = pd.read_csv("../data/raw/HIV.csv")
columns = data.columns
print("Shape of the data:", data.shape)
print("Columns:\n", columns)
print("Class distribution:\n", data["HIV_active"].value_counts())

Shape of the data: (41127, 3)
Columns:
 Index(['smiles', 'activity', 'HIV_active'], dtype='object')
Class distribution:
 HIV_active
0    39684
1     1443
Name: count, dtype: int64


In [4]:
X = data.drop(columns=["HIV_active"], axis=1)
y = data["HIV_active"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
print("Shape of X_train, X_test, y_train, y_test:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Shape of X_train, X_test, y_train, y_test: (32901, 2) (8226, 2) (32901,) (8226,)


In [6]:
train_data = pd.DataFrame(pd.concat([X_train, y_train], axis=1), columns=columns)
test_data = pd.DataFrame(pd.concat([X_test, y_test], axis=1), columns=columns)

print("Shape of train and test dataframes", train_data.shape, test_data.shape)
print("Head of train dataframe:\n", train_data.head())
print("Head of test dataframe:\n", test_data.head())

Shape of train and test dataframes (32901, 3) (8226, 3)
Head of train dataframe:
                                                   smiles activity  HIV_active
29361                   O=C(O)CC(NC(=O)OCc1ccccc1)C(=O)O       CI           0
10448               O=[N+]([O-])c1ccc(Nc2ccccc2)c2nonc12       CI           0
31039    CCOC(=O)C(=NNc1ccc(C)cc1)N1C(=S)N(C)N=C(C)C=C1S       CI           0
1311                     N#CSC1=C(SC#N)C(=O)c2ccccc2C1=O       CI           0
27834  COc1cc(C2C3=C(COC3=O)OC(C)(C)Oc3cc4c(cc32)OCO4...       CI           0
Head of test dataframe:
                                                   smiles activity  HIV_active
24748  O=C1C(=Cc2ccc(O)cc2)N=C(c2ccccc2)N1n1c(-c2cccc...       CI           0
25440  Cc1c(CCN=P(c2ccccc2)(c2ccccc2)c2ccccc2)c(=O)n2...       CI           0
9889                    O=C1OC(=O)C2C3CCCCC3=C3CCCCC3C12       CI           0
17972           CC(=O)NC(ON1C(=O)COc2ccccc21)C(Cl)(Cl)Cl       CI           0
15331  COc1ccc(CCN(C)CCCC(C#N)(c2cc

In [9]:
print("Class distribution for train data:\n", train_data["HIV_active"].value_counts() / train_data.shape[0])
print("Class distribution for test data:\n", test_data["HIV_active"].value_counts() / test_data.shape[0])

Class distribution for train data:
 HIV_active
0    0.965168
1    0.034832
Name: count, dtype: float64
Class distribution for test data:
 HIV_active
0    0.963895
1    0.036105
Name: count, dtype: float64


In [12]:
test_data.to_csv("../data/raw/HIV_test.csv", index=None)
print("Test data frame saved to a `.csv` file")

Test data frame saved to a `.csv` file


In [15]:
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
train_data = pd.DataFrame(pd.concat([X_resampled, y_resampled], axis=1), columns=columns)

print("Class distribution for train data:\n", train_data["HIV_active"].value_counts() / train_data.shape[0])

Class distribution for train data:
 HIV_active
0    0.5
1    0.5
Name: count, dtype: float64


In [16]:
print("Head of training data:", train_data.head())
print("Statistical description:", train_data.describe())

Head of training data:                                               smiles activity  HIV_active
0                   O=C(O)CC(NC(=O)OCc1ccccc1)C(=O)O       CI           0
1               O=[N+]([O-])c1ccc(Nc2ccccc2)c2nonc12       CI           0
2    CCOC(=O)C(=NNc1ccc(C)cc1)N1C(=S)N(C)N=C(C)C=C1S       CI           0
3                    N#CSC1=C(SC#N)C(=O)c2ccccc2C1=O       CI           0
4  COc1cc(C2C3=C(COC3=O)OC(C)(C)Oc3cc4c(cc32)OCO4...       CI           0
Statistical description:          HIV_active
count  63510.000000
mean       0.500000
std        0.500004
min        0.000000
25%        0.000000
50%        0.500000
75%        1.000000
max        1.000000


In [20]:
train_data.to_csv("../data/raw/HIV_train.csv", index=None)
print("Train Data exported to a `.csv` file successfully!")

Train Data exported to a `.csv` file successfully!
