In [None]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from mordred import Calculator, descriptors

# 1. Train/test split

In [2]:
# Load the raw records; generate_train_test_file below splits this frame by default.
dataset = pd.read_csv("../data_for_modeling/raw_data/v1/HDA2-unspec-removed.csv")

In [3]:
# Number of rows in the raw dataset.
len(dataset)

779

In [4]:
def generate_train_test_file(output_file_path, random_state, data=None):
    """Split the data 80/20, binarise ``FINAL_LABEL`` and write both parts to CSV.

    The label value that sorts first among the *training* labels is encoded
    as 1 and every other value as 0 (this is what column 0 of a one-hot
    encoding fitted on the training labels contains). The same mapping is
    applied to the test split, so both files always share one encoding.

    Parameters
    ----------
    output_file_path : str
        Prefix the file names are appended to by plain string concatenation,
        so a directory must end with a path separator.
    random_state : int
        Seed forwarded to ``train_test_split``; also embedded in the file
        names ``HDAC2_train_<random_state>.csv`` and
        ``HDAC2_test_<random_state>.csv``.
    data : pandas.DataFrame, optional
        Frame to split. It must contain a ``FINAL_LABEL`` column. Defaults to
        the notebook-level ``dataset``.

    Returns
    -------
    None
        The two CSV files are the only result; ``data`` is not modified.
    """
    if data is None:
        # Keep the original behaviour of splitting the notebook-level frame.
        data = dataset

    train_file_path = output_file_path + "HDAC2_train_" + str(random_state) + ".csv"
    test_file_path = output_file_path + "HDAC2_test_" + str(random_state) + ".csv"

    train_dataset, test_dataset = train_test_split(
        data, test_size=0.2, random_state=random_state
    )

    # Derive the mapping once, from the training labels only. Re-deriving it
    # from the test labels could silently flip or collapse the encoding when
    # the test split does not contain every class.
    reference_class = np.unique(train_dataset['FINAL_LABEL'].to_numpy())[0]

    def _encode(frame):
        # 1 where the label equals the reference class, 0 otherwise, as integers.
        return frame['FINAL_LABEL'].eq(reference_class).astype(np.int64)

    # assign() builds new frames, so neither `data` nor the split views are
    # mutated and pandas raises no SettingWithCopyWarning.
    train_dataset = train_dataset.assign(FINAL_LABEL=_encode(train_dataset))
    test_dataset = test_dataset.assign(FINAL_LABEL=_encode(test_dataset))

    # Save train and test sets to files
    train_dataset.to_csv(train_file_path, index=False)
    test_dataset.to_csv(test_file_path, index=False)

In [5]:
# Write the train/test CSVs for random_state=1 into the v1 filter_data folder.
output_file_path = "../data_for_modeling/filter_data/v1/"
generate_train_test_file(output_file_path=output_file_path, random_state=1)

# 2. Generated data properties

In [9]:
def show_activity_distribution(label, dataset):
    """Print a table with the count and share of active and inactive rows.

    Parameters
    ----------
    label : str
        Name of the column to summarise; rows equal to 1 are counted as
        active and rows equal to 0 as inactive.
    dataset : pandas.DataFrame
        Frame that contains the ``label`` column.

    Returns
    -------
    None
        The table is printed; nothing is returned.

    Notes
    -----
    Percentages are relative to the total number of rows, so rows whose
    label is neither 0 nor 1 lower both shares. An empty frame reports 0 %
    for both classes instead of raising ``ZeroDivisionError``.
    """
    active_count = int((dataset[label] == 1).sum())
    inactive_count = int((dataset[label] == 0).sum())
    dataset_length = len(dataset)

    def _percentage(count):
        # Guard the division so an empty frame still yields a table.
        return count / dataset_length * 100 if dataset_length else 0.0

    print("Total dataset")
    # The header lists exactly the columns the rows provide; surplus header
    # names would be dropped by tabulate anyway.
    table = [['', 'Active', 'Inactive'],
             ['Number', active_count, inactive_count],
             ['Percentage (%)', _percentage(active_count), _percentage(inactive_count)]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [12]:
# Load previously written train/test files and summarise the training labels.
# NOTE(review): these file names do not follow the "HDAC2_train_<random_state>.csv" /
# "HDAC2_test_<random_state>.csv" pattern written by generate_train_test_file, so they
# are presumably produced outside this notebook — confirm where they come from.
train_dataset = pd.read_csv('../data_for_modeling/filter_data/v1/HDAC2_train_unspec_removed(1).csv')
test_dataset = pd.read_csv('../data_for_modeling/filter_data/v1/HDAC2_test_unspec_removed.csv')
show_activity_distribution('FINAL_LABEL', train_dataset)

Total dataset
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 399      │   397      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  50.1256 │    49.8744 │
╘════════════════╧══════════╧════════════╛


In [13]:
# Same label summary for the test file loaded above.
show_activity_distribution('FINAL_LABEL', test_dataset)

Total dataset
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 102      │    54      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  65.3846 │    34.6154 │
╘════════════════╧══════════╧════════════╛
