### Creating custom datasets to run in Kedro
In this notebook, some feature enginering is performed to obtain a dataset containing only the four major labels and another dataset with a more balaned distribution of labels.

### Imports and loads

In [1]:
import os
import numpy as np
import pandas as pd

In [3]:
drugs = pd.read_csv(os.path.join('..', 'data', '03_primary', 'all_drugs_dataset.csv'))

### 4-label dataset

In [47]:
major_labels = drugs.groupby('MATC_Code_Explanation').count().sort_values('MATC_Code_Short', ascending=False)['MATC_Code_Short'][:4]

In [48]:
major_labels

MATC_Code_Explanation
Antiinfectives for systemic use               2552
Nervous system                                1475
Antineoplastic and immunomodulating agents    1414
Cardiovascular system                         1047
Name: MATC_Code_Short, dtype: int64

In [8]:
major_drugs = drugs.loc[drugs.apply(lambda x: x['MATC_Code_Explanation'] in major_labels, axis=1)].reset_index()

In [20]:
major_labels_list = major_drugs['MATC_Code_Short'].unique()

In [21]:
major_drugs.to_csv(os.path.join('..', 'data', '03_primary', 'major_drugs_dataset.csv'), index=None)

### Label-balanced dataset

In [17]:
drugs.groupby('MATC_Code_Short').count().sort_values('CID')['CID']

MATC_Code_Short
H     108
P     141
O     145
S     152
M     174
I     277
V     288
G     305
R     317
D     346
B     425
A     753
C    1047
L    1414
N    1475
J    2552
Name: CID, dtype: int64

In [18]:
drugs.groupby('MATC_Code_Short').count().sort_values('CID')['CID'].median()

311.0

In [50]:
major_labels = drugs.groupby('MATC_Code_Explanation').count().sort_values('MATC_Code_Short', ascending=False)['MATC_Code_Short'][:5]
major_drugs = drugs.loc[drugs.apply(lambda x: x['MATC_Code_Explanation'] in major_labels, axis=1)].reset_index()
major_labels_list = major_drugs['MATC_Code_Short'].unique()
major_labels_list

array(['C', 'J', 'N', 'L', 'A'], dtype=object)

In [51]:
balanced_labels = drugs.loc[drugs.apply(lambda x: x['MATC_Code_Short'] not in major_labels_list, axis=1)].reset_index()

In [52]:
balanced_labels['MATC_Code_Short'].unique()

array(['B', 'G', 'D', 'I', 'O', 'R', 'S', 'P', 'V', 'M', 'H'],
      dtype=object)

In [53]:
for label in major_labels_list:
    label_df = drugs[drugs['MATC_Code_Short']==label].sample(311)
    balanced_labels = pd.concat([balanced_labels, label_df], axis=0)

In [54]:
balanced_labels.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4233 entries, 0 to 2713
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  2678 non-null   float64
 1   CID                    4233 non-null   int64  
 2   HBondAcceptorCount     4233 non-null   float64
 3   HBondDonorCount        4233 non-null   float64
 4   IsomericSMILES         4233 non-null   object 
 5   MolecularWeight        4233 non-null   float64
 6   LogP                   3134 non-null   float64
 7   RuleFive               4233 non-null   float64
 8   MATC_Code_Short        4233 non-null   object 
 9   MATC_Code_Explanation  4233 non-null   object 
dtypes: float64(6), int64(1), object(3)
memory usage: 363.8+ KB


In [55]:
balanced_labels.groupby('MATC_Code_Short').count().sort_values('CID')['CID']

MATC_Code_Short
H    108
P    141
O    145
S    152
M    174
I    277
V    288
G    305
A    311
C    311
J    311
L    311
N    311
R    317
D    346
B    425
Name: CID, dtype: int64

In [34]:
balanced_labels = balanced_labels.drop('index', axis=1)

In [56]:
balanced_labels.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4233 entries, 0 to 2713
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  2678 non-null   float64
 1   CID                    4233 non-null   int64  
 2   HBondAcceptorCount     4233 non-null   float64
 3   HBondDonorCount        4233 non-null   float64
 4   IsomericSMILES         4233 non-null   object 
 5   MolecularWeight        4233 non-null   float64
 6   LogP                   3134 non-null   float64
 7   RuleFive               4233 non-null   float64
 8   MATC_Code_Short        4233 non-null   object 
 9   MATC_Code_Explanation  4233 non-null   object 
dtypes: float64(6), int64(1), object(3)
memory usage: 363.8+ KB


In [58]:
#balanced_labels.to_csv(os.path.join('..', 'data', '03_primary', 'balanced_dataset.csv'), index=None)