# BELKA: Shrinking the dataset

Source : https://www.kaggle.com/code/shlomoron/belka-shrinking-the-dataset/notebook

Author : GREYSNOW


This notebook's purpose is to shrink the size of the dataset for the [BELKA competition](https://www.kaggle.com/competitions/leash-BELKA/discussion?sort=published) .  
Shrinking strategy:

1. No ID column.
2. binds columns saved in bytes.
3. buildingblock1_smiles/buildingblock2_smiles/buildingblock3_smiles columns saved as int16, with encoded indices of the building blocks. I saved the building blocks and their indices in separate dictionaries.
4. I transformed the protein/label columns into three label columns, one per protein, shrinking the dataset length by a factor of three. (The other columns have identical values within each group of three consecutive rows.)

NOTE: TPU is not intended for EDA and data manipulation. Using TPU notebooks for the RAM capacity is considered a misuse of the TPU resource by Kaggle rules. Also, as an avid user of TPU, it is in my interest that people don't misuse it. In creating this notebook, I tried to make minimal use of TPU, developing and debugging on a regular notebook, and I published this notebook only because I feel that on this specific occasion, it is in the community's best interest to get a normal-size dataset instead of the given bloated one. Please don't fork/rerun this notebook on TPU, and please don't use the TPU notebook resource the way I did here. I did it once, for the community, so that we all have a dataset we can work with. Thank you.

If I see too many forks, I will turn this notebook to private (the dataset would still be public so don't worry). I hope I don't need to do this, so please don't fork.


In [None]:
import numpy as np
import pandas as pd
from pyarrow.parquet import ParquetFile
import pickle
import os

In [None]:
# Display the file-level metadata object of the original training Parquet file
# (shown as the notebook cell output).
ParquetFile("/kaggle/input/leash-BELKA/train.parquet").metadata

In [None]:
# Display the schema of the original training Parquet file
# (shown as the notebook cell output).
ParquetFile("/kaggle/input/leash-BELKA/train.parquet").schema

In [None]:
# DEBUG selects a 30M-row prefix instead of all 295,246,830 rows.
# Both counts are multiples of 3, which the later reshapes to [-1, 3] require.
DEBUG = False
NUM_ROWS = 30_000_000 if DEBUG else 295_246_830

In [None]:
# Location of the original training set; passed to every train-side helper below.
dataset_path = "/kaggle/input/leash-BELKA/train.parquet"

### A quick verification that the id column is what we expect it to be


In [None]:
def id_eda(dataset_path, expected_rows=295246830):
    """Print the fraction of ``id`` values that equal their row position.

    Only the ``id`` column of the Parquet file is read. It is compared
    element-wise with ``0, 1, ..., expected_rows - 1``; a printed value of
    ``1.0`` means the column is just the row number and can be dropped.

    Args:
        dataset_path: Path of the Parquet file to inspect.
        expected_rows: Number of rows the file is expected to contain.
    """
    id_arr = pd.read_parquet(
        dataset_path, engine="pyarrow", columns=["id"]
    ).id.to_numpy()
    if len(id_arr) != expected_rows:
        # Arrays of different lengths cannot be compared element-wise, so
        # report the mismatch explicitly instead of attempting it.
        print(f"id column has {len(id_arr)} rows, expected {expected_rows}")
        return
    # np.arange creates the reference directly as an array; comparing against
    # a Python range would make NumPy convert it element by element first.
    print(np.mean(id_arr == np.arange(expected_rows)))


id_eda(dataset_path)

### The dataset consists of three rows of the same small molecule with binding labels to the three different proteins, followed by three rows of the next small molecule, etc. We will verify it for each relevant column along the way.


In [None]:
def protein_name_eda(dataset_path):
    """Print, per position in each 3-row group, the share of expected protein names.

    The ``protein_name`` column is viewed as rows of three consecutive
    entries. Three fractions are printed: how often position 0 is "BRD4",
    position 1 is "HSA" and position 2 is "sEH".
    """
    frame = pd.read_parquet(
        dataset_path, engine="pyarrow", columns=["protein_name"]
    )
    triplets = frame["protein_name"].to_numpy().reshape(-1, 3)
    for position, expected in enumerate(("BRD4", "HSA", "sEH")):
        print(np.mean(triplets[:, position] == expected))


protein_name_eda(dataset_path)

In [None]:
def get_binds(dataset_path):
    """Return the ``binds`` labels as a ``byte`` array with three columns.

    The first NUM_ROWS labels are kept, cast to NumPy's ``byte`` type and
    arranged so that each output row holds three consecutive input labels.
    """
    frame = pd.read_parquet(dataset_path, engine="pyarrow", columns=["binds"])
    labels = frame["binds"].to_numpy()[:NUM_ROWS]
    labels = labels.astype("byte")
    return labels.reshape(-1, 3)


binds = get_binds(dataset_path)

In [None]:
%%time

def get_unique_BB(dataset_path, col):
    """Build index mappings for the distinct building blocks of one column.

    The first NUM_ROWS values of ``col`` are read and grouped into rows of
    three consecutive entries. The first entry of each group is taken as the
    group's value; 'ERROR' is printed if the second or third entry of any
    group differs from it.

    Args:
        dataset_path: Path of the Parquet file to read.
        col: Name of the building-block column.

    Returns:
        A tuple ``(BBs_dict, BBs_dict_reverse)``. ``BBs_dict`` maps each
        distinct value to its position in the sorted unique values, and
        ``BBs_dict_reverse`` maps that position back to the value.
    """
    BBs = pd.read_parquet(dataset_path, engine='pyarrow', columns=[col])
    BBs = BBs[:NUM_ROWS]
    BBs = BBs.to_numpy()[:, 0]
    BBs_reshaped = np.reshape(BBs, [-1, 3])
    first_of_group = BBs_reshaped[:, 0]

    # Check that the 3 entries of every group are identical
    if not (first_of_group == BBs_reshaped[:, 1]).all():
        print('ERROR')
    if not (first_of_group == BBs_reshaped[:, 2]).all():
        print('ERROR')

    # Create the mapping dictionaries int <-> building block
    BBs_unique = list(np.unique(first_of_group))
    BBs_dict = {bb: index for index, bb in enumerate(BBs_unique)}
    BBs_dict_reverse = dict(enumerate(BBs_unique))
    return BBs_dict, BBs_dict_reverse

BBs_dict_1, BBs_dict_reverse_1 = get_unique_BB(dataset_path, 'buildingblock1_smiles')
print(len(BBs_dict_1))
BBs_dict_2, BBs_dict_reverse_2 = get_unique_BB(dataset_path, 'buildingblock2_smiles')
print(len(BBs_dict_2))
BBs_dict_3, BBs_dict_reverse_3 = get_unique_BB(dataset_path, 'buildingblock3_smiles')
print(len(BBs_dict_3))

In [None]:
%%time

def get_encoded(dataset_path, col, BBs_dict):
    """Encode one building-block column as int16 indices, one per 3-row group.

    The first NUM_ROWS values of ``col`` are read, the first value of every
    group of three consecutive rows is kept, and each kept value is looked up
    in ``BBs_dict``.
    """
    frame = pd.read_parquet(dataset_path, engine='pyarrow', columns=[col])
    values = frame[:NUM_ROWS][col].to_numpy()
    per_group = np.reshape(values, [-1, 3])[:, 0]
    codes = list(map(BBs_dict.__getitem__, per_group))
    return np.asarray(codes, dtype=np.int16)

encoded_BBs_1 = get_encoded(dataset_path, 'buildingblock1_smiles', BBs_dict_1)
encoded_BBs_2 = get_encoded(dataset_path, 'buildingblock2_smiles', BBs_dict_2)
encoded_BBs_3 = get_encoded(dataset_path, 'buildingblock3_smiles', BBs_dict_3)

In [None]:
def get_molecule_smiles(dataset_path):
    """Return one ``molecule_smiles`` value per group of three consecutive rows.

    With DEBUG set, the column is read from the CSV file that sits next to
    ``dataset_path`` (same name, ``.csv`` extension), limited to NUM_ROWS
    rows. Otherwise it is read from the Parquet file itself. In both cases
    at most NUM_ROWS values are used, matching the other loaders in this
    notebook. 'ERROR' is printed if the second or third value of any group
    differs from the first.

    Args:
        dataset_path: Path of the Parquet file.

    Returns:
        Array holding the first value of every group.
    """
    if DEBUG:
        # Swap the extension rather than slicing a fixed number of characters
        csv_path = f"{os.path.splitext(dataset_path)[0]}.csv"
        molecule_smiles = pd.read_csv(
            csv_path, usecols=["molecule_smiles"], nrows=NUM_ROWS
        )
    else:
        molecule_smiles = pd.read_parquet(
            dataset_path, engine="pyarrow", columns=["molecule_smiles"]
        )
    # Truncate so the result has the same length as the other loaders' output
    molecule_smiles = molecule_smiles.molecule_smiles.to_numpy()[:NUM_ROWS]
    molecule_smiles = np.reshape(molecule_smiles, [-1, 3])
    first_of_group = molecule_smiles[:, 0]
    if not (first_of_group == molecule_smiles[:, 1]).all():
        print("ERROR")
    if not (first_of_group == molecule_smiles[:, 2]).all():
        print("ERROR")
    return first_of_group


molecule_smiles = get_molecule_smiles(dataset_path)

In [None]:
# Load only the first two rows of the original train.csv and display them.
df = pd.read_csv("/kaggle/input/leash-BELKA/train.csv", nrows=2)
df.head()

In [None]:
# Assemble the compact training table: three encoded building-block columns,
# the molecule SMILES, and one binds column per protein.
data = dict(
    buildingblock1_smiles=encoded_BBs_1,
    buildingblock2_smiles=encoded_BBs_2,
    buildingblock3_smiles=encoded_BBs_3,
    molecule_smiles=molecule_smiles,
    binds_BRD4=binds[:, 0],
    binds_HSA=binds[:, 1],
    binds_sEH=binds[:, 2],
)
df = pd.DataFrame(data)
df.head(2)

In [None]:
# Write the compact training table to the working directory as both Parquet
# and CSV, without the DataFrame index.
df.to_parquet("train.parquet", index=False)
df.to_csv("train.csv", index=False)

In [None]:
# Save the training building-block <-> index dictionaries as pickle files.
try:
    os.mkdir("train_dicts")
except FileExistsError:
    # Only an already existing folder is tolerated; any other OS error
    # (e.g. missing permissions) is allowed to surface.
    print("Folder exist")

for _name, _mapping in (
    ("BBs_dict_1", BBs_dict_1),
    ("BBs_dict_2", BBs_dict_2),
    ("BBs_dict_3", BBs_dict_3),
    ("BBs_dict_reverse_1", BBs_dict_reverse_1),
    ("BBs_dict_reverse_2", BBs_dict_reverse_2),
    ("BBs_dict_reverse_3", BBs_dict_reverse_3),
):
    # The context manager closes (and thereby flushes) each file right away
    with open(f"train_dicts/{_name}.p", "bw") as _file:
        pickle.dump(_mapping, _file)

# For the test set


In [None]:
# Location of the original test set; passed to every test-side helper below.
test_path = "/kaggle/input/leash-BELKA/test.parquet"

# Load only the first two rows of the original test.csv and display them.
df = pd.read_csv("/kaggle/input/leash-BELKA/test.csv", nrows=2)
df.head()

In [None]:
# Display the file-level metadata object of the original test Parquet file
# (shown as the notebook cell output).
ParquetFile("/kaggle/input/leash-BELKA/test.parquet").metadata

In [None]:
def id_eda_test(dataset_path, first_id=295246830, expected_rows=1674896):
    """Print the fraction of test ``id`` values that continue the expected sequence.

    Only the ``id`` column of the Parquet file is read. It is compared
    element-wise with ``first_id, first_id + 1, ...`` over ``expected_rows``
    values; a printed value of ``1.0`` means the ids are exactly that
    consecutive range.

    Args:
        dataset_path: Path of the Parquet file to inspect.
        first_id: Expected id of the first row.
        expected_rows: Number of rows the file is expected to contain.
    """
    id_arr = pd.read_parquet(
        dataset_path, engine="pyarrow", columns=["id"]
    ).id.to_numpy()
    if len(id_arr) != expected_rows:
        # Arrays of different lengths cannot be compared element-wise, so
        # report the mismatch explicitly instead of attempting it.
        print(f"id column has {len(id_arr)} rows, expected {expected_rows}")
        return
    # np.arange creates the reference directly as an array; comparing against
    # a Python range would make NumPy convert it element by element first.
    print(np.mean(id_arr == np.arange(first_id, first_id + expected_rows)))


id_eda_test(test_path)

The length of the test set is not divisible by 3. So, for some small molecules, we need to predict only one or two proteins.


In [None]:
# Collapse the test set to one entry per distinct molecule and record, for
# each molecule, which of the three proteins appear in its rows.
# Both columns are fetched in a single read of the file.
test_columns = pd.read_parquet(
    test_path, engine="pyarrow", columns=["molecule_smiles", "protein_name"]
)
molecule_smiles = test_columns.molecule_smiles.to_numpy()
protein_name = test_columns.protein_name.to_numpy()

first_unique_molecule_smiles_indices = []
molecule_smiles_unique = {}  # molecule -> list of all row indices where it occurs
is_BRD4 = {}
is_HSA = {}
is_sEH = {}
# Lets the loop set the right flag without repeating one `if` per protein
flags_by_protein = {"BRD4": is_BRD4, "HSA": is_HSA, "sEH": is_sEH}

for i, x in enumerate(molecule_smiles):
    if x not in molecule_smiles_unique:
        # New molecule: remember the row where it was seen for the 1st time
        molecule_smiles_unique[x] = []
        first_unique_molecule_smiles_indices.append(i)
        is_BRD4[x] = False
        is_HSA[x] = False
        is_sEH[x] = False
    molecule_smiles_unique[x].append(i)
    # Protein names other than the three known ones leave all flags untouched
    flags = flags_by_protein.get(protein_name[i])
    if flags is not None:
        flags[x] = True

first_unique_molecule_smiles_indices = np.asarray(first_unique_molecule_smiles_indices)
# Number of distinct molecules, then how many of them have each protein
print(len(is_BRD4))
print(sum(is_BRD4.values()))
print(sum(is_HSA.values()))
print(sum(is_sEH.values()))

# Sanity check: the molecules picked at first-occurrence rows are all distinct
molecule_smiles_unique_arr = molecule_smiles[first_unique_molecule_smiles_indices]
print(len(np.unique(molecule_smiles_unique_arr)) == len(molecule_smiles_unique_arr))

In [None]:
# Turn the per-molecule protein flags into arrays that follow the iteration
# order of molecule_smiles_unique, then print how many flags are set in each.
is_BRD4_arr, is_HSA_arr, is_sEH_arr = (
    np.asarray([flags[smiles] for smiles in molecule_smiles_unique])
    for flags in (is_BRD4, is_HSA, is_sEH)
)

print(is_BRD4_arr.sum())
print(is_HSA_arr.sum())
print(is_sEH_arr.sum())

In [None]:
def get_unique_BB_test(dataset_path, col):
    """Build index mappings for the distinct values of one test-set column.

    Returns a tuple ``(value -> index, index -> value)`` where the index is
    the position of the value in the sorted unique values of ``col``.
    """
    frame = pd.read_parquet(dataset_path, engine="pyarrow", columns=[col])
    distinct = list(np.unique(frame[col].to_numpy()))
    forward = {bb: index for index, bb in enumerate(distinct)}
    backward = dict(enumerate(distinct))
    return forward, backward


BBs_dict_1_test, BBs_dict_reverse_1_test = get_unique_BB_test(
    test_path, "buildingblock1_smiles"
)
print(len(BBs_dict_1_test))

BBs_dict_2_test, BBs_dict_reverse_2_test = get_unique_BB_test(
    test_path, "buildingblock2_smiles"
)
print(len(BBs_dict_2_test))

BBs_dict_3_test, BBs_dict_reverse_3_test = get_unique_BB_test(
    test_path, "buildingblock3_smiles"
)
print(len(BBs_dict_3_test))

In [None]:
def get_encoded_test(dataset_path, col, BBs_dict):
    """Encode one test-set column as int16 indices, one per distinct molecule.

    Only the rows listed in the module-level
    ``first_unique_molecule_smiles_indices`` are kept; each kept value is
    looked up in ``BBs_dict``.
    """
    frame = pd.read_parquet(dataset_path, engine="pyarrow", columns=[col])
    selected = frame[col].to_numpy()[first_unique_molecule_smiles_indices]
    codes = list(map(BBs_dict.__getitem__, selected))
    return np.asarray(codes, dtype=np.int16)


encoded_BBs_1_test = get_encoded_test(
    test_path, "buildingblock1_smiles", BBs_dict_1_test
)
encoded_BBs_2_test = get_encoded_test(
    test_path, "buildingblock2_smiles", BBs_dict_2_test
)
encoded_BBs_3_test = get_encoded_test(
    test_path, "buildingblock3_smiles", BBs_dict_3_test
)

In [None]:
# Assemble the compact test table: three encoded building-block columns,
# the molecule SMILES, and one presence flag per protein.
data = dict(
    buildingblock1_smiles=encoded_BBs_1_test,
    buildingblock2_smiles=encoded_BBs_2_test,
    buildingblock3_smiles=encoded_BBs_3_test,
    molecule_smiles=molecule_smiles_unique_arr,
    is_BRD4=is_BRD4_arr,
    is_HSA=is_HSA_arr,
    is_sEH=is_sEH_arr,
)
df = pd.DataFrame(data)
df.head(2)

In [None]:
# Write the compact test table to the working directory as both Parquet
# and CSV, without the DataFrame index.
df.to_parquet("test.parquet", index=False)
df.to_csv("test.csv", index=False)

In [None]:
# Save the test building-block <-> index dictionaries as pickle files.
try:
    os.mkdir("test_dicts")
except FileExistsError:
    # Only an already existing folder is tolerated; any other OS error
    # (e.g. missing permissions) is allowed to surface.
    print("Folder exist")

for _name, _mapping in (
    ("BBs_dict_1_test", BBs_dict_1_test),
    ("BBs_dict_2_test", BBs_dict_2_test),
    ("BBs_dict_3_test", BBs_dict_3_test),
    ("BBs_dict_reverse_1_test", BBs_dict_reverse_1_test),
    ("BBs_dict_reverse_2_test", BBs_dict_reverse_2_test),
    ("BBs_dict_reverse_3_test", BBs_dict_reverse_3_test),
):
    # The context manager closes (and thereby flushes) each file right away
    with open(f"test_dicts/{_name}.p", "bw") as _file:
        pickle.dump(_mapping, _file)

In [None]:
# Save the molecule -> row-indices dictionary of the test set; the context
# manager makes sure the file is closed once the dump is complete.
with open("test_dicts/molecule_smiles_unique.p", "bw") as _file:
    pickle.dump(molecule_smiles_unique, _file)