In [None]:
# Install librariess
!pip install supervenn
!pip install upsetplot

Collecting supervenn
  Downloading supervenn-0.5.0-py3-none-any.whl.metadata (1.2 kB)
Downloading supervenn-0.5.0-py3-none-any.whl (17 kB)
Installing collected packages: supervenn
Successfully installed supervenn-0.5.0
Collecting upsetplot
  Downloading UpSetPlot-0.9.0.tar.gz (23 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: upsetplot
  Building wheel for upsetplot (pyproject.toml) ... [?25ldone
[?25h  Created wheel for upsetplot: filename=UpSetPlot-0.9.0-py3-none-any.whl size=24817 sha256=38438c63fbe8cdb82838e6f2b166e6c71122c66477f9085774a295c50bfde16d
  Stored in directory: /root/.cache/pip/wheels/73/42/9f/1c9718ea27f30466d2787e0f7d88a7cb11942e3460c17e0ef6
Successfully built upsetplot
Installing collected packages: upsetplot
Successfully installed upsetplot-0.9.0


In [17]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm.auto import tqdm
from pathlib import Path
from supervenn import supervenn
import os
# Print at most three file paths from every directory below ../input.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('../input')
    for file_name in file_names[:3]
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


../input/leash-BELKA/sample_submission.csv
../input/leash-BELKA/train.parquet
../input/leash-BELKA/test.parquet


In [18]:
# Directory (relative to the working directory) that receives the generated files.
SAVEDIR = Path("train")
# Create it if needed; an already existing directory is not an error.
SAVEDIR.mkdir(parents=False, exist_ok=True)

In [19]:
# Values of the `protein_name` column; each one becomes its own label column
# once the data is pivoted to one row per molecule.
protein_names = [
    "BRD4",
    "HSA",
    "sEH",
]

# Names of the SMILES columns (three building blocks plus the full molecule).
smiles_col_names = [
    "buildingblock1_smiles", "buildingblock2_smiles", "buildingblock3_smiles",
    "molecule_smiles",
]

In [20]:
# Arrow schemas for the parquet files written by this notebook.

# Only the SMILES columns, all typed as strings.
schema_single_target = pa.schema(
    [(smiles_col, pa.string()) for smiles_col in smiles_col_names]
)

# SMILES columns followed by one int8 column per protein.
schema_mixed_target = pa.schema(
    [(smiles_col, pa.string()) for smiles_col in smiles_col_names]
    + [(protein, pa.int8()) for protein in protein_names]
)

# SMILES columns followed by a string `protein_name` and an int8 `binds` column.
schema_unprocessed = pa.schema(
    [(smiles_col, pa.string()) for smiles_col in smiles_col_names]
    + [("protein_name", pa.string()), ("binds", pa.int8())]
)

# (key, schema) pairs; each key names one output file.
KEYS_INFO = [
    ("all0", schema_single_target),
    ("all1", schema_single_target),
    ("all_mixed", schema_mixed_target),
    ("unprocessed", schema_unprocessed),
]

In [21]:
class PqDataSaverWrapper:
    """Context manager that appends pandas DataFrames to one parquet file per key.

    Entering the ``with`` block opens one ``pq.ParquetWriter`` per
    ``(key, schema)`` pair at ``<savedir>/<prefix>_<key>.parquet``; leaving the
    block closes all of them. ``add_data`` may only be called inside the block.

    Args:
        savedir: Directory for the output files, as ``str`` or ``Path``.
        keys_info: Iterable of ``(key, schema)`` pairs; keys must be unique.
        prefix: Prefix of every output file name.
    """

    def __init__(self, savedir=".", keys_info=KEYS_INFO, prefix="train"):
        # None means "not inside a with-block"; a dict key -> writer otherwise.
        self.file_handlers = None
        # key -> schema of the currently open writers (filled by __enter__).
        self._schemas = {}
        if isinstance(savedir, str):
            savedir = Path(savedir)
        self.savedir = savedir
        self.keys_info = keys_info
        self.prefix = prefix

    def add_data(self, df_dict):
        """Append every non-empty DataFrame in ``df_dict`` to the file of its key.

        Args:
            df_dict: Mapping ``key -> pandas.DataFrame``. Empty frames are skipped.

        Raises:
            RuntimeError: If called outside of the ``with`` block.
            ValueError: If a key was not declared in ``keys_info``. All keys are
                validated before anything is written, so an unknown key never
                causes a partial write.
        """
        if self.file_handlers is None:
            raise RuntimeError("Attempt to call 'add_data' outside of with-block")
        for key in df_dict:
            if key not in self.file_handlers:
                raise ValueError(f"Key should be equal to one of the provided in __init__ method, got '{key}' instead")
        for key, df in df_dict.items():
            if df.shape[0] > 0:
                # Convert against the declared schema so column order and Arrow
                # types always match the writer instead of relying on inference;
                # the pandas index is never written.
                pq_batch = pa.RecordBatch.from_pandas(
                    df, schema=self._schemas[key], preserve_index=False
                )
                self.file_handlers[key].write(pq_batch)

    def __enter__(self):
        """Open one writer per key and return ``self``.

        Raises:
            ValueError: If ``keys_info`` contains the same key twice.
            FileExistsError: If any target file already exists. This is checked
                for all keys before any file is created.
        """
        targets = [
            (key, schema, self.savedir / f"{self.prefix}_{key}.parquet")
            for key, schema in self.keys_info
        ]

        seen_keys = set()
        for key, _, _ in targets:
            if key in seen_keys:
                raise ValueError(f"Duplicate key '{key}' in keys_info")
            seen_keys.add(key)

        # Validate everything up front: failing half-way through would leave
        # freshly created files behind that block the next attempt.
        for key, _, filename in targets:
            if filename.exists():
                raise FileExistsError(f"File '{filename}' for key '{key}' already exists, remove it if you want to regenerate everything")

        handlers = {}
        try:
            for key, schema, filename in targets:
                handlers[key] = pq.ParquetWriter(filename.as_posix(), schema)
        except Exception:
            # __exit__ is not called when __enter__ raises, so release the
            # writers opened so far here; the original error is re-raised.
            self._close_all(handlers, reraise=False)
            raise

        self._schemas = {key: schema for key, schema, _ in targets}
        self.file_handlers = handlers
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close every writer; exceptions from the ``with`` body are not suppressed."""
        handlers, self.file_handlers = self.file_handlers, None
        self._schemas = {}
        if handlers:
            self._close_all(handlers)

    @staticmethod
    def _close_all(handlers, reraise=True):
        """Close all writers even if one fails; optionally re-raise the first error."""
        first_error = None
        for pq_writer in handlers.values():
            try:
                pq_writer.close()
            except Exception as err:
                if first_error is None:
                    first_error = err
        if reraise and first_error is not None:
            raise first_error

In [22]:
train_pq = pq.ParquetFile("../input/leash-BELKA/train.parquet")

# Long-format rows of molecules that do not yet have a value for every protein.
# They are prepended to the next row group so that a molecule whose rows are
# split across a row-group boundary is pivoted together.
partial_df = None
additional_records = []  # not used in this cell; kept in case another cell reads it
with PqDataSaverWrapper(savedir=SAVEDIR, keys_info=KEYS_INFO) as pq_wrapper:
    for i in tqdm(range(train_pq.num_row_groups), total=train_pq.num_row_groups):
        batch = train_pq.read_row_group(i)
        batch_df = batch.to_pandas()
        if partial_df is not None and partial_df.shape[0] > 0:
            batch_df = pd.concat([partial_df, batch_df], ignore_index=True)
            partial_df = None

        # Long -> wide: one row per molecule, one column per protein.
        batch_df = batch_df.pivot_table(
            columns="protein_name",
            values="binds",
            index=smiles_col_names).reset_index(drop=False)
        batch_df.columns.name = None
        # pivot_table only creates columns for proteins that occur in this batch;
        # reindex guarantees all protein columns exist (NaN when absent) and fixes
        # the column order, so the selections below cannot raise KeyError.
        batch_df = batch_df.reindex(columns=smiles_col_names + protein_names)
        missing_protein_data_ids = batch_df[protein_names].isnull().any(axis=1)

        new_data = {}
        if (~missing_protein_data_ids).any():
            # Split the complete molecules into: binds to no protein, binds to
            # all proteins, and everything else (kept with per-protein labels).
            all0_ids = batch_df[protein_names].max(axis=1) == 0
            all1_ids = batch_df[protein_names].min(axis=1) == 1
            new_data['all0'] = batch_df[(~missing_protein_data_ids) & all0_ids][smiles_col_names].reset_index(drop=True)
            new_data['all1'] = batch_df[(~missing_protein_data_ids) & all1_ids][smiles_col_names].reset_index(drop=True)
            all_mixed_df = batch_df[(~missing_protein_data_ids) & (~all1_ids) & (~all0_ids)].reset_index(drop=True)
            all_mixed_df[protein_names] = all_mixed_df[protein_names].astype(np.int8)
            new_data['all_mixed'] = all_mixed_df

        if missing_protein_data_ids.any():
            # Wide -> long again for incomplete molecules, keeping only the
            # values that are actually present, to retry with the next row group.
            partial_df = batch_df[missing_protein_data_ids].melt(
                id_vars=smiles_col_names,
                value_vars=protein_names,
                value_name="binds",
                var_name="protein_name"
            )
            partial_df = partial_df[~partial_df.binds.isnull()].reset_index(drop=True)
            partial_df['binds'] = partial_df['binds'].astype(np.int8)
        pq_wrapper.add_data(new_data)

    # Whatever is still incomplete after the last row group is stored as-is.
    if partial_df is not None and partial_df.shape[0] > 0:
        pq_wrapper.add_data({
            'unprocessed': partial_df
        })
        

  0%|          | 0/282 [00:00<?, ?it/s]