# Preprocessing

In [20]:
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime


In [21]:
# ====================================================
# Directory settings
# ====================================================
import os

# Directory that receives the artifacts written by main() in this notebook
# (tokenizer.pth and train.json).
OUTPUT_DIR = 'input/inchi-preprocess'

# Every folder this cell guarantees to exist. Only OUTPUT_DIR is used further
# down in this notebook; the other folders are presumably consumed by
# follow-up notebooks -- TODO confirm.
_REQUIRED_DIRS = (
    OUTPUT_DIR,
    'input/pytorch-image-models',
    'input/pytorch-image-models/pytorch-image-models-master',
    'input/inchi-resnet-lstm-with-attention-starter',
)
for _directory in _REQUIRED_DIRS:
    # exist_ok=True keeps the cell idempotent on re-run and removes the
    # check-then-create race of `if not exists: makedirs`. Unlike the old
    # check, a regular *file* sitting at one of these paths now raises
    # FileExistsError instead of being silently accepted.
    os.makedirs(_directory, exist_ok=True)

# Fail fast, before any expensive work, when the competition files are missing
# from the working directory.
_REQUIRED_FILES = ('train_labels.csv', 'sample_submission.csv')
if not all(os.path.isfile(_name) for _name in _REQUIRED_FILES):
    raise Exception('Please unzip the contents of this archive and place the train_labels.csv and sample_submission.csv file in the root directory! https://www.kaggle.com/c/bms-molecular-translation')

In [22]:
# ====================================================
# Library
# ====================================================
import os
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import dill
tqdm.pandas()
import torch

# ====================================================
# Data Loading
# ====================================================
# Load the competition labels into a module-level DataFrame. main() further
# down reads the 'InChI' column and adds its derived columns to this same
# frame in place.
train = pd.read_csv('train_labels.csv')
# Print (rows, columns) as a quick sanity check that the expected file was read.
print(f'train.shape: {train.shape}')

# ====================================================
# Preprocess functions
# ====================================================
def split_form(form):
    """Split a chemical-formula string into space-separated tokens.

    The input is cut in front of every uppercase letter. Inside each piece the
    leading run of non-digit characters becomes one token, and whatever follows
    it (if anything) becomes a second token.

    Example: ``"C13H20OS"`` -> ``"C 13 H 20 O S"``.

    Args:
        form: Formula string, e.g. the second '/'-separated field of an InChI.

    Returns:
        The tokens joined by single spaces, with trailing spaces stripped.
        Characters in front of the first uppercase letter are ignored.
    """
    tokens = []
    for chunk in re.findall(r"[A-Z][^A-Z]*", form):
        # The chunk starts with an uppercase letter, so this match cannot fail.
        symbol = re.match(r"\D+", chunk).group()
        # Everything after the symbol; empty when the element has no count.
        count = chunk[len(symbol):]
        tokens.append(symbol)
        if count != "":
            tokens.append(count)
    return " ".join(tokens).rstrip(' ')

def split_form2(form):
    """Tokenize the '/'-separated layers that follow the formula in an InChI.

    The input is cut in front of every lowercase letter; that letter is treated
    as the layer prefix. Inside a layer, '/' characters are removed, every run
    of digits becomes one token, and each non-digit character that follows a
    digit run becomes its own single-character token.

    Example: ``"t11-/m0/s1"`` -> ``"/t 11 - /m 0 /s 1"``.

    Args:
        form: The InChI layers after the formula, joined by '/'.

    Returns:
        The layers rendered as ``"/<prefix> <tok> <tok> ..."`` and concatenated,
        with trailing spaces stripped.

    Note:
        Characters that sit between the layer prefix and the first digit of
        that layer are not emitted at all (e.g. the ``"(H,"`` of ``"h(H,1)"``).
    """
    layers = []
    for chunk in re.findall(r"[a-z][^a-z]*", form):
        prefix = chunk[0]
        # The rest of the chunk holds no lowercase letter, so dropping the
        # first character is all that is needed to remove the prefix.
        body = chunk[1:].replace('/', "")
        pieces = []
        for run in re.findall(r"[0-9]+[^0-9]*", body):
            digit_runs = re.findall(r'\d+', run)
            assert len(digit_runs) == 1, "len(num_list) != 1"
            number = digit_runs[0]
            if run == number:
                pieces.append(f"{number} ")
            else:
                # Whatever trails the number is split into single characters.
                trailing = run.replace(number, "")
                pieces.append(f"{number} {' '.join(trailing)} ")
        layers.append(f"/{prefix} {''.join(pieces)}")
    return "".join(layers).rstrip(' ')

# ====================================================
# Tokenizer
# ====================================================
class Tokenizer:
    """Vocabulary over space-separated tokens with id <-> token lookups.

    ``stoi`` maps token -> integer id and ``itos`` maps id -> token. Both are
    empty until ``fit_on_texts`` is called. The special tokens ``<sos>``,
    ``<eos>`` and ``<pad>`` always take the three highest ids, in that order.
    """

    def __init__(self):
        # Filled by fit_on_texts(); kept as plain dicts.
        self.stoi = {}
        self.itos = {}

    def __len__(self):
        """Return the vocabulary size, special tokens included."""
        return len(self.stoi)

    def fit_on_texts(self, texts):
        """Build the vocabulary from an iterable of space-separated strings.

        Corpus tokens are sorted and numbered from 0; ``<sos>``, ``<eos>`` and
        ``<pad>`` are appended after them. Any previous vocabulary is
        discarded, so ids are always contiguous in ``range(len(self))``.

        Args:
            texts: Iterable of strings whose tokens are separated by one space.
        """
        special_tokens = ['<sos>', '<eos>', '<pad>']
        vocab = set()
        for text in texts:
            vocab.update(text.split(' '))
        # A corpus token spelled like a special token must not occupy a second
        # slot, otherwise the id range would contain a hole.
        vocab.difference_update(special_tokens)
        ordered_tokens = sorted(vocab) + special_tokens
        # Rebuild both tables from scratch: updating the old dict in place
        # would keep stale tokens whose ids collide with the new ones.
        self.stoi = {token: index for index, token in enumerate(ordered_tokens)}
        self.itos = {index: token for token, index in self.stoi.items()}

    def text_to_sequence(self, text):
        """Encode one string as ``[<sos>, ids..., <eos>]``.

        Raises:
            KeyError: If a token of ``text`` is not in the vocabulary.
        """
        sequence = [self.stoi['<sos>']]
        sequence.extend(self.stoi[token] for token in text.split(' '))
        sequence.append(self.stoi['<eos>'])
        return sequence

    def texts_to_sequences(self, texts):
        """Encode every string in ``texts`` with ``text_to_sequence``."""
        return [self.text_to_sequence(text) for text in texts]

    def sequence_to_text(self, sequence):
        """Decode ids to their tokens, concatenated without any separator.

        Special-token ids are decoded like any other id.
        """
        return ''.join(self.itos[i] for i in sequence)

    def sequences_to_texts(self, sequences):
        """Decode every id sequence with ``sequence_to_text``."""
        return [self.sequence_to_text(sequence) for sequence in sequences]

    def predict_caption(self, sequence):
        """Decode ids up to, but not including, the first ``<eos>`` or ``<pad>``.

        A leading ``<sos>`` id is not removed; it is decoded as the literal
        ``'<sos>'`` text.
        """
        pieces = []
        for i in sequence:
            if i == self.stoi['<eos>'] or i == self.stoi['<pad>']:
                break
            pieces.append(self.itos[i])
        return ''.join(pieces)

    def predict_captions(self, sequences):
        """Decode every id sequence with ``predict_caption``."""
        return [self.predict_caption(sequence) for sequence in sequences]

train.shape: (2424186, 2)


# Main

In [23]:
# ====================================================
# main
# ====================================================
def main():
    """Tokenize the InChI labels, fit and save a tokenizer, and export the frame.

    Works on the module-level ``train`` DataFrame and adds three columns to it:

    * ``InChI_1``      - the second '/'-separated field of ``InChI``
    * ``InChI_text``   - ``split_form`` of that field, a space, then
      ``split_form2`` of the remaining fields
    * ``InChI_length`` - number of tokens in ``InChI_text`` (the ``<sos>`` and
      ``<eos>`` ids are not counted)

    Writes ``tokenizer.pth`` and ``train.json`` into ``OUTPUT_DIR``.
    """
    # ====================================================
    # turn every InChI into a space-separated token string
    # ====================================================
    train['InChI_1'] = train['InChI'].progress_apply(lambda inchi: inchi.split('/')[1])
    formula_text = train['InChI_1'].progress_apply(split_form)
    remaining_layers = train['InChI'].apply(lambda inchi: '/'.join(inchi.split('/')[2:]))
    # .values drops the index, so the concatenation below is purely positional
    layer_text = remaining_layers.progress_apply(split_form2).values
    train['InChI_text'] = formula_text + ' ' + layer_text
    # ====================================================
    # fit the tokenizer on all token strings and persist it
    # ====================================================
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train['InChI_text'].values)
    tokenizer_path = f"{OUTPUT_DIR}/tokenizer.pth"
    # dill is passed as the pickle module for serializing the Tokenizer instance
    torch.save(tokenizer, tokenizer_path, pickle_module=dill)
    print('Saved tokenizer')
    # ====================================================
    # record the token count of every sample and export
    # ====================================================
    progress = tqdm(train['InChI_text'].values, total=len(train))
    # text_to_sequence wraps the ids in <sos>/<eos>; subtract those two
    train['InChI_length'] = [
        len(tokenizer.text_to_sequence(text)) - 2 for text in progress
    ]
    train.to_json(f"{OUTPUT_DIR}/train.json")
    print('Saved preprocessed train.json')

In [None]:
# Run the whole preprocessing pipeline. Inside a notebook kernel __name__ is
# '__main__', so the guard is true there; it only prevents the run when this
# code is imported as a module.
if __name__ == '__main__':
    main()

  0%|          | 0/2424186 [00:00<?, ?it/s]

  0%|          | 0/2424186 [00:00<?, ?it/s]

  0%|          | 0/2424186 [00:00<?, ?it/s]