# How to use the SMILES corrector
This notebook demonstrates how to create a training set and how to train and apply the model.

# Setup

In [None]:
import torch
import os

import pandas as pd
import numpy as np

import random

from src.preprocess import standardize, train_valid_test_split, remove_long_sequences
from src.invalidSMILES import get_invalid_smiles
from src.modelling import initialize_model, train_model, correct_SMILES

# the tutorial runs on a shortened version of the dataset
short = True

# one fixed seed shared by the error generation and the transformer
# initialisation, so that repeated runs give the same results
SEED = 42
random.seed(SEED)

# Create standardized dataset

In [None]:
# directories & file for basing training & evaluation datasets on
folder_raw = "RawData/"
folder_out = "Data/"
data_source = "PAPYRUS.csv"

# indicate maximum length of sequences
threshold = 200

# name of the raw dataset without its file extension
source_name = data_source.split('.')[0]
# single definition of the cache location: the existence check, the load and
# the save all use this path, so they can never point at different files
standardized_path = f"{folder_out}{source_name}_{threshold}_standardized.csv"

# create standardized dataset if not already present
if os.path.exists(standardized_path):
    # load the previously saved standardized SMILES
    df = pd.read_csv(
        standardized_path,
        usecols=["STD_SMILES"],
        header=0,
        index_col=None,
    )

else:
    # standardize
    df = standardize(folder_raw, data_source, short=short)
    # remove long sequences
    df = remove_long_sequences(df, subset="STD_SMILES", threshold=threshold)
    # make sure the output folder exists before writing into it
    os.makedirs(folder_out, exist_ok=True)
    # save standardized dataframe
    df.to_csv(standardized_path, index=False)
    # NOTE(review): this leaves `df` as a Series, whereas the cached branch
    # above yields a single-column DataFrame - confirm that the downstream
    # code accepts both
    df = df['STD_SMILES']

# from here on data_source is the extension-less name plus the length threshold
data_source = f"{source_name}_{threshold}"


# Create synthetic invalid SMILES

In [None]:
# indicate types of errors to create model for
invalid_type = "multiple"
# number of errors to introduce; forwarded to get_invalid_smiles
num_errors = 12

# folder and file holding the pairs of valid and synthetic invalid SMILES
errors_dir = f"{folder_out}errors"
errors_path = f"{errors_dir}/{data_source}_{invalid_type}_{num_errors}_errors.csv"

# create synthetic invalid SMILES if not already present
if os.path.exists(errors_path):
    # Load dataset of invalid and valid SMILES
    df = pd.read_csv(
        errors_path,
        usecols=["STD_SMILES", "ERROR"],
        header=0,
        index_col=None,
    )

else:
    # fragments file: only its first column is read, and it is named FRAGMENT
    df_frag = pd.read_csv(f"{folder_raw}gbd_8.csv",
                          names=["FRAGMENT"],
                          usecols=[0],
                          header=0).dropna()
    print(df_frag)
    # takes few minutes when using ray on ~24 CPUs
    df = get_invalid_smiles(df, df_frag, SEED, invalid_type, num_errors)

    # remove long sequences, for both the valid and the invalid SMILES
    df = remove_long_sequences(df, subset="STD_SMILES", threshold=threshold)
    df = remove_long_sequences(df, subset="ERROR", threshold=threshold)

    # exist_ok avoids the separate existence check (and the race it implies)
    os.makedirs(errors_dir, exist_ok=True)

    df.to_csv(errors_path, index=False)
    print(df)


# Split dataset into train & validation sets

In [None]:
# folder and shared file name prefix of the split files
split_dir = f"{folder_out}errors/split"
split_prefix = f"{split_dir}/{data_source}_{invalid_type}_{num_errors}_errors"

# the split is only created when the training file does not exist yet
if not os.path.exists(f"{split_prefix}_train.csv"):
    # the third part returned by the split is not used in this notebook
    train, valid, _ = train_valid_test_split(df, SEED=SEED)
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)
    # the validation part is stored under the "_dev" suffix
    train.to_csv(f"{split_prefix}_train.csv", index=False)
    valid.to_csv(f"{split_prefix}_dev.csv", index=False)

# Initialize SMILES corrector model

In [None]:
# source of de novo generated errors
error_source = 'Data/papyrus_rnn_XS.csv'

# gpu to use
gpu = '0'

# restrict the visible GPUs before any torch call in this cell, so the setting
# is in place before CUDA can be initialised
# (original author's note: define this in test.py)
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # only query the current device when CUDA is usable: the call raises on a
    # CPU-only machine, which would make the CPU fallback above unreachable
    print(torch.cuda.current_device())

# returns the model, an output identifier and the source field, all of which
# are passed on to train_model and correct_SMILES further below
model, out, SRC = initialize_model(
    folder_out,
    data_source,
    error_source,
    device,
    threshold=threshold,
    epochs=20,
    layers=3,
    batch_size=16,
    invalid_type=invalid_type,
    num_errors=num_errors,
)

# Use SMILES corrector
The SMILES corrector can be trained first, or used directly to fix invalid SMILES.

In [None]:
# train model; the return value replaces the freshly initialized `model`
# NOTE(review): the meaning of the third positional argument (False) is
# defined by src.modelling.train_model - confirm before changing it
model = train_model(model, out, False)

In [None]:
# fix errors
error_source = 'Data/papyrus_rnn_XS.csv'
# file name of the error source without folder and extension
error_name = error_source.split('/')[-1].split('.')[0]
# folder that receives the corrected SMILES
output_dir = "generated"

print(f"Fixing {error_name}")

# create the output folder up front, so a missing directory cannot make the
# save fail after the correction has already been computed
os.makedirs(output_dir, exist_ok=True)

# `valids` is not used further in this cell; it is kept for inspection
valids, df_output = correct_SMILES(model, out, error_source, device, SRC)

# the output file name combines the last path component of `out` with the
# name of the error source
df_output.to_csv(
    f"{output_dir}/{out.split('/')[-1]}_{error_name}_fixed.csv",
    index=False)