In [1]:
%%writefile spe_vectorizers.py

import pandas as pd
import numpy as np

import codecs
from SmilesPE.tokenizer import *
from SmilesPE.pretokenizer import atomwise_tokenizer
from SmilesPE.pretokenizer import kmer_tokenizer

from sklearn.feature_extraction.text import CountVectorizer


def spe_featurizer(train_data, test_data):
    """Create count features from SPE-tokenized SMILES strings.

    The SPE vocabulary is read from ``SPE_ChEMBL.txt`` (relative to the
    current working directory).  Every SMILES string is tokenized with
    ``SPE_Tokenizer`` and the whitespace-separated tokens are counted with a
    ``CountVectorizer`` whose vocabulary is learned from ``train_data`` only.

    Parameters
    ----------
    train_data, test_data
        Collections of SMILES strings exposing ``.apply`` (presumably pandas
        Series -- confirm against callers).

    Returns
    -------
    x_train, x_test
        Document-term matrices returned by the vectorizer for each input.
    train_vocab : list
        Tokens learned from the training data, in column order.
    """

    # load vocab; the context manager closes the handle once the tokenizer
    # has been built (assumes SPE_Tokenizer reads the file in its constructor)
    with codecs.open(r'SPE_ChEMBL.txt') as spe_vob:
        spe = SPE_Tokenizer(spe_vob)

    # split SMILES strings to tokens (one space-separated string per molecule)
    train_spe = train_data.apply(spe.tokenize)
    test_spe = test_data.apply(spe.tokenize)

    # split tokenized strings into tokens
    # transform dataset into a matrix of vectors
    vectorizer = CountVectorizer(preprocessor=None, stop_words=None,
                                 lowercase=False, tokenizer=str.split)

    x_train = vectorizer.fit_transform(train_spe)
    x_test = vectorizer.transform(test_spe)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and keep returning a plain list either way
    if hasattr(vectorizer, 'get_feature_names_out'):
        train_vocab = vectorizer.get_feature_names_out().tolist()
    else:
        train_vocab = vectorizer.get_feature_names()

    return x_train, x_test, train_vocab


def spe_featurizer2(train_data, test_data):
    """Create count features from SPE-tokenized SMILES with a fixed vocabulary.

    Unlike ``spe_featurizer`` the vectorizer vocabulary is not learned from
    the training data: every row of ``SPE_ChEMBL.txt`` is used as a feature,
    so the output has one column per vocabulary row.

    Parameters
    ----------
    train_data, test_data
        Collections of SMILES strings exposing ``.apply`` (presumably pandas
        Series -- confirm against callers).

    Returns
    -------
    x_train, x_test
        Document-term matrices returned by the vectorizer for each input.
    train_vocab : list
        Vocabulary terms in column order.
    """
    vocab_path = 'SPE_ChEMBL.txt'

    # load vocab: each row of the file becomes one vocabulary term
    # NOTE(review): rows are used verbatim as terms -- confirm that they match
    # the tokens produced by spe.tokenize, otherwise columns stay all-zero
    spe_vocab = pd.read_csv(vocab_path, header=None)
    spe_vocab = spe_vocab.rename(columns={0: 'fragments'})

    # the context manager closes the handle once the tokenizer has been built
    # (assumes SPE_Tokenizer reads the file in its constructor)
    with codecs.open(vocab_path) as spe_vob:
        spe = SPE_Tokenizer(spe_vob)

    # split SMILES strings to tokens (one space-separated string per molecule)
    train_spe = train_data.apply(spe.tokenize)
    test_spe = test_data.apply(spe.tokenize)

    # split tokenized strings into tokens
    # transform dataset into a matrix of vectors
    vectorizer = CountVectorizer(preprocessor=None, stop_words=None,
                                 lowercase=False, tokenizer=str.split,
                                 vocabulary=spe_vocab.fragments)

    # the vocabulary is fixed, so nothing is learned from the training data
    x_train = vectorizer.transform(train_spe)
    x_test = vectorizer.transform(test_spe)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and keep returning a plain list either way
    if hasattr(vectorizer, 'get_feature_names_out'):
        train_vocab = vectorizer.get_feature_names_out().tolist()
    else:
        train_vocab = vectorizer.get_feature_names()

    return x_train, x_test, train_vocab


def atom_featurizer(train_data, test_data):
    """Create count features from atom-level SMILES tokens.

    Every SMILES string is split with ``atomwise_tokenizer``; the tokens are
    joined with spaces and counted with a ``CountVectorizer`` whose vocabulary
    is learned from ``train_data`` only.

    Parameters
    ----------
    train_data, test_data
        Collections of SMILES strings exposing ``.apply`` (presumably pandas
        Series -- confirm against callers).

    Returns
    -------
    x_train, x_test
        Document-term matrices returned by the vectorizer for each input.
    train_vocab : list
        Tokens learned from the training data, in column order.
    """

    # split SMILES strings into tokens (one space-separated string per molecule)
    train_atom = train_data.apply(lambda smi: ' '.join(atomwise_tokenizer(smi)))
    test_atom = test_data.apply(lambda smi: ' '.join(atomwise_tokenizer(smi)))

    vectorizer = CountVectorizer(preprocessor=None, stop_words=None,
                                 lowercase=False, tokenizer=str.split)

    x_train = vectorizer.fit_transform(train_atom)
    x_test = vectorizer.transform(test_atom)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and keep returning a plain list either way
    if hasattr(vectorizer, 'get_feature_names_out'):
        train_vocab = vectorizer.get_feature_names_out().tolist()
    else:
        train_vocab = vectorizer.get_feature_names()

    return x_train, x_test, train_vocab


def kmer_featurizer(train_data, test_data):
    """Create count features from k-mer SMILES tokens.

    Every SMILES string is split with ``kmer_tokenizer`` (called with its
    default arguments); the tokens are joined with spaces and counted with a
    ``CountVectorizer`` whose vocabulary is learned from ``train_data`` only.

    Parameters
    ----------
    train_data, test_data
        Collections of SMILES strings exposing ``.apply`` (presumably pandas
        Series -- confirm against callers).

    Returns
    -------
    x_train, x_test
        Document-term matrices returned by the vectorizer for each input.
    train_vocab : list
        Tokens learned from the training data, in column order.
    """

    # split SMILES strings into tokens (one space-separated string per molecule)
    train_kmer = train_data.apply(lambda smi: ' '.join(kmer_tokenizer(smi)))
    test_kmer = test_data.apply(lambda smi: ' '.join(kmer_tokenizer(smi)))

    vectorizer = CountVectorizer(preprocessor=None, stop_words=None,
                                 lowercase=False, tokenizer=str.split)

    x_train = vectorizer.fit_transform(train_kmer)
    x_test = vectorizer.transform(test_kmer)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and keep returning a plain list either way
    if hasattr(vectorizer, 'get_feature_names_out'):
        train_vocab = vectorizer.get_feature_names_out().tolist()
    else:
        train_vocab = vectorizer.get_feature_names()

    return x_train, x_test, train_vocab

Writing spe_vectorizers.py


In [2]:
%%writefile spe_vectorizers_tfidf.py

import pandas as pd
import numpy as np

import codecs
from SmilesPE.tokenizer import *
from SmilesPE.pretokenizer import atomwise_tokenizer
from SmilesPE.pretokenizer import kmer_tokenizer

from sklearn.feature_extraction.text import TfidfVectorizer


def spe_featurizer_tfidf(train_data, test_data):
    """Create TF-IDF features from SPE-tokenized SMILES strings.

    The SPE vocabulary is read from ``SPE_ChEMBL.txt`` (relative to the
    current working directory).  Every SMILES string is tokenized with
    ``SPE_Tokenizer`` and the whitespace-separated tokens are weighted with a
    ``TfidfVectorizer`` fitted on ``train_data`` only.

    Parameters
    ----------
    train_data, test_data
        Collections of SMILES strings exposing ``.apply`` (presumably pandas
        Series -- confirm against callers).

    Returns
    -------
    x_train, x_test
        TF-IDF matrices returned by the vectorizer for each input.
    train_vocab : list
        Tokens learned from the training data, in column order.
    """

    # load vocab; the context manager closes the handle once the tokenizer
    # has been built (assumes SPE_Tokenizer reads the file in its constructor)
    with codecs.open(r'SPE_ChEMBL.txt') as spe_vob:
        spe = SPE_Tokenizer(spe_vob)

    # split SMILES strings to tokens (one space-separated string per molecule)
    train_spe = train_data.apply(spe.tokenize)
    test_spe = test_data.apply(spe.tokenize)

    # split tokenized strings into tokens
    # transform dataset into a matrix of vectors
    vectorizer = TfidfVectorizer(preprocessor=None, stop_words=None,
                                 lowercase=False, tokenizer=str.split)

    x_train = vectorizer.fit_transform(train_spe)
    x_test = vectorizer.transform(test_spe)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and keep returning a plain list either way
    if hasattr(vectorizer, 'get_feature_names_out'):
        train_vocab = vectorizer.get_feature_names_out().tolist()
    else:
        train_vocab = vectorizer.get_feature_names()

    return x_train, x_test, train_vocab


def spe_featurizer_tfidf2(train_data, test_data):
    """Create TF-IDF features from SPE-tokenized SMILES with a fixed vocabulary.

    Unlike ``spe_featurizer_tfidf`` the vectorizer vocabulary is not learned
    from the training data: every row of ``SPE_ChEMBL.txt`` is used as a
    feature.  The idf weights are still learned from ``train_data``.

    Parameters
    ----------
    train_data, test_data
        Collections of SMILES strings exposing ``.apply`` (presumably pandas
        Series -- confirm against callers).

    Returns
    -------
    x_train, x_test
        TF-IDF matrices returned by the vectorizer for each input.
    train_vocab : list
        Vocabulary terms in column order.
    """
    vocab_path = 'SPE_ChEMBL.txt'

    # load vocab: each row of the file becomes one vocabulary term
    # NOTE(review): rows are used verbatim as terms -- confirm that they match
    # the tokens produced by spe.tokenize, otherwise columns stay all-zero
    spe_vocab = pd.read_csv(vocab_path, header=None)
    spe_vocab = spe_vocab.rename(columns={0: 'fragments'})

    # the context manager closes the handle once the tokenizer has been built
    # (assumes SPE_Tokenizer reads the file in its constructor)
    with codecs.open(vocab_path) as spe_vob:
        spe = SPE_Tokenizer(spe_vob)

    # split SMILES strings to tokens (one space-separated string per molecule)
    train_spe = train_data.apply(spe.tokenize)
    test_spe = test_data.apply(spe.tokenize)

    # split tokenized strings into tokens
    # transform dataset into a matrix of vectors
    vectorizer = TfidfVectorizer(preprocessor=None, stop_words=None,
                                 lowercase=False, tokenizer=str.split,
                                 vocabulary=spe_vocab.fragments)

    # A fixed vocabulary does not remove the need to fit: the idf weights must
    # be learned from the training data, and calling transform() on an
    # unfitted TfidfVectorizer raises NotFittedError.
    x_train = vectorizer.fit_transform(train_spe)
    x_test = vectorizer.transform(test_spe)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and keep returning a plain list either way
    if hasattr(vectorizer, 'get_feature_names_out'):
        train_vocab = vectorizer.get_feature_names_out().tolist()
    else:
        train_vocab = vectorizer.get_feature_names()

    return x_train, x_test, train_vocab


def atom_featurizer_tfidf(train_data, test_data):
    """Create TF-IDF features from atom-level SMILES tokens.

    Every SMILES string is split with ``atomwise_tokenizer``; the tokens are
    joined with spaces and weighted with a ``TfidfVectorizer`` fitted on
    ``train_data`` only.

    Parameters
    ----------
    train_data, test_data
        Collections of SMILES strings exposing ``.apply`` (presumably pandas
        Series -- confirm against callers).

    Returns
    -------
    x_train, x_test
        TF-IDF matrices returned by the vectorizer for each input.
    train_vocab : list
        Tokens learned from the training data, in column order.
    """

    # split SMILES strings into tokens (one space-separated string per molecule)
    train_atom = train_data.apply(lambda smi: ' '.join(atomwise_tokenizer(smi)))
    test_atom = test_data.apply(lambda smi: ' '.join(atomwise_tokenizer(smi)))

    vectorizer = TfidfVectorizer(preprocessor=None, stop_words=None,
                                 lowercase=False, tokenizer=str.split)

    x_train = vectorizer.fit_transform(train_atom)
    x_test = vectorizer.transform(test_atom)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and keep returning a plain list either way
    if hasattr(vectorizer, 'get_feature_names_out'):
        train_vocab = vectorizer.get_feature_names_out().tolist()
    else:
        train_vocab = vectorizer.get_feature_names()

    return x_train, x_test, train_vocab


def kmer_featurizer_tfidf(train_data, test_data):
    """Create TF-IDF features from k-mer SMILES tokens.

    Every SMILES string is split with ``kmer_tokenizer`` (called with its
    default arguments); the tokens are joined with spaces and weighted with a
    ``TfidfVectorizer`` fitted on ``train_data`` only.

    Parameters
    ----------
    train_data, test_data
        Collections of SMILES strings exposing ``.apply`` (presumably pandas
        Series -- confirm against callers).

    Returns
    -------
    x_train, x_test
        TF-IDF matrices returned by the vectorizer for each input.
    train_vocab : list
        Tokens learned from the training data, in column order.
    """

    # split SMILES strings into tokens (one space-separated string per molecule)
    train_kmer = train_data.apply(lambda smi: ' '.join(kmer_tokenizer(smi)))
    test_kmer = test_data.apply(lambda smi: ' '.join(kmer_tokenizer(smi)))

    vectorizer = TfidfVectorizer(preprocessor=None, stop_words=None,
                                 lowercase=False, tokenizer=str.split)

    x_train = vectorizer.fit_transform(train_kmer)
    x_test = vectorizer.transform(test_kmer)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and keep returning a plain list either way
    if hasattr(vectorizer, 'get_feature_names_out'):
        train_vocab = vectorizer.get_feature_names_out().tolist()
    else:
        train_vocab = vectorizer.get_feature_names()

    return x_train, x_test, train_vocab

Writing spe_vectorizers_tfidf.py


In [7]:
%%writefile kaggle_chem.py

import pandas as pd
import numpy as np

from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
from rdkit.Chem import Descriptors

from rdkit import Chem
from rdkit.Chem import AllChem


def ecfp_featurizer(train, test):
    """Embed molecules with a pre-trained mol2vec model.

    Reads SMILES from the ``std_compounds`` column of both frames, builds
    mol2vec sentences (radius 1) and converts them to vectors with the
    word2vec model stored in ``model_300dim.pkl``.

    The columns ``mol``, ``sentence`` and ``mol2vec`` are added to ``train``
    and ``test`` in place.

    Returns
    -------
    x_train, x_test : numpy.ndarray
        One embedding vector per row of the respective input frame.
    """
    frames = (train, test)

    # SMILES -> RDKit Mol objects
    for frame in frames:
        frame['mol'] = frame['std_compounds'].apply(Chem.MolFromSmiles)

    w2v_model = word2vec.Word2Vec.load('model_300dim.pkl')

    # one "sentence" of substructure identifiers per molecule
    for frame in frames:
        frame['sentence'] = frame.apply(
            lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)

    # sentence -> embedding; unseen='UNK' tells sentences2vec how to treat
    # substructures that are missing from the model vocabulary
    for frame in frames:
        vectors = sentences2vec(frame['sentence'], w2v_model, unseen='UNK')
        frame['mol2vec'] = [DfVec(vector) for vector in vectors]

    x_train = np.array([entry.vec for entry in train['mol2vec']])
    x_test = np.array([entry.vec for entry in test['mol2vec']])

    return x_train, x_test


def number_of_atoms(atom_list, df):
    """Helper function for oned_featurizer.

    For every SMILES pattern ``p`` in ``atom_list`` a column named
    ``num_of_<p>_atoms`` is added to ``df`` in place; it holds the number of
    substructure matches of ``p`` in each molecule of ``df['mol']``.

    Returns None.
    """

    for i in atom_list:
        # build the query molecule once per pattern rather than once per row
        pattern = Chem.MolFromSmiles(i)
        df['num_of_{}_atoms'.format(i)] = df['mol'].apply(
            lambda mol: len(mol.GetSubstructMatches(pattern)))
        
def _add_oned_descriptors(df):
    """Add the descriptor columns used by oned_featurizer to ``df`` in place."""

    # AddHs makes hydrogens explicit atoms, so the atom count below includes them
    df['mol'] = df['mol'].apply(Chem.AddHs)
    df['num_of_atoms'] = df['mol'].apply(lambda mol: mol.GetNumAtoms())
    df['num_of_heavy_atoms'] = df['mol'].apply(lambda mol: mol.GetNumHeavyAtoms())
    number_of_atoms(['C', 'O', 'N', 'Cl'], df)

    # molecular descriptors
    df['mol_w'] = df['mol'].apply(Descriptors.ExactMolWt)
    df['num_valence_electrons'] = df['mol'].apply(Descriptors.NumValenceElectrons)
    df['num_heteroatoms'] = df['mol'].apply(Descriptors.NumHeteroatoms)


def oned_featurizer(train, test):
    """Creates datasets ready to input into ML models.

    Uses RDKit to compute simple 1D descriptors from the SMILES strings in
    the ``std_compounds`` column of each frame: number of atoms, number of
    heavy atoms, number of C, O, N and Cl atoms, molecular weight, number of
    valence electrons and number of heteroatoms.

    The ``mol`` column and all descriptor columns are added to ``train`` and
    ``test`` in place.

    Returns
    -------
    x_train, x_test : pandas.DataFrame
        The nine descriptor columns of ``train`` and ``test`` respectively.
    """
    feature_columns = ['num_of_atoms', 'num_of_heavy_atoms', 'num_of_C_atoms',
                       'num_of_O_atoms', 'num_of_N_atoms', 'num_of_Cl_atoms',
                       'mol_w', 'num_valence_electrons', 'num_heteroatoms']

    # convert SMILES to RDKit Mol object
    train['mol'] = train['std_compounds'].apply(Chem.MolFromSmiles)
    test['mol'] = test['std_compounds'].apply(Chem.MolFromSmiles)

    # Both frames go through the same helper so that every test column is
    # derived from the test molecules (the heavy-atom count used to be taken
    # from the training frame by mistake).
    _add_oned_descriptors(train)
    _add_oned_descriptors(test)

    x_train = train[feature_columns]
    x_test = test[feature_columns]

    return x_train, x_test

Overwriting kaggle_chem.py
