In [1]:
import re, string 
import pandas as pd 
from time import time  
from collections import defaultdict
import spacy
from sklearn.manifold import TSNE
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
%matplotlib inline
import torch
from transformers import BertModel, BertConfig, BertTokenizer, PreTrainedTokenizer

In [2]:
def clean_text(text):
    '''Normalize one raw text entry before lemmatization/tokenization.

    Steps, in order: lowercase and strip surrounding whitespace, remove text
    in square brackets, remove ASCII punctuation, remove words containing
    digits, remove a few leftover symbols, and finally drop English stopwords.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None or NaN from a missing row)
        is treated as missing instead of raising AttributeError.

    Returns
    -------
    str or None
        The remaining words joined by single spaces, or None when the input
        is not a string or the cleaned text is 2 characters or shorter.
    '''
    # Missing values would crash on .lower(); report them as "no text" instead.
    if not isinstance(text, str):
        return None
    text = text.lower().strip()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Remove leftover symbols so the model does not learn them. The pattern is
    # the same as before, written as a raw string to avoid invalid-escape warnings.
    # NOTE(review): backslashes and '-' were already stripped by the punctuation
    # step above, so only the 'σ' and '→' alternatives can still match here.
    text = re.sub(r'^\\u[\d\D]{4}|-|σ|→|\\xad', '', text)
    # Discard entries that are too short to be useful. The check is on the
    # number of characters, not the number of words.
    if len(text) > 2:
        return ' '.join(word for word in text.split() if word not in STOPWORDS)
    return None


In [3]:
# Cached BertTokenizer instance. It is created on the first call to tokenizer()
# and reused afterwards, so the pretrained vocabulary is not reloaded per row.
_BERT_TOKENIZER = None


def tokenizer(text):
    """Split `text` into BERT word-piece tokens.

    Parameters
    ----------
    text : object
        Value to tokenize; it is converted with str() before tokenization.

    Returns
    -------
    list
        The tokens produced by the 'bert-base-uncased' BertTokenizer.

    Notes
    -----
    NOTE(review): `add_special_tokens=True` is forwarded to `tokenize()`
    unchanged from the original code; whether special tokens actually appear
    in the output depends on the installed transformers version — confirm.
    """
    global _BERT_TOKENIZER
    if _BERT_TOKENIZER is None:
        # Loading the pretrained tokenizer is expensive; do it only once.
        _BERT_TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
    tokens = _BERT_TOKENIZER.tokenize(str(text), add_special_tokens=True)

    return tokens

In [4]:
# Load the spaCy 'en_core_web_sm' pipeline once at module level; lemmatizer() below reuses it.
nlp = spacy.load('en_core_web_sm')

def lemmatizer(text):
    """Replace every token of `text` with its spaCy lemma.

    Parameters
    ----------
    text : str or None
        Text to lemmatize. It is processed with the module-level `nlp` pipeline.

    Returns
    -------
    str
        The lemmas joined by single spaces, or "" when `text` is None
        (a diagnostic message is printed in that case).
    """
    # Identity check: `== None` can be overridden by objects defining __eq__.
    if text is None:
        print("Daisy, daisy, sour cream!!!")
        return ""
    return " ".join(token.lemma_ for token in nlp(text))

In [5]:
# In recent pandas versions display.max_rows defaults to 60; longer frames are
# truncated when displayed (showing display.min_rows rows, 10 by default).
# Set display.max_rows to None to display all rows in the dataframe
#pd.set_option('display.max_rows', None)
#types = df_clean.apply(lambda x: type(x['text']), axis=1)

# Load the textbook JSON and name its single column 'text'.
df = pd.read_json('ChemLibre_JSONS/Basic_Principles_of_Organic_Chemistry_Roberts_and_Caserio.json')
df.columns = ['text']

In [6]:
#df_clean_no_none = df_clean[df_clean.text.notnull()]

#df = pd.read_json('ChemLibre_JSONS/Wade_Map.json')
#df.columns = ['text']


In [7]:
# Clean every entry of the 'text' column and keep the result as a one-column frame.
df_clean = df['text'].apply(clean_text).to_frame()
# Keep only the rows whose cleaned text is not None.
# Selecting through .loc performs the filter as a single indexing operation
# (no chained indexing), and .copy() makes master_of_none an explicit,
# independent copy, so later column assignments modify only the copy and
# never df_clean.
master_of_none = df_clean.loc[df_clean['text'].notna()].copy()


In [8]:
#master_of_none["text_tokenize"] =  master_of_none.apply(lambda x: tokenizer(x['text']), axis=1)

In [9]:
#master_of_none["text_lemmatized_tokens"] =  master_of_none.apply(lambda x: lemmatizer(x['text_tokenize']), axis=1)
# Lemmatize the cleaned text. Applying lemmatizer() directly to the 'text'
# column avoids building a row object for every record (as the row-wise
# DataFrame.apply(axis=1) did) and also yields a Series for an empty frame.
master_of_none["text_lemmatized"] = master_of_none["text"].apply(lemmatizer)

In [11]:
# Tokenize the lemmatized text; each cell of the new column holds the token
# list returned by tokenizer(). Column-wise Series.apply replaces the slower
# row-wise DataFrame.apply(axis=1) and also works on an empty frame.
master_of_none["text_lemmatized_tokens"] = master_of_none["text_lemmatized"].apply(tokenizer)

In [None]:
#df_clean['text_lemmatize_clean'] = df_clean['text_lemmatize'].str.replace('-PRON-', '')
#master_of_none['text_lemmatize_clean'] = master_of_none['text_lemmatized_tokens'].str.replace('-PRON-', '')

#sentences = [for row in master_of_none['text_lemmatized_tokens']]
# Count how often each token occurs across all tokenized rows.
# defaultdict(int) starts every unseen token at 0.
word_freq = defaultdict(int)
for row in master_of_none['text_lemmatized_tokens']:
    for i in row:
        word_freq[i] += 1
# Number of distinct tokens.
# NOTE(review): this bare expression is not the last statement of the cell,
# so its value is not displayed when the cell runs.
len(word_freq)

# Word2Vec hyperparameters:
# min_count: minimum number of occurrences of a word in the corpus to be included in the model.
# window: the maximum distance between the current and predicted word within a sentence.
# size: the dimensionality of the feature vectors
# workers: number of worker threads used for training (2 here; the author notes the machine has 4 cores).
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x renamed it to `vector_size` — confirm the installed version.

# No corpus is passed here; the vocabulary is built and the model trained in later cells.
w2v_model = Word2Vec(min_count=50,
                     window=5,
                     size=400,
                     workers=2)


In [None]:
#w2v_model = Word2Vec.load("trial_19.model")

In [None]:
# The 10 most frequent tokens, most frequent first.
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

In [None]:
# Build the model vocabulary from the tokenized corpus (one token list per row).
w2v_model.build_vocab(master_of_none['text_lemmatized_tokens'])
#w2v_model.build_vocab(sentences, update=True)

In [None]:
# Train the word vectors on the same corpus the vocabulary was built from.
# The previous call passed `sentences`, a name that is never defined in this
# notebook, so it raised NameError on a fresh kernel.
# NOTE(review): train() presumably returns a pair of word counts
# (effective words trained, raw words in the corpus) — confirm against gensim docs.
w2v_model.train(master_of_none['text_lemmatized_tokens'],
                total_examples=w2v_model.corpus_count,
                epochs=w2v_model.epochs)

In [None]:
#w2v_model = Word2Vec.load("No_tokenize/trial_1.model")

In [None]:
# Explore the model: list the words whose vectors are closest to 'chemistry'.
# NOTE(review): words seen fewer than min_count (50) times are not in the
# vocabulary; looking them up presumably raises KeyError — confirm.
w2v_model.wv.most_similar(positive=['chemistry'])

In [None]:
# Cosine similarity between the vectors learned for 'aromatic' and 'equilibrium'.
w2v_model.wv.similarity('aromatic', 'equilibrium')

In [None]:
# Cosine similarity between the vectors learned for 'alcohol' and 'hydroxyl'.
w2v_model.wv.similarity('alcohol', 'hydroxyl')

In [None]:
# Cosine similarity between the vectors learned for 'ketone' and 'carbonyl'.
w2v_model.wv.similarity('ketone', 'carbonyl')

In [None]:
# Cosine similarity between the vectors learned for 'alkene' and 'alkyne'.
w2v_model.wv.similarity('alkene', 'alkyne')

In [None]:
# Cosine similarity between the vectors learned for 'acid' and 'base'.
w2v_model.wv.similarity('acid', 'base')

In [None]:
# Cosine similarity between the vectors learned for 'oxidize' and 'reduce'.
w2v_model.wv.similarity('oxidize', 'reduce')

In [None]:
# Cosine similarity between the vectors learned for 'anion' and 'cation'.
w2v_model.wv.similarity('anion', 'cation')

In [None]:
# Cosine similarity between the vectors learned for 'mechanism' and 'atom'.
w2v_model.wv.similarity('mechanism', 'atom')

In [None]:
# Cosine similarity between the vectors learned for 'resonance' and 'solvent'.
w2v_model.wv.similarity('resonance', 'solvent')

In [None]:
# Cosine similarity between the vectors learned for 'synthesis' and 'electron'.
w2v_model.wv.similarity('synthesis', 'electron')

In [None]:
# Persist the trained model to 'test.model' (path is relative to the current working directory).
w2v_model.save("test.model")

In [None]:
# If the model will not be trained any further, init_sims(replace=True) can be
# called to make it much more memory-efficient. The call is currently disabled:
#w2v_model.init_sims(replace=True)