## Attempt at building a word2vec/cbow model to find related pesticides

In [None]:
# One-time setup: download the NLTK stopword corpus.
# Uncomment and run once, or again whenever the corpus needs to be refreshed.
# NOTE(review): no cell visible here uses the stopwords directly; presumably `utils` does -- confirm.
#import nltk
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /homes/michiel/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
import gensim
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd 
import utils # all datareading and preprocessing functionality

In [13]:
# Input corpus: tab-separated file with the pesticide abstracts
file1 = './data/abstract_set1.txt'
#file2 = './data/abstract_set2.txt' # cancer-ish no need for this one here

# Column selectors (not referenced in this cell)
label_selection = 'label'
data_selection = 'abstract_clean'

# Read the table and discard every row that has a missing value in any column
data = pd.read_csv(file1, sep="\t").dropna()

# Preview the first rows
data.head()

Unnamed: 0,pmid,title,abstract
0,29990300,Oxidative stress in triazine pesticide toxicit...,This review article provides a summary of the ...
1,29990732,Toxicity induced by glyphosate and glyphosate-...,Glyphosate is the active component of many com...
2,29999060,Direct kinetics study of CH2OO + methyl vinyl ...,Methyl vinyl ketone (MVK) and methacrolein (MA...
3,30003825,Dihydromyricetin improves vascular hyporespons...,CONTEXT: Dihydromyricetin (DMY) has oxidation ...
4,30015122,Glyphosate and atrazine in rainfall and soils ...,The presence in the atmosphere of glyphosate (...


In [14]:
# Turn every abstract into a list of word tokens using gensim's simple_preprocess
abstract_tokens = data["abstract"].apply(gensim.utils.simple_preprocess)

Create a word embedding model using the CBOW variant of Word2Vec:

In [15]:
# Hyperparameters: context window of 5, ignore words seen fewer than 2 times,
# 4 worker threads, sg=0 selects CBOW
w2v_params = dict(sg=0, window=5, min_count=2, workers=4)

# Create the (still untrained) model
model = gensim.models.Word2Vec(**w2v_params)

# Pass 1: scan the tokenized abstracts to build the vocabulary
model.build_vocab(abstract_tokens, progress_per=1000)

# Pass 2: train on the same corpus, using the example count and
# epoch count recorded on the model itself
model.train(
    abstract_tokens,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Write the trained model to disk
model.save("./pesticides_abstracts_word2vec.model")

The hyperparameters used in gensim.models.Word2Vec are as follows:  

- **`size`**: The number of dimensions of the embeddings (the default is 100). Note: this parameter was renamed to `vector_size` in gensim 4.0 and later.  
- **`window`**: The maximum distance between a target word and words around the target word (the default is 5).  
- **`min_count`**: The minimum number of times a word must occur in the corpus to be included in training; rarer words are ignored (the default is 5).  
- **`workers`**: The number of worker threads used during training (the default is 3).  
- **`sg`**: The training algorithm, either 0 for CBOW or 1 for skip gram (the default is 0).  

Take a look at the vector of a particular word from the corpus. Note it will have length 100.

In [18]:
# Look up the embedding vector stored for the token "dieldrin"
model.wv["dieldrin"]

array([-0.15374185,  0.69091773,  0.01035906,  0.19827147, -0.10486519,
       -0.20027548,  1.6745435 ,  0.3232113 , -0.2786336 ,  0.60990953,
        0.3436268 ,  0.00609223,  0.24498984,  0.07191842,  0.8171237 ,
       -0.8163991 ,  0.5055392 , -0.36991423, -0.41761562, -0.8833689 ,
       -0.46111727,  0.06610417,  0.31896648, -0.40499607,  0.03776569,
       -0.02193522, -0.59146523,  0.1883266 , -1.9399718 ,  1.1174822 ,
        0.8269693 , -0.28225216,  0.7519978 , -0.27543712, -0.7055754 ,
        0.42404673, -0.06364256,  0.290182  ,  0.27470818, -1.5237079 ,
        0.11347376, -0.8448222 , -0.06127544, -0.05488081,  0.9511501 ,
        0.63782096,  0.36445934,  0.11438367, -0.38866833,  0.33374286,
       -0.54320526,  0.0319364 , -0.21989514, -0.04308268, -0.6551177 ,
        0.24538743,  0.03790284, -0.58751667, -1.0411121 , -0.77488065,
       -0.02837608, -0.2825058 , -0.31125218,  0.10905086, -0.2194577 ,
        0.53885686,  0.38439408,  0.5557378 , -0.28017503,  0.81

In [19]:
# Look up the embedding vector stored for the token "glyphosate"
model.wv["glyphosate"]

array([ 0.11003078,  0.03516333,  0.96000797,  0.01613707, -0.4626589 ,
       -2.12389   ,  1.3617249 ,  1.2632192 , -0.15923855,  0.8129955 ,
        1.1931697 , -0.81588453, -0.78949344, -0.68160886,  1.1676435 ,
       -0.5642724 ,  0.67206424, -0.39393362, -0.62242013, -2.352616  ,
       -0.97730845, -0.51802963,  0.5102751 ,  0.43697238,  0.27315298,
        0.14897981, -0.8055614 ,  0.5237034 , -1.0138222 ,  0.3279743 ,
        1.2143714 ,  0.3772431 ,  2.7095087 , -0.44817778, -0.35317734,
        0.33508065,  0.3017933 , -0.4416767 ,  0.37668222, -1.1039255 ,
       -1.0386629 , -0.3238383 , -0.93726414, -0.2584309 , -0.2269716 ,
        0.4405077 ,  0.04887197,  0.18141317, -0.6993494 ,  0.9136117 ,
       -1.1842176 ,  0.75051165,  0.05582989,  1.1534021 ,  1.3792963 ,
       -0.5663063 ,  0.47661996, -0.18372248, -0.16165948,  0.24915798,
       -0.02361266, -0.37095416,  0.24674506, -0.9766766 , -0.64819986,
       -0.4897831 ,  1.4866235 ,  0.80109316, -0.291049  , -0.05

Find the top N words that are semantically the closest to the word dieldrin based on the cosine similarity between the vectors of the words in our corpus.

In [22]:
# The 20 vocabulary words most similar to "dieldrin", returned as (word, similarity) pairs
model.wv.most_similar("dieldrin", topn=20)

[('hch', 0.9769817590713501),
 ('dde', 0.9768609404563904),
 ('aldrin', 0.9686754941940308),
 ('chlordane', 0.9483442306518555),
 ('ddd', 0.9469141364097595),
 ('hcb', 0.9453114867210388),
 ('heptachlor', 0.940208375453949),
 ('lindane', 0.930935263633728),
 ('ddt', 0.926992654800415),
 ('bhc', 0.9215170741081238),
 ('endosulfan', 0.9207156300544739),
 ('endrin', 0.9201696515083313),
 ('methoxychlor', 0.917574405670166),
 ('pcb', 0.9032790064811707),
 ('diazinon', 0.9026460647583008),
 ('dichlorvos', 0.9017914533615112),
 ('desmethyl', 0.8985465168952942),
 ('dimethoate', 0.8984988927841187),
 ('pirimicarb', 0.8945596814155579),
 ('thiacloprid', 0.8920301795005798)]

In [23]:
# The 20 vocabulary words most similar to "glyphosate", returned as (word, similarity) pairs
model.wv.most_similar("glyphosate", topn=20)

[('herbicide', 0.907519519329071),
 ('atrazine', 0.8002203106880188),
 ('glufosinate', 0.7993463277816772),
 ('formulation', 0.7891854643821716),
 ('chlorpyrifos', 0.7737353444099426),
 ('imidacloprid', 0.7612190246582031),
 ('fungicide', 0.7609195113182068),
 ('herbicides', 0.7604256272315979),
 ('roundup', 0.7577640414237976),
 ('op', 0.7574062943458557),
 ('neonicotinoid', 0.7539829611778259),
 ('organophosphate', 0.7515392303466797),
 ('carbaryl', 0.7496296167373657),
 ('mixture', 0.7478381991386414),
 ('metolachlor', 0.7460909485816956),
 ('malathion', 0.7430873513221741),
 ('neonicotinoids', 0.7421098351478577),
 ('mixtures', 0.7397744059562683),
 ('ampa', 0.7382535338401794),
 ('carbendazim', 0.7320340275764465)]

In [25]:
# Similarity score between the embeddings of the two given words
model.wv.similarity("dieldrin", "persistent")


0.64391196