# InferSent embeddings

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
import torch
from models import InferSent #not library but a file
import nltk
# Make sure the NLTK "punkt" tokenizer data is available (presumably needed by
# InferSent when encoding with tokenize=True - TODO confirm in models.py).
# Only download when the resource is missing, so that the notebook also runs
# offline once the data has been installed.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Data set to process; input and output names are derived from the same value
# so that they can never get out of sync.
DATA_SPLIT = 'test'  # set to 'train' to process the training data
input_file = '%s_data-CLEANED-3.csv' % DATA_SPLIT
output_file = '%s_data-CLEANED-4.csv' % DATA_SPLIT

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


#### Loading the InferSent model (version 1, GloVe word vectors)

In [2]:
# Build the InferSent encoder (version 1) and restore its pretrained weights.
model_version = 1
MODEL_PATH = "encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
# map_location='cpu': the encoder is used on the CPU in this notebook, so the
# checkpoint must be loadable even if it was saved from a GPU and no GPU is
# available here.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))

<All keys matched successfully>

In [3]:
# Pick the word-vector file that belongs to the chosen encoder version:
# version 1 -> GloVe vectors, any other version (i.e. 2) -> fastText vectors.
if model_version == 1:
    W2V_PATH = 'GloVe/glove.840B.300d.txt'
else:
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

In [4]:
# Load the embeddings of the K most frequent words only
VOCAB_K = 100000
model.build_vocab_k_words(K=VOCAB_K)

Vocab size : 100000


In [5]:
# Sanity check: encode a single sample sentence and inspect the result.
sentence = ["Palacky University Olomouc is a university in Olomouc, Czech Republic."]
# Insert a space in front of every full stop
sentence = list(map(lambda text: text.replace(".", " ."), sentence))
print(sentence)
# Earlier attempt, kept for reference:
#embeddings = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
embeddings = model.encode(sentences=sentence, tokenize=True)
# Observation: the problem seems to lie in creating a numpy array from the sentences,
# which fails when the sentences have different lengths; encoding them one by one works.
print(embeddings.shape)
print(embeddings)

['Palacky University Olomouc is a university in Olomouc, Czech Republic .']
(1, 4096)
[[ 0.07064653  0.07747242  0.02942695 ... -0.00530698  0.03489926
   0.09624151]]


In [6]:

# Second example: the sentence already has a space before the final full stop,
# so it is encoded with tokenize=False.
oxford_sentence = "Oxford University is a collegiate research university in Oxford, England ."
embeddings = model.encode([oxford_sentence], bsize=128, tokenize=False, verbose=True)
n_encoded = len(embeddings)
print('nb sentences encoded : {0}'.format(n_encoded))

Nb words kept : 12/13 (92.3%)
Speed : 6.4 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 1


In [7]:
# Look at the embedding produced by the previous cell: its shape, the array
# itself and the first (only) vector converted to a plain Python list.
print(embeddings.shape)
print(embeddings)
first_vector = embeddings[0]
first_vector.tolist()

(1, 4096)
[[0.0757819  0.1313888  0.01312788 ... 0.0325584  0.07692444 0.10082895]]


[0.07578189671039581,
 0.1313887983560562,
 0.013127877376973629,
 0.024263231083750725,
 0.04238787665963173,
 0.046969134360551834,
 0.02949630469083786,
 0.08654940128326416,
 -0.03616871312260628,
 -0.0033157530706375837,
 -0.0033893194049596786,
 -0.07716049998998642,
 0.1162479892373085,
 0.052520766854286194,
 0.04183901846408844,
 -0.006888399366289377,
 0.1073925644159317,
 0.0567111000418663,
 0.01691325195133686,
 0.10057227313518524,
 -0.004031191114336252,
 0.12095189839601517,
 0.02196621336042881,
 0.025068679824471474,
 -0.03555263206362724,
 -0.011780399829149246,
 0.04248301312327385,
 -0.02821616642177105,
 0.043136194348335266,
 -0.014959435909986496,
 0.052493445575237274,
 -0.015490659512579441,
 -0.008643193170428276,
 0.025819770991802216,
 0.017615586519241333,
 0.08049411326646805,
 -0.00405747490003705,
 0.11437768489122391,
 0.0034772043582051992,
 -0.003626978723332286,
 0.03836957365274429,
 0.04769066348671913,
 -0.010227102786302567,
 -0.0123925944790244

### Loading data (parts to create embeddings for)

In [8]:
# Read the ';'-separated input file and discard the 'Unnamed: 0' column
# (done in one chained expression instead of an in-place drop).
t_data = pd.read_csv(input_file, sep=';').drop(columns=['Unnamed: 0'])
t_data

Unnamed: 0,part,labels,affil_index,embeddings_csd-BERT,embeddings_uncsd-BERT,embeddings_uncsd-BERT-1024,embeddings_Doc2Vec-enwiki,embeddings_Doc2Vec-apnews
0,Aventis Pharma,Institution,0,"[-0.6225269436836243, 0.41175007820129395, 0.9...","[-0.7495080828666687, 0.10450626909732819, 0.8...","[-0.9385257363319397, -0.8636099100112915, 0.9...","[0.001125917537137866, -0.00017105128790717572...","[0.001125917537137866, -0.00017105128790717572..."
1,Romainville,City,0,"[-0.7443768978118896, 0.37689119577407837, 0.9...","[-0.9179261922836304, -0.47461554408073425, -0...","[-0.9338648915290833, -0.8978722095489502, 0.9...","[-0.0005519804544746876, 0.0012614359147846699...","[-0.0005519804544746876, 0.0012614359147846699..."
2,France. christophe.dini@aventis.com,Country,0,"[-0.7713785767555237, 0.44986480474472046, 0.9...","[-0.8849691152572632, -0.5707671642303467, -0....","[-0.925447940826416, -0.9710708260536194, 0.99...","[-0.0009261688101105392, 0.0001335361012024805...","[-0.0009261688101105392, 0.0001335361012024805..."
3,Jiangsu Key Laboratory of Drug Design and Opti...,Institution,1,"[-0.615035891532898, 0.31155872344970703, 0.99...","[-0.9479542970657349, -0.6653308272361755, -0....","[-0.9993273019790649, -0.998293399810791, 0.99...","[-0.0010953565360978246, 4.938078564009629e-05...","[-0.0011839469661936164, -0.001301796175539493..."
4,China Pharmaceutical University,Institution,1,"[-0.738187313079834, 0.46809735894203186, 0.99...","[-0.9523708820343018, -0.5192119479179382, -0....","[-0.9705463647842407, -0.9841862916946411, 0.9...","[0.001557248760946095, -0.0008196939597837627,...","[0.001557248760946095, -0.0008196939597837627,..."
...,...,...,...,...,...,...,...,...
311,People's Republic of China.,Country,58,"[-0.7836729288101196, 0.5700669884681702, 0.99...","[-0.9732127785682678, -0.5987682938575745, -0....","[-0.8492422699928284, -0.9690293669700623, 0.9...","[-0.0007051564170978963, 0.0002351854054722935...","[-0.0007051564170978963, 0.0002351854054722935..."
312,National Center for Natural Products Research,Institution,59,"[-0.6381785273551941, 0.46710556745529175, 0.9...","[-0.9274004697799683, -0.2877633571624756, -0....","[-0.9930930137634277, -0.9795610904693604, 0.9...","[-0.0015818303218111396, 0.0003046033380087465...","[-0.0015818303218111396, 0.0003046033380087465..."
313,The University of Mississippi,Institution,59,"[-0.676986813545227, 0.4199213683605194, 0.999...","[-0.9446776509284973, -0.3148469924926758, -0....","[-0.10345132648944855, -0.3611591160297394, 0....","[-0.0013099084608256817, -0.001418914529494941...","[-0.0013099084608256817, -0.001418914529494941..."
314,MS 38677,State,59,"[-0.7675344944000244, 0.5155598521232605, 0.99...","[-0.919567883014679, -0.28708547353744507, -0....","[-0.7636699676513672, -0.933850109577179, 0.99...","[-0.0005524390726350248, -0.000201352435396984...","[-0.0005524390726350248, -0.000201352435396984..."


In [9]:
# Texts to create embeddings for. The column is selected by name rather than
# by position so that a changed column order (or a skipped drop of the index
# column) cannot silently select the wrong data.
parts = t_data['part'].tolist()
parts

['Aventis Pharma',
 'Romainville',
 'France. christophe.dini@aventis.com',
 'Jiangsu Key Laboratory of Drug Design and Optimization',
 'China Pharmaceutical University',
 'Nanjing 210009',
 'China',
 'Key Laboratory on Protein Chemistry and Structural Biology',
 'China Pharmaceutical University',
 'Nanjing 210009',
 'China',
 'China Pharmaceutical University',
 'Nanjing 210009',
 'China.',
 'Universidade Federal do Espírito Santo',
 'Rodovia BR 101 Norte',
 'Km 60',
 '29932-900',
 'Brazil.',
 'Jamia Hamdard',
 'New Delhi 110062',
 'India.',
 'Pondicherry University',
 'Puducherry 605 014',
 'India. Electronic address: tchinnasamy@hotmail.com.',
 'Payame Noor University',
 'Khoy 58168-45164',
 'Iran. saghatforoush@gmail.com',
 'Jadavpur University',
 'Kolkata 700 032',
 'India.',
 'Jagiellonian University Medical College',
 'Kraków',
 'Poland. Electronic address: mfmalaws@cyf-kr.edu.pl.',
 'Istituto Superiore di Sanità',
 'Viale Regina Elena 299',
 '00161 Roma',
 'Italy. carlo.mustazza@

In [10]:
# Encode every part with the currently loaded InferSent model and store the
# vectors (as plain lists) in a new column.
# The parts are encoded one at a time on purpose (batch encoding of sentences
# with different lengths was observed to fail in this notebook).
embeddings = []
for part in parts:
    try:
        # A space is put in front of every full stop before encoding.
        vector = model.encode([part.replace(".", " .")], bsize=128, tokenize=False, verbose=True)[0]
    except Exception as err:
        # Fail loudly and name the offending part. Previously a bare `except`
        # printed "error" and left the loop, after which the column assignment
        # below failed with an unrelated length-mismatch error.
        raise RuntimeError("InferSent encoding failed for part %r" % (part,)) from err
    embeddings.append(vector.tolist())
t_data["embeddings_IS-GloVe"] = embeddings

Nb words kept : 3/4 (75.0%)
Speed : 21.1 sentences/s (cpu mode, bsize=128)
Nb words kept : 2/3 (66.7%)
Speed : 34.4 sentences/s (cpu mode, bsize=128)
Nb words kept : 4/7 (57.1%)
Speed : 25.1 sentences/s (cpu mode, bsize=128)
Nb words kept : 10/10 (100.0%)
Speed : 8.6 sentences/s (cpu mode, bsize=128)
Nb words kept : 5/5 (100.0%)
Speed : 18.9 sentences/s (cpu mode, bsize=128)
Nb words kept : 3/4 (75.0%)
Speed : 29.5 sentences/s (cpu mode, bsize=128)
Nb words kept : 3/3 (100.0%)
Speed : 30.4 sentences/s (cpu mode, bsize=128)
Nb words kept : 10/10 (100.0%)
Speed : 9.8 sentences/s (cpu mode, bsize=128)
Nb words kept : 5/5 (100.0%)
Speed : 17.5 sentences/s (cpu mode, bsize=128)
Nb words kept : 3/4 (75.0%)
Speed : 25.6 sentences/s (cpu mode, bsize=128)
Nb words kept : 3/3 (100.0%)
Speed : 17.5 sentences/s (cpu mode, bsize=128)
Nb words kept : 5/5 (100.0%)
Speed : 19.3 sentences/s (cpu mode, bsize=128)
Nb words kept : 3/4 (75.0%)
Speed : 29.5 sentences/s (cpu mode, bsize=128)
Nb words kept : 

In [11]:
# Display the frame to check that the "embeddings_IS-GloVe" column has been added
t_data

Unnamed: 0,part,labels,affil_index,embeddings_csd-BERT,embeddings_uncsd-BERT,embeddings_uncsd-BERT-1024,embeddings_Doc2Vec-enwiki,embeddings_Doc2Vec-apnews,embeddings_IS-GloVe
0,Aventis Pharma,Institution,0,"[-0.6225269436836243, 0.41175007820129395, 0.9...","[-0.7495080828666687, 0.10450626909732819, 0.8...","[-0.9385257363319397, -0.8636099100112915, 0.9...","[0.001125917537137866, -0.00017105128790717572...","[0.001125917537137866, -0.00017105128790717572...","[0.016122661530971527, -0.03915251046419144, -..."
1,Romainville,City,0,"[-0.7443768978118896, 0.37689119577407837, 0.9...","[-0.9179261922836304, -0.47461554408073425, -0...","[-0.9338648915290833, -0.8978722095489502, 0.9...","[-0.0005519804544746876, 0.0012614359147846699...","[-0.0005519804544746876, 0.0012614359147846699...","[-0.01049832720309496, -0.08939896523952484, -..."
2,France. christophe.dini@aventis.com,Country,0,"[-0.7713785767555237, 0.44986480474472046, 0.9...","[-0.8849691152572632, -0.5707671642303467, -0....","[-0.925447940826416, -0.9710708260536194, 0.99...","[-0.0009261688101105392, 0.0001335361012024805...","[-0.0009261688101105392, 0.0001335361012024805...","[0.11357733607292175, -0.03367322310805321, 0...."
3,Jiangsu Key Laboratory of Drug Design and Opti...,Institution,1,"[-0.615035891532898, 0.31155872344970703, 0.99...","[-0.9479542970657349, -0.6653308272361755, -0....","[-0.9993273019790649, -0.998293399810791, 0.99...","[-0.0010953565360978246, 4.938078564009629e-05...","[-0.0011839469661936164, -0.001301796175539493...","[0.12751629948616028, 0.1880914717912674, 0.01..."
4,China Pharmaceutical University,Institution,1,"[-0.738187313079834, 0.46809735894203186, 0.99...","[-0.9523708820343018, -0.5192119479179382, -0....","[-0.9705463647842407, -0.9841862916946411, 0.9...","[0.001557248760946095, -0.0008196939597837627,...","[0.001557248760946095, -0.0008196939597837627,...","[0.13147854804992676, 0.17797352373600006, -0...."
...,...,...,...,...,...,...,...,...,...
311,People's Republic of China.,Country,58,"[-0.7836729288101196, 0.5700669884681702, 0.99...","[-0.9732127785682678, -0.5987682938575745, -0....","[-0.8492422699928284, -0.9690293669700623, 0.9...","[-0.0007051564170978963, 0.0002351854054722935...","[-0.0007051564170978963, 0.0002351854054722935...","[0.14862169325351715, 0.12809833884239197, -0...."
312,National Center for Natural Products Research,Institution,59,"[-0.6381785273551941, 0.46710556745529175, 0.9...","[-0.9274004697799683, -0.2877633571624756, -0....","[-0.9930930137634277, -0.9795610904693604, 0.9...","[-0.0015818303218111396, 0.0003046033380087465...","[-0.0015818303218111396, 0.0003046033380087465...","[0.051815617829561234, 0.20423460006713867, -0..."
313,The University of Mississippi,Institution,59,"[-0.676986813545227, 0.4199213683605194, 0.999...","[-0.9446776509284973, -0.3148469924926758, -0....","[-0.10345132648944855, -0.3611591160297394, 0....","[-0.0013099084608256817, -0.001418914529494941...","[-0.0013099084608256817, -0.001418914529494941...","[0.018671929836273193, 0.12176372110843658, -0..."
314,MS 38677,State,59,"[-0.7675344944000244, 0.5155598521232605, 0.99...","[-0.919567883014679, -0.28708547353744507, -0....","[-0.7636699676513672, -0.933850109577179, 0.99...","[-0.0005524390726350248, -0.000201352435396984...","[-0.0005524390726350248, -0.000201352435396984...","[0.0035000434145331383, -0.08939895033836365, ..."


In [12]:
# Build the InferSent encoder (version 2) and restore its pretrained weights.
# This replaces the version 1 model held in `model`.
model_version = 2
MODEL_PATH = "encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
# map_location='cpu': the encoder is used on the CPU in this notebook, so the
# checkpoint must be loadable even if it was saved from a GPU and no GPU is
# available here.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))

<All keys matched successfully>

In [13]:
# Pick the word-vector file that belongs to the chosen encoder version:
# version 1 -> GloVe vectors, any other version (i.e. 2) -> fastText vectors.
if model_version == 1:
    W2V_PATH = 'GloVe/glove.840B.300d.txt'
else:
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

In [14]:
# Load the embeddings of the K most frequent words only
VOCAB_K = 100000
model.build_vocab_k_words(K=VOCAB_K)

Vocab size : 100000


In [15]:
# Encode every part with the currently loaded InferSent model and store the
# vectors (as plain lists) in a new column.
# The parts are encoded one at a time on purpose (batch encoding of sentences
# with different lengths was observed to fail in this notebook).
embeddings = []
for part in parts:
    try:
        # A space is put in front of every full stop before encoding.
        vector = model.encode([part.replace(".", " .")], bsize=128, tokenize=False, verbose=True)[0]
    except Exception as err:
        # Fail loudly and name the offending part. Previously a bare `except`
        # printed "error" and left the loop, after which the column assignment
        # below failed with an unrelated length-mismatch error.
        raise RuntimeError("InferSent encoding failed for part %r" % (part,)) from err
    embeddings.append(vector.tolist())
t_data["embeddings_IS-FastText"] = embeddings

Nb words kept : 3/4 (75.0%)
Speed : 19.3 sentences/s (cpu mode, bsize=128)
Nb words kept : 2/3 (66.7%)
Speed : 35.6 sentences/s (cpu mode, bsize=128)
Nb words kept : 5/7 (71.4%)
Speed : 14.8 sentences/s (cpu mode, bsize=128)
Nb words kept : 10/10 (100.0%)
Speed : 8.5 sentences/s (cpu mode, bsize=128)
Nb words kept : 5/5 (100.0%)
Speed : 17.3 sentences/s (cpu mode, bsize=128)
Nb words kept : 3/4 (75.0%)
Speed : 26.6 sentences/s (cpu mode, bsize=128)
Nb words kept : 3/3 (100.0%)
Speed : 30.4 sentences/s (cpu mode, bsize=128)
Nb words kept : 10/10 (100.0%)
Speed : 8.6 sentences/s (cpu mode, bsize=128)
Nb words kept : 5/5 (100.0%)
Speed : 15.1 sentences/s (cpu mode, bsize=128)
Nb words kept : 3/4 (75.0%)
Speed : 25.9 sentences/s (cpu mode, bsize=128)
Nb words kept : 3/3 (100.0%)
Speed : 26.4 sentences/s (cpu mode, bsize=128)
Nb words kept : 5/5 (100.0%)
Speed : 17.0 sentences/s (cpu mode, bsize=128)
Nb words kept : 3/4 (75.0%)
Speed : 24.4 sentences/s (cpu mode, bsize=128)
Nb words kept : 

In [16]:
# Display the frame to check that the "embeddings_IS-FastText" column has been added
t_data

Unnamed: 0,part,labels,affil_index,embeddings_csd-BERT,embeddings_uncsd-BERT,embeddings_uncsd-BERT-1024,embeddings_Doc2Vec-enwiki,embeddings_Doc2Vec-apnews,embeddings_IS-GloVe,embeddings_IS-FastText
0,Aventis Pharma,Institution,0,"[-0.6225269436836243, 0.41175007820129395, 0.9...","[-0.7495080828666687, 0.10450626909732819, 0.8...","[-0.9385257363319397, -0.8636099100112915, 0.9...","[0.001125917537137866, -0.00017105128790717572...","[0.001125917537137866, -0.00017105128790717572...","[0.016122661530971527, -0.03915251046419144, -...","[0.007468888536095619, -0.11853469163179398, -..."
1,Romainville,City,0,"[-0.7443768978118896, 0.37689119577407837, 0.9...","[-0.9179261922836304, -0.47461554408073425, -0...","[-0.9338648915290833, -0.8978722095489502, 0.9...","[-0.0005519804544746876, 0.0012614359147846699...","[-0.0005519804544746876, 0.0012614359147846699...","[-0.01049832720309496, -0.08939896523952484, -...","[0.007468889933079481, -0.14438317716121674, -..."
2,France. christophe.dini@aventis.com,Country,0,"[-0.7713785767555237, 0.44986480474472046, 0.9...","[-0.8849691152572632, -0.5707671642303467, -0....","[-0.925447940826416, -0.9710708260536194, 0.99...","[-0.0009261688101105392, 0.0001335361012024805...","[-0.0009261688101105392, 0.0001335361012024805...","[0.11357733607292175, -0.03367322310805321, 0....","[0.007468888536095619, -0.07494887709617615, 0..."
3,Jiangsu Key Laboratory of Drug Design and Opti...,Institution,1,"[-0.615035891532898, 0.31155872344970703, 0.99...","[-0.9479542970657349, -0.6653308272361755, -0....","[-0.9993273019790649, -0.998293399810791, 0.99...","[-0.0010953565360978246, 4.938078564009629e-05...","[-0.0011839469661936164, -0.001301796175539493...","[0.12751629948616028, 0.1880914717912674, 0.01...","[0.007468888536095619, -0.10749538987874985, 0..."
4,China Pharmaceutical University,Institution,1,"[-0.738187313079834, 0.46809735894203186, 0.99...","[-0.9523708820343018, -0.5192119479179382, -0....","[-0.9705463647842407, -0.9841862916946411, 0.9...","[0.001557248760946095, -0.0008196939597837627,...","[0.001557248760946095, -0.0008196939597837627,...","[0.13147854804992676, 0.17797352373600006, -0....","[0.007468888536095619, -0.11864248663187027, 0..."
...,...,...,...,...,...,...,...,...,...,...
311,People's Republic of China.,Country,58,"[-0.7836729288101196, 0.5700669884681702, 0.99...","[-0.9732127785682678, -0.5987682938575745, -0....","[-0.8492422699928284, -0.9690293669700623, 0.9...","[-0.0007051564170978963, 0.0002351854054722935...","[-0.0007051564170978963, 0.0002351854054722935...","[0.14862169325351715, 0.12809833884239197, -0....","[0.007468888536095619, -0.09598811715841293, 0..."
312,National Center for Natural Products Research,Institution,59,"[-0.6381785273551941, 0.46710556745529175, 0.9...","[-0.9274004697799683, -0.2877633571624756, -0....","[-0.9930930137634277, -0.9795610904693604, 0.9...","[-0.0015818303218111396, 0.0003046033380087465...","[-0.0015818303218111396, 0.0003046033380087465...","[0.051815617829561234, 0.20423460006713867, -0...","[0.007468888536095619, -0.07379323244094849, 0..."
313,The University of Mississippi,Institution,59,"[-0.676986813545227, 0.4199213683605194, 0.999...","[-0.9446776509284973, -0.3148469924926758, -0....","[-0.10345132648944855, -0.3611591160297394, 0....","[-0.0013099084608256817, -0.001418914529494941...","[-0.0013099084608256817, -0.001418914529494941...","[0.018671929836273193, 0.12176372110843658, -0...","[0.007468888536095619, -0.06945133209228516, 0..."
314,MS 38677,State,59,"[-0.7675344944000244, 0.5155598521232605, 0.99...","[-0.919567883014679, -0.28708547353744507, -0....","[-0.7636699676513672, -0.933850109577179, 0.99...","[-0.0005524390726350248, -0.000201352435396984...","[-0.0005524390726350248, -0.000201352435396984...","[0.0035000434145331383, -0.08939895033836365, ...","[0.007468888536095619, -0.1302298605442047, 0...."


In [17]:
# Write the frame, including the embedding columns added above, to a ';'-separated CSV.
# The row index is written too (pandas default).
# NOTE(review): presumably this index is what shows up as the 'Unnamed: 0' column
# that is dropped when such a file is read back - confirm.
t_data.to_csv(output_file, sep = ';')

### References

In [18]:
#Main reference (InferSent paper, Conneau et al., 2017): https://doi.org/10.48550/arXiv.1705.02364
#https://github.com/facebookresearch/InferSent
#FastText vectors: T. Mikolov, E. Grave, P. Bojanowski, C. Puhrsch, A. Joulin. Advances in Pre-Training Distributed Word Representations.
#GloVe: Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation.
