In [1]:
%matplotlib inline

from random import randint
import numpy as np
import torch
import shutil
import string
import nltk.data
import matplotlib

matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)

This notebook is adapted from the official InferSent demo (https://github.com/facebookresearch/InferSent/blob/master/demo.ipynb) and from the Kaggle InferSent demo (https://www.kaggle.com/jacksoncrow/infersent-demo?select=glove.840B.300d.txt).

In [2]:
# Here we need to restructure the working directory so that the script's imports work properly
# shutil.copytree("/kaggle/input/infersent/", "/kaggle/working/infersent")
# ! mv /kaggle/working/infersent/* /kaggle/working/

In [3]:
%%time

# TODO: add encoder to dataset as well
# If this cell freezes, you probably haven't enabled Internet access for the notebook
! mkdir encoder
! curl -Lo encoder/infersent1.pkl https://dl.fbaipublicfiles.com/infersent/infersent1.pkl

mkdir: encoder: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  146M  100  146M    0     0  12.1M      0  0:00:12  0:00:12 --:--:-- 11.8M
CPU times: user 206 ms, sys: 80.2 ms, total: 286 ms
Wall time: 12.3 s


In [11]:
#### https://nlp.stanford.edu/projects/glove/
url_to_glove_840B = "http://nlp.stanford.edu/data/glove.840B.300d.zip"

In [14]:
url_to_glove_6B = "http://nlp.stanford.edu/data/glove.6B.zip"

In [15]:
import urllib

In [16]:
from urllib.request import urlopen
from tempfile import NamedTemporaryFile
from shutil import unpack_archive

# Download the GloVe 6B archive and unpack it into `glove.6B/`, the directory that
# W2V_PATH ("glove.6B/glove.6B.300d.txt") reads from. Unpacking into a directory
# named 'glove.840B.300d.txt' mislabelled the 6B vectors and did not match W2V_PATH.
with urlopen(url_to_glove_6B) as zipresp, NamedTemporaryFile() as tfile:
    # Stream the response to disk in chunks instead of holding the whole archive in memory.
    shutil.copyfileobj(zipresp, tfile)
    # Flush buffered bytes so unpack_archive, which opens the file by name, sees all of them.
    tfile.flush()
    unpack_archive(tfile.name, 'glove.6B', format='zip')

KeyboardInterrupt: 

In [17]:
ls

README.md            extract_features.py  [34mglove.840B.300d.txt[m[m/
[34m__pycache__[m[m/         [34mglove.6B[m[m/            models.py
[34mencoder[m[m/             glove.6B.zip         playground_1.ipynb


In [32]:
# Glove website = https://nlp.stanford.edu/projects/glove/

In [21]:
# Encoder and word-vector settings used by the cells below.
model_version = 1
MODEL_PATH = "encoder/infersent%s.pkl" % model_version
#W2V_PATH = 'glove.840B.300d.txt'
W2V_PATH = "glove.6B/glove.6B.300d.txt"
# Kept as an int: the float literal 1e5 leaked into the output as "Vocab size : 100000.0".
VOCAB_SIZE = 100000  # Load embeddings of VOCAB_SIZE most frequent words
USE_CUDA = False  # Keep it on CPU if False, otherwise will put it on GPU

In [22]:
# Build the InferSent encoder from the settings in params_model, then load the
# state dict stored at MODEL_PATH into it.
from models import InferSent
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
# NOTE(review): torch.load unpickles the file, so only load checkpoints that come
# from a trusted source.
model.load_state_dict(torch.load(MODEL_PATH))


<All keys matched successfully>

In [23]:
%%time
# Move the encoder to the GPU only when USE_CUDA is set; otherwise keep it as is.
model = model.cuda() if USE_CUDA else model

# Tell the encoder which word-vector file to read.
model.set_w2v_path(W2V_PATH)

# Presumably loads vectors for the VOCAB_SIZE most frequent words (per the note
# next to VOCAB_SIZE) — confirm against models.py.
model.build_vocab_k_words(K=VOCAB_SIZE)

Vocab size : 100000.0
CPU times: user 7.52 s, sys: 1.2 s, total: 8.72 s
Wall time: 8.86 s


In [24]:
sentences = ['Everyone really likes the newest benefits',
 'The Government Executive articles housed on the website are not able to be searched .',
 'I like him for the most part , but would still enjoy seeing someone beat him .',
 'My favorite restaurants are always at least a hundred miles away from my house .',
 'What a day !',
 'What color is it ?',
 'I know exactly .']

In [26]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/justingosses/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [27]:
ls

README.md            extract_features.py  [34mglove.840B.300d.txt[m[m/
[34m__pycache__[m[m/         [34mglove.6B[m[m/            models.py
[34mencoder[m[m/             glove.6B.zip         playground_1.ipynb


In [49]:
tokenizer = nltk.data.load('./tokenizers/punkt/english.pickle')

def format_text(text):
    """Split ``text`` into sentences after spacing out its punctuation.

    Every character in ``string.punctuation`` is replaced by itself wrapped in
    a space on each side, and the padded text is handed to the module-level
    ``tokenizer``.

    Returns whatever ``tokenizer.tokenize`` produces for the padded text.
    """
    # ``tokenizer`` is only read here, so no ``global`` declaration is required.
    spaced_punctuation = {mark: " {0} ".format(mark) for mark in string.punctuation}
    translation_table = str.maketrans(spaced_punctuation)
    return tokenizer.tokenize(text.translate(translation_table))

text = 'Everyone really likes the newest benefits. The Government Executive articles housed on the website are not able to be searched.'\
'I like him for the most part, but would still enjoy seeing someone beat him. My favorite restaurants are always at least a hundred '\
'miles away from my house. What a day! What color is it? I know exactly.'

sentences = format_text(text)
sentences

['Everyone really likes the newest benefits .',
 'The Government Executive articles housed on the website are not able to be searched .',
 'I like him for the most part ,  but would still enjoy seeing someone beat him .',
 'My favorite restaurants are always at least a hundred miles away from my house .',
 'What a day !',
 'What color is it ?',
 'I know exactly .']

In [50]:
embeddings = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 58/81 (71.6%)
Speed : 42.7 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 7


In [51]:
embeddings

array([[ 0.04763443,  0.08569701,  0.01288608, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.07242841,  0.07847229,  0.02677959, ...,  0.04605662,
         0.        ,  0.08023392],
       [ 0.05768718,  0.05348117, -0.01089828, ..., -0.00951468,
        -0.0298144 ,  0.05962453],
       ...,
       [ 0.03775015,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.07043603,  0.07732859,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.07067885,  0.02220628,  0.        , ...,  0.        ,
         0.        ,  0.00644431]], dtype=float32)

In [52]:
len(embeddings[0])

4096

In [53]:
np.linalg.norm(model.encode(['the cat eats.'], bsize=128, tokenize=False, verbose=True))


Nb words kept : 2/5 (40.0%)
Speed : 58.3 sentences/s (cpu mode, bsize=128)


3.0492694

In [54]:
def cosine(u, v):
    """Return the cosine similarity of ``u`` and ``v``.

    The dot product of the two inputs is divided by the product of their
    norms as computed by ``np.linalg.norm``. There is no guard for a
    zero-norm input.
    """
    inner_product = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return inner_product / norm_product



In [68]:
# NOTE(review): with tokenize=False the trailing '.' presumably stays attached to
# the last word ('eats.' / 'drinks.'). The output below keeps only 2/5 words for
# each sentence, so both inputs likely reduce to the same known words, which would
# explain a similarity of exactly 1.0. Compare with the default-tokenize call in
# the next cell.
cosine(model.encode(['the cat eats.'], bsize=128, tokenize=False, verbose=True)[0], model.encode(['the cat drinks.'], bsize=128, tokenize=False, verbose=True)[0])

Nb words kept : 2/5 (40.0%)
Speed : 62.9 sentences/s (cpu mode, bsize=128)
Nb words kept : 2/5 (40.0%)
Speed : 73.4 sentences/s (cpu mode, bsize=128)


1.0

In [69]:
cosine(model.encode(['the cat eats.'])[0], model.encode(['the cat drinks.'])[0])


0.90281737

In [70]:
cosine(model.encode(['the cat eats.'])[0], model.encode(['the cat drinks.'])[0])


0.90281737

In [85]:
sentence_one_A = "AS HEAD OF ANALYTICS YOU WILL Lead the end-to-end structuration of Quidnet as a lean digital company."
senence_one_B = "Design the company Big-Data architecture from on-site sensors to cloud processes to laptop solutions, turning data into information."

In [86]:
cosine(model.encode([sentence_one_A])[0], model.encode([senence_one_B])[0])


0.85257906

In [73]:
def compare_two_sentences(sentenceOne,sentenceTwo):
    """Score how similar two sentences are using their sentence-level embeddings.

    Each string is encoded on its own with the module-level ``model``; the two
    resulting vectors are then passed to ``cosine``.

    Returns the value produced by ``cosine``.
    NOTE(review): intended as a score between 0 and 1, but nothing here clamps
    the cosine — confirm the actual range.
    """
    embedding_one = model.encode([sentenceOne])[0]
    embedding_two = model.encode([sentenceTwo])[0]
    return cosine(embedding_one, embedding_two)


In [87]:
def creates_score_card(job_description_prepped,resume_info_prepped):
    """Placeholder for building a score card from two prepared documents.

    Both arguments are meant to be arrays of strings, one array per document.
    Not implemented yet: the inputs are ignored and an empty string is returned.

    Rough sketch of the intended result, tidied from the original note; the
    shape and the "matches" key name are tentative:

        [{"id_number": 0,
          "string": "Here is where the actual string goes in the job description",
          "matches": [{"resume_string": "Project description",
                       "resume_string_id": "String1",
                       "project_number": 4,
                       "cosine_sim": 0.34}]}]
    """
    score_card = ""
    return score_card

In [88]:
def add_array_of_top_five_ids(score_card):
    """Placeholder for attaching the top-five resume matches to a score card.

    Intended behaviour: for each job-description line, find the five resume
    strings with the highest cosine similarity and store their IDs on that
    line's entry.

    Not implemented yet: ``score_card`` is returned as received.
    """
    unchanged_score_card = score_card
    return unchanged_score_card

In [89]:
def calls_comparison_on_all_pairs():
    """Placeholder for scoring every job-description / resume string pair.

    Planned steps:
      1. Take in the two documents, each an array of strings.
      2. Call the comparison for every pairing of one string from each array.
      3. Place each score in the result data structure.
      4. Return the completed scores in that data structure.

    Not implemented yet: the function takes no arguments (TODO: the two
    documents still need to be added to the signature) and returns an empty
    string.
    """
    completed_scores = ""
    return completed_scores

In [90]:
np.linalg.norm(model.encode(['the cat eats.']))


3.3647213

In [91]:
cosine(model.encode(['the cat eats.'])[0], model.encode(['the cat drinks.'])[0])


0.90281737

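Need to investigate why the visualization step fails. The `KeyError: '<s>'` below suggests that the sentence-start token `<s>` has no entry in the loaded word vectors (the earlier "Nb words kept : 2/5" outputs point the same way), so `model.visualize` may have a baked-in assumption about which embeddings are available or where they reside. TODO: confirm.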

In [98]:
_, _ = model.visualize('The cat is drinking milk.')


KeyError: '<s>'