# Word Representations

## *"I know words. I have the best words!"*
    - Noam Chomsky

## Discrete Sparse Representations

In [None]:
! pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9681 sha256=011bd3276a3b85f15c788c9e864d4b02b2974619342a5d84a5d13ad176939a9e
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [1]:
import wget
url = 'https://raw.githubusercontent.com/dirkhovy/NLPclass/master/data/reviews.full.tsv.zip'
wget.download(url, 'reviews.full.tsv.zip')

'reviews.full.tsv.zip'

In [2]:
from zipfile import ZipFile
with ZipFile('reviews.full.tsv.zip', 'r') as zf:
    zf.extractall()

In [3]:
import pandas as pd
df = pd.read_csv('reviews.full.tsv', sep='\t', nrows=100000)
documents = df.text.tolist()
print(documents[:4])

["Prices change daily and if you want to really research the price continually at many different sites , I have found cheaper cars elsewhere . However , if you don ' t have a lot of time to research the price , this site has always been among the top three ( e . g ., cheapest ) of the ten sites I use to reserve a car .", 'and the fact that they will match other companies is awesome !!', "Used Paypal for my buying and selling for the past 0 years and never had an issue they didn ' t resolve to my satisfaction .", "I ' ve made two purchases on CJ ' s for Fallout : New Vegas and The Elder Scrolls V : Skyrim . I have been satisfied by both , being extremely cheaper than the Steam versions . The Autokey system that CJ ' s uses is genius . I recommend this site to anyone who is a PC gamer !"]


In [56]:
df.groupby("category").gender.value_counts()

category                    gender
Accounting                  M          10
                            F           8
Activewear                  M         326
                            F         240
Affiliate Marketing         M          24
                                     ... 
Wintersport                 F          18
Workout Equipment           F           7
                            M           3
games consoles accessories  M           6
                            F           4
Name: gender, Length: 595, dtype: int64

In [57]:
df.head(10)

Unnamed: 0,score,category,uid,gender,age,text
0,5,Car Rental,899881,F,50,Prices change daily and if you want to really ...
1,5,Fitness & Nutrition,828184,M,32,and the fact that they will match other compan...
2,5,Electronic Payment,1698375,M,48,Used Paypal for my buying and selling for the ...
3,5,Gaming,3324079,M,29,I ' ve made two purchases on CJ ' s for Fallou...
4,4,Jewelry,719816,F,29,I was very happy with the diamond that I order...
5,5,Security Equipment,5630105,F,66,I signed up with front point security 0 months...
6,5,Electronics,6929926,M,69,First off I usually never get extended warrant...
7,5,Gaming,2364273,M,20,"The games come , no worries , they are reputab..."
8,1,Media & Marketing,2561769,F,32,We worked hard to send out email invitations f...
9,4,Shoes,2561769,F,32,I am in love with all the free movies and show...


In [115]:
from sklearn.feature_extraction.text import CountVectorizer

small_vectorizer = CountVectorizer()

sentences_2 = documents[:10]

X1 = small_vectorizer.fit_transform(sentences_2)

Let's implement this ourselves:

In [118]:
import numpy as np
num_docs = 2

# collect all word types (= vocabulary)
vocabulary = set()
for document in documents[:num_docs]:
    tokens = document.lower().split()
    vocabulary = vocabulary.union(set(tokens))
vocabulary = sorted(vocabulary)

# create a data matrix with #docs-by-#features dimensions
X = np.zeros((num_docs, len(vocabulary)))

# fill that matrix with sweet counts
for d, document in enumerate(documents[:num_docs]):
    tokens = document.lower().split()
    for i, feature in enumerate(vocabulary):
        X[d, i] = tokens.count(feature)

# show the result as a DataFrame
pd.DataFrame(data=X, columns=vocabulary, dtype=int)

Unnamed: 0,!!,',(,),",",.,".,",a,always,among,...,they,this,three,time,to,top,use,want,will,you
0,0,1,1,1,3,3,1,2,1,1,...,0,1,1,1,3,1,1,1,0,2
1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [119]:
vocabulary

['!!',
 "'",
 '(',
 ')',
 ',',
 '.',
 '.,',
 'a',
 'always',
 'among',
 'and',
 'at',
 'awesome',
 'been',
 'car',
 'cars',
 'change',
 'cheaper',
 'cheapest',
 'companies',
 'continually',
 'daily',
 'different',
 'don',
 'e',
 'elsewhere',
 'fact',
 'found',
 'g',
 'has',
 'have',
 'however',
 'i',
 'if',
 'is',
 'lot',
 'many',
 'match',
 'of',
 'other',
 'price',
 'prices',
 'really',
 'research',
 'reserve',
 'site',
 'sites',
 't',
 'ten',
 'that',
 'the',
 'they',
 'this',
 'three',
 'time',
 'to',
 'top',
 'use',
 'want',
 'will',
 'you']

In [120]:
vocabulary_ = {word: position for position, word in enumerate(vocabulary)}
vocabulary_

{'!!': 0,
 "'": 1,
 '(': 2,
 ')': 3,
 ',': 4,
 '.': 5,
 '.,': 6,
 'a': 7,
 'always': 8,
 'among': 9,
 'and': 10,
 'at': 11,
 'awesome': 12,
 'been': 13,
 'car': 14,
 'cars': 15,
 'change': 16,
 'cheaper': 17,
 'cheapest': 18,
 'companies': 19,
 'continually': 20,
 'daily': 21,
 'different': 22,
 'don': 23,
 'e': 24,
 'elsewhere': 25,
 'fact': 26,
 'found': 27,
 'g': 28,
 'has': 29,
 'have': 30,
 'however': 31,
 'i': 32,
 'if': 33,
 'is': 34,
 'lot': 35,
 'many': 36,
 'match': 37,
 'of': 38,
 'other': 39,
 'price': 40,
 'prices': 41,
 'really': 42,
 'research': 43,
 'reserve': 44,
 'site': 45,
 'sites': 46,
 't': 47,
 'ten': 48,
 'that': 49,
 'the': 50,
 'they': 51,
 'this': 52,
 'three': 53,
 'time': 54,
 'to': 55,
 'top': 56,
 'use': 57,
 'want': 58,
 'will': 59,
 'you': 60}

The result is a *sparse count matrix*:

In [121]:
# indexed representation
import numpy as np
# print(X1)

# dense representation
print(X1.todense())

[[0 0 0 ... 0 0 2]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 1]]


We can access the mapping from vector position to feature names via `get_feature_names()`:

In [122]:
print(small_vectorizer.get_feature_names())

['00', 'able', 'about', 'act', 'addition', 'after', 'again', 'ago', 'all', 'allowing', 'also', 'always', 'am', 'amazon', 'among', 'an', 'and', 'another', 'anyone', 'are', 'area', 'as', 'at', 'autokey', 'away', 'awesome', 'bank', 'be', 'because', 'been', 'being', 'believe', 'both', 'brilliance', 'but', 'buy', 'buying', 'by', 'called', 'car', 'cars', 'change', 'cheaper', 'cheapest', 'cj', 'come', 'companies', 'company', 'confirm', 'confirmed', 'continually', 'could', 'couple', 'customer', 'customers', 'daily', 'decision', 'declined', 'def', 'delayed', 'delivery', 'diamond', 'did', 'didn', 'different', 'direct', 'don', 'down', 'due', 'during', 'earned', 'elder', 'elsewhere', 'email', 'enough', 'ensure', 'error', 'even', 'experience', 'expired', 'extended', 'extra', 'extremely', 'fact', 'fallout', 'far', 'fine', 'first', 'fix', 'follow', 'for', 'found', 'free', 'friendly', 'from', 'front', 'funds', 'gamer', 'games', 'genius', 'gentleman', 'get', 'give', 'great', 'had', 'handful', 'happy', 

The inverse (the mapping from feature names to vector positions) is encoded as a list in `vocabulary_`:

In [123]:
print(small_vectorizer.vocabulary_)

{'prices': 171, 'change': 41, 'daily': 55, 'and': 16, 'if': 116, 'you': 281, 'want': 261, 'to': 239, 'really': 179, 'research': 185, 'the': 229, 'price': 170, 'continually': 50, 'at': 22, 'many': 134, 'different': 64, 'sites': 210, 'have': 109, 'found': 91, 'cheaper': 42, 'cars': 40, 'elsewhere': 72, 'however': 115, 'don': 66, 'lot': 128, 'of': 149, 'time': 238, 'this': 234, 'site': 209, 'has': 108, 'always': 11, 'been': 29, 'among': 14, 'top': 242, 'three': 236, 'cheapest': 43, 'ten': 225, 'use': 252, 'reserve': 186, 'car': 39, 'fact': 83, 'that': 228, 'they': 233, 'will': 273, 'match': 135, 'other': 155, 'companies': 46, 'is': 119, 'awesome': 25, 'used': 253, 'paypal': 164, 'for': 90, 'my': 142, 'buying': 36, 'selling': 200, 'past': 162, 'years': 280, 'never': 144, 'had': 104, 'an': 15, 'issue': 120, 'didn': 63, 'resolve': 187, 'satisfaction': 193, 've': 256, 'made': 130, 'two': 248, 'purchases': 178, 'on': 151, 'cj': 44, 'fallout': 84, 'new': 145, 'vegas': 257, 'elder': 71, 'scrolls

## Terminology 

![](matrix.pdf)

Let's redo this for the entire corpus:

In [124]:
vectorizer = CountVectorizer(analyzer='word', 
                             ngram_range=(1, 2), 
                             min_df=0.001, 
                             max_df=0.75, 
                             stop_words='english')

X = vectorizer.fit_transform(documents[:10000])

print(X.shape)

(10000, 3869)


Calling `transform()` on a new document will apply the vocabulary we collected previously to this new data point. Any words we have not seen before are ignored.


In [133]:
vectorizer.transform([documents[-1]])

<1x3869 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [134]:
documents[-1]

'Never had any issues , easy to use and great prices .'

## Exercise

Use vector operations to find out 
- what the 5 most frequent words are in `X`
- in how many different documents the word `delivery` occurs
- what percentage of the overall corpus that number corresponds to

In [193]:
# your code here

word_counts = pd.DataFrame({"word": vectorizer.get_feature_names(), "count": np.array(X.sum(0)).ravel()})
print(f"Most 5 common words:\n {word_counts.sort_values('count', ascending=False)[:5]}\n")

delivery_index = vectorizer.get_feature_names().index("delivery")
documents_containing_delivery = (X[:, delivery_index] > 0).sum()

print(f"Documents containing 'delivery': {documents_containing_delivery}, percentage: {documents_containing_delivery / X.shape[0] * 100:.2f}%")

Most 5 common words:
          word  count
0          00   2325
1446    great   2268
3011  service   2123
3439     time   2069
2212    order   2056

Documents containing 'delivery': 738, percentage: 7.38%


## Character $n$-grams

We can also use characters to analyze text:

In [198]:
char_vectorizer = CountVectorizer(analyzer='char', 
                                  ngram_range=(2, 6), 
                                  min_df=0.001, 
                                  max_df=0.75)

C = char_vectorizer.fit_transform(documents[:10])
C

<10x8054 sparse matrix of type '<class 'numpy.int64'>'
	with 10806 stored elements in Compressed Sparse Row format>

In [None]:
print(char_vectorizer.vocabulary_)

## Syntactic $n$-grams

In [201]:
import spacy
nlp = spacy.load('en_core_web_sm')

processed_documents = [' '.join(["{}_{}".format(c.lemma_, c.head.lemma_) 
                      for c in nlp(sentence)])
                      for sentence in documents[:100]]

syntax_vectorizer = CountVectorizer()
X = syntax_vectorizer.fit_transform(processed_documents)

In [210]:
print(documents[0])
print(syntax_vectorizer.get_feature_names()[:10])

Prices change daily and if you want to really research the price continually at many different sites , I have found cheaper cars elsewhere . However , if you don ' t have a lot of time to research the price , this site has always been among the top three ( e . g ., cheapest ) of the ten sites I use to reserve a car .
['000000000_number', '0000_00', '0000_august', '0000_be', '0000_com', '0000_dicember', '0000_february', '0000_nov', '0000_october', '0000_on']


In [None]:
print(syntax_vectorizer.vocabulary_)

# Dense Distributed Representations

## Word embeddings with `Word2vec`

In [256]:
from gensim.models import Word2Vec
from gensim.models.word2vec import FAST_VERSION

corpus = [document.split() for document in documents]

# initialize model
w2v_model = Word2Vec(vector_size=100,
                     window=15,
                     sample=0.0001,
                     epochs=50,
                     negative=5, 
                     min_count=100,
                     workers=-1, 
                     hs=0
)

w2v_model.build_vocab(corpus)

w2v_model.train(corpus, 
                total_examples=w2v_model.corpus_count, 
                epochs=w2v_model.epochs)

(0, 0)

In [257]:
print(corpus[0])

['Prices', 'change', 'daily', 'and', 'if', 'you', 'want', 'to', 'really', 'research', 'the', 'price', 'continually', 'at', 'many', 'different', 'sites', ',', 'I', 'have', 'found', 'cheaper', 'cars', 'elsewhere', '.', 'However', ',', 'if', 'you', 'don', "'", 't', 'have', 'a', 'lot', 'of', 'time', 'to', 'research', 'the', 'price', ',', 'this', 'site', 'has', 'always', 'been', 'among', 'the', 'top', 'three', '(', 'e', '.', 'g', '.,', 'cheapest', ')', 'of', 'the', 'ten', 'sites', 'I', 'use', 'to', 'reserve', 'a', 'car', '.']



Now, we can use the embeddings of the model

In [258]:
w2v_model.wv['delivery']

array([ 8.0003524e-03,  7.9075880e-03, -5.4218676e-03, -7.9771737e-03,
        8.0753779e-03, -3.8121247e-03, -4.0653921e-03,  7.1844123e-03,
        5.4238248e-03,  3.8177418e-03, -1.2296462e-03,  8.6690066e-03,
        8.4521556e-03,  6.9737816e-03,  9.0663983e-03, -5.7780431e-03,
        9.9869492e-04, -1.6307544e-03,  6.3399361e-03,  9.7781988e-03,
        5.1675295e-03,  4.0597632e-03,  6.5322802e-03,  5.8781100e-03,
        1.0234594e-04, -8.3432980e-03, -2.1702480e-03,  9.0750270e-03,
       -5.0799586e-03,  1.0089040e-03,  4.8747016e-03, -6.9280099e-03,
        8.1908731e-03,  1.4734459e-03,  2.4277496e-03,  8.3077475e-03,
        1.4566254e-03,  8.8965177e-04, -9.4241379e-03,  9.8624257e-03,
       -9.8660495e-03,  8.5506346e-03,  5.0526452e-03, -5.6564808e-05,
       -7.8733182e-03,  9.3416572e-03, -1.3780380e-03,  5.3466274e-03,
       -3.0297732e-03, -4.9048639e-03,  1.6399146e-04, -8.0301976e-03,
       -6.6411900e-03, -9.4553521e-03,  5.3499150e-03,  4.2900801e-04,
      

In [259]:
w2v_model.wv.most_similar(['delivery'])

[('solutions', 0.36946070194244385),
 ('wasting', 0.32564324140548706),
 ('waste', 0.30824023485183716),
 ('furniture', 0.29574525356292725),
 ('ALL', 0.2858591675758362),
 ('wrap', 0.27942055463790894),
 ('protein', 0.27140310406684875),
 ('health', 0.271228164434433),
 ('ordering', 0.26813775300979614),
 ('House', 0.26511016488075256)]

In [260]:
w2v_model.wv.most_similar(['delivery','concert'])

[('solutions', 0.41811078786849976),
 ('!', 0.34398943185806274),
 ('charge', 0.3014979064464569),
 ('responding', 0.29103609919548035),
 ('won', 0.2770461440086365),
 ('DVD', 0.27391940355300903),
 ('letting', 0.27379173040390015),
 ('enter', 0.2734227478504181),
 ('four', 0.26945194602012634),
 ('ALL', 0.2689569294452667)]

In [261]:
# birthday - present + husband => birthday:present as husband:?
w2v_model.wv.most_similar(positive=['birthday', 'husband'], negative=['present'], topn=5)

[('Ok', 0.3783721625804901),
 ('called', 0.32779461145401),
 ('hitch', 0.32767337560653687),
 ('map', 0.3259039521217346),
 ('AND', 0.31048113107681274)]

In [265]:
word1 = "Cheapest"
word2 = "friendly"

# retrieve the actual vector
# print(w2v_model.wv[word1])

# compare
print(w2v_model.wv.similarity(word1, word2))

# get the 3 most similar words
print(w2v_model.wv.most_similar(word1, topn=3))


0.044036888
[('god', 0.2920132875442505), ('Italy', 0.2912299335002899), ('scam', 0.2818167209625244)]



### Exercise
Use `spacy` to restrict the words in the tweets to *content words*, i.e., nouns, verbs, and adjectives. Transform the words to lower case and add the POS with an underderscore. E.g.:

`love_VERB old-fashioneds_NOUN`

This also allows us to distinguish between homographs, i.e., words that are written the same, but belong to different word classes, e.g., *love* in "I **love** old-fashioneds" vs. "He felt so sick, it must have been **love**".


Make sure to exclude sentences that contain none of the above.

Write the resulting corpus to a variable called `word_corpus`.

In [None]:
# Your code here

Rerun the `Word2vec` model from above on the new data set and test the words out

In [None]:
# Your code here

## Exercise

Train 4 more `Word2vec` models and average the resulting embedding matrices.

In [None]:
# Your code here


## Document embeddings with `Doc2Vec`

In [None]:
df.head()

Unnamed: 0,score,category,uid,gender,age,text
0,5,Car Rental,899881,F,50,Prices change daily and if you want to really ...
1,5,Fitness & Nutrition,828184,M,32,and the fact that they will match other compan...
2,5,Electronic Payment,1698375,M,48,Used Paypal for my buying and selling for the ...
3,5,Gaming,3324079,M,29,I ' ve made two purchases on CJ ' s for Fallou...
4,4,Jewelry,719816,F,29,I was very happy with the diamond that I order...


In [266]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import FAST_VERSION
from gensim.models.doc2vec import TaggedDocument

corpus = []

for row in df.iterrows():
    label = row[1].score
    text = row[1].text
    corpus.append(TaggedDocument(words=text.split(), tags=[str(label)]))

print('done')
d2v_model = Doc2Vec(vector_size=100, 
                    window=15,
                    hs=0,
                    sample=0.000001,
                    negative=5,
                    min_count=100,
                    workers=-1,
                    epochs=500,
                    dm=0, 
                    dbow_words=1)

d2v_model.build_vocab(corpus)

d2v_model.train(corpus, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

done


We can now look at the elements

In [292]:
d2v_model.dv.doctags

AttributeError: 'KeyedVectors' object has no attribute 'doctags'

In [311]:
target_doc = '5'

similar_docs = d2v_model.dv.most_similar(target_doc, topn=5)
print(similar_docs)

[('4', 0.1626252382993698), ('1', -9.823018626775593e-05), ('2', -0.016333024948835373), ('3', -0.07919131964445114)]


In [294]:
d2v_model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x18a06aebfd0>

## Exercise

What are the 10 most similar ***words*** to each category?

In [314]:
# your code here
_ = [print(f"score: {i}\n words: {d2v_model.wv.most_similar(d2v_model.dv[i], topn=10)}\n") for i in range(5)]

score: 0
 words: [('marks', 0.3785155117511749), ('help', 0.3546956777572632), ('gift', 0.32810214161872864), ('guaranteed', 0.29891666769981384), ('prepaid', 0.2799282371997833), ('cancellation', 0.27858394384384155), ('corrected', 0.2780011296272278), ('recieve', 0.2756739854812622), ('My', 0.26788169145584106), ('rectified', 0.2612289786338806)]

score: 1
 words: [('Will', 0.3301686644554138), ('placed', 0.32967785000801086), ('minimum', 0.32175981998443604), ('girl', 0.3141840696334839), ('great', 0.3033904731273651), ('trust', 0.2803640067577362), ('means', 0.2801167070865631), ('weather', 0.275889128446579), ('solve', 0.27240413427352905), ('valid', 0.267742782831192)]

score: 2
 words: [('sons', 0.32430171966552734), ('euros', 0.29904088377952576), ('faith', 0.28802356123924255), ('love', 0.2868878245353699), ('stag', 0.28481584787368774), ('catch', 0.2807292342185974), ('contents', 0.2801634967327118), ('difficulty', 0.2780471742153168), ('To', 0.27764394879341125), ('everythin