# Representing phrases – phrase2vec

Encoding words is useful, but usually, we deal with more complex units, such as phrases
and sentences. Phrases are important because they specify more detail than just words.
For example, the phrase delicious fried rice is very diﬀerent than just the word rice.
In this recipe, we will train a word2vec model that uses phrases as well as words.

IMPORTATION

In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [21]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
import logging

SETTING UP logging

In [22]:
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level = logging.INFO)

CORPUS

In [23]:
corpus = [
    ['i', 'love', 'the', 'happy', 'hour', 'specials'],
    ['san', 'francisco', 'is', 'a', 'beautiful', 'city'],
    ['we', 'went', 'to', 'san', 'francisco', 'for', 'a', 'drink'],
    ['the', 'bar', 'has', 'the', 'best', 'happy', 'hour'],
    ['can', 'we', 'go', 'to', 'san', 'francisco', 'next', 'week'],
    ['i', 'always', 'recommend', 'the', 'happy', 'hour', 'menu']
]

for sentence in corpus:
  print(sentence)

['i', 'love', 'the', 'happy', 'hour', 'specials']
['san', 'francisco', 'is', 'a', 'beautiful', 'city']
['we', 'went', 'to', 'san', 'francisco', 'for', 'a', 'drink']
['the', 'bar', 'has', 'the', 'best', 'happy', 'hour']
['can', 'we', 'go', 'to', 'san', 'francisco', 'next', 'week']
['i', 'always', 'recommend', 'the', 'happy', 'hour', 'menu']


phrase2vec

In [24]:
# phrase model training
bigram_model = Phrases(corpus, min_count=2, threshold=2)

In [25]:
# phraser object
bigram_phraser = Phraser(bigram_model)

In [26]:
# apply phraser to corpus
phrased_corpus = [bigram_phraser[sentence] for sentence in corpus]

for sentence in phrased_corpus:
  print(sentence)

['i', 'love', 'the', 'happy_hour', 'specials']
['san_francisco', 'is', 'a', 'beautiful', 'city']
['we', 'went', 'to', 'san_francisco', 'for', 'a', 'drink']
['the', 'bar', 'has', 'the', 'best', 'happy_hour']
['can', 'we', 'go', 'to', 'san_francisco', 'next', 'week']
['i', 'always', 'recommend', 'the', 'happy_hour', 'menu']


TRAIN word2vec on the new corpus

In [28]:
model = Word2Vec(
    phrased_corpus,
    vector_size = 20,
    window = 3,
    min_count = 1,
    sg = 1
)

print("\nSuccessful Training")


Successful Training


TEST PHRASE EMBEDDINGS

In [29]:
phrase = "happy_hour"
if phrase in model.wv:
  print(f"\nVector for '{phrase}' exists in the model vocabulary")

  similar_tokens = model.wv.most_similar(phrase, topn=3)
  for token, score in similar_tokens:
    print(f"'{token}: ' {score:.3f}")

  else:
    print(f"{phrase} was not frequent enough to be included in the model.")

  phrase_2 = "san_francisco"
  if phrase_2 in model.wv:
    print(f"tokens most similar to '{phrase_2}':")
    similar_token_2 = model.wv.most_similar(phrase_2, topn = 3)
    for token, score in similar_token_2:
      print(f"{token}: {score: .3f}")


Vector for 'happy_hour' exists in the model vocabulary
'has: ' 0.387
'city: ' 0.257
'to: ' 0.230
happy_hour was not frequent enough to be included in the model.
tokens most similar to 'san_francisco':
went:  0.437
week:  0.254
has:  0.159
