Automatically detect common phrases – aka multi-word expressions, word n-gram collocations – from a stream of sentences.

In [1]:
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser

In [2]:
# Load training data: `datapath` (from gensim's test utilities) locates
# 'testcorpus.txt', and `Text8Corpus` wraps that file as an iterable of
# sentences.
sentences = Text8Corpus(datapath('testcorpus.txt'))

In [3]:
# The training corpus must be a sequence (stream, generator) of sentences,
# with each sentence a list of tokens.
# Peek at the first 10 tokens of the first sentence. `next(iter(...))` pulls
# only that one sentence from the stream instead of materializing the whole
# corpus into a list just to index element 0.
print(next(iter(sentences))[:10])

['computer', 'human', 'interface', 'computer', 'response', 'survey', 'system', 'time', 'user', 'interface']


In [4]:
# Train a toy bigram model on the `sentences` corpus.
# NOTE(review): min_count=1 and threshold=1 are presumably set this low so
# that phrases can form on such a tiny corpus — confirm against the Phrases
# documentation before reusing these values on real data.
phrases = Phrases(sentences, min_count=1, threshold=1)

In [5]:
# Apply the trained phrases model to a new, unseen sentence (a list of tokens).
# Indexing the model with a sentence returns that sentence with each detected
# phrase joined into a single token.
phrases[['trees', 'graph', 'minors']]

['trees_graph', 'minors']

In [6]:
# The toy model considered "trees graph" a single phrase => joined the two
# tokens into a single token, `trees_graph`.

In [7]:
# Update the model with two new sentences on the fly. As with the training
# corpus, each sentence is passed as a list of tokens.
phrases.add_vocab([["hello", "world"], ["meow"]])

In [8]:
# Export the trained model to a `Phraser`: the exported model uses less RAM
# and processes sentences faster, but it can no longer be updated.
bigram = Phraser(phrases)
bigram[['trees', 'graph', 'minors']]  # apply the exported model to a sentence

['trees_graph', 'minors']

In [11]:
# Apply the exported model to each sentence of a corpus: indexing with the
# whole corpus (rather than with a single sentence) gives an iterable of
# transformed sentences.
for sent in bigram[sentences]:
    pass  # placeholder: a real pipeline would consume `sent` here

In [12]:
from gensim.models import Phrases
# Self-contained example: learn bigrams from three short documents.
documents = [
    "the mayor of new york was there",
    "machine learning can be useful sometimes",
    "new york mayor was present",
]

# Tokenize on single spaces: one list of tokens per document.
sentence_stream = [text.split(" ") for text in documents]

# NOTE: this rebinds `bigram` to a new Phrases model trained on the three
# documents above, replacing whatever `bigram` referred to before.
bigram = Phrases(
    sentence_stream,
    min_count=1,
    threshold=2,
)

# "new york" appears in two of the documents; the model joins it into the
# single token `new_york` when applied to this sentence.
sent = ['the', 'mayor', 'of', 'new', 'york', 'was', 'there']
print(bigram[sent])

['the', 'mayor', 'of', 'new_york', 'was', 'there']
