In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
%matplotlib inline



In [2]:
# Load spaCy's English pipeline once; the resulting `nlp` object is shared by
# every parsing step in this notebook (the nlp.pipe calls and the nlp.vocab tweak).
nlp = spacy.load('en')

In [3]:
def punct_space(token):
    """
    Report whether a token should be dropped from the corpus.

    Despite the name, three kinds of token are rejected: punctuation,
    whitespace and stop words. `token` only needs to expose the
    `is_punct`, `is_space` and `is_stop` attributes.
    """

    is_filler = token.is_punct or token.is_space
    return is_filler or token.is_stop

def line_review(filename):
    """
    Generator that reads reviews from `filename`, one review per line,
    and un-escapes the original line breaks in the text.

    The file is read as UTF-8. Line breaks that belong to a review are
    stored in the file as the two characters backslash and "n"; they are
    turned back into real newlines here. Every yielded string keeps the
    newline that terminated its line in the file.
    """

    # Use the built-in open() with newline='\n' so records are split on '\n'
    # only. codecs.open() iterates with str.splitlines(), which also breaks
    # on characters such as U+2028, U+0085 or a lone '\r' and would cut one
    # review into several records. This also removes the reliance on
    # `codecs`, which the notebook only imports in a later cell.
    with open(filename, encoding='utf_8', newline='\n') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function that streams the reviews in `filename` through
    spaCy and yields, for each review, a single space-joined string of
    the lemmas that survive the punct_space filter
    (one string per review, not one per sentence)
    """

    parsed_reviews = nlp.pipe(line_review(filename),
                              batch_size=10000, n_threads=4)

    for doc in parsed_reviews:
        kept_lemmas = [tok.lemma_ for tok in doc if not punct_space(tok)]
        yield u' '.join(kept_lemmas)

In [4]:
import os
# Input corpus consumed by line_review(), which reads it one review per line.
# os.path.join with a single argument returns it unchanged, so the path is
# relative to the notebook's working directory.
review_text_path = os.path.join('review_text_sample.txt')

In [5]:
# Destination file for the lemmatized reviews (written one review per line).
# Note: this is a file path, not the sentences themselves.
unigram_sentences = os.path.join('unigram_reviews_all.txt')

In [6]:
import codecs

# Manual switch: the condition is always true as written, so the lemmatized
# corpus is rebuilt on every run; edit it by hand to skip this step.
if 1 == 1:
    with codecs.open(unigram_sentences, 'w', encoding='utf-8') as unigram_file:
        for lemmatized_review in lemmatized_sentence_corpus(review_text_path):
            unigram_file.write(lemmatized_review + '\n')

In [None]:
# Iterable over the unigram file. Later cells iterate it to print samples,
# train the bigram Phrases model and write the bigram sentences.
# NOTE: this cell has to be run before those cells; the NameError outputs
# recorded in this notebook come from it never having been executed.
unigram_sentences_parser = LineSentence(unigram_sentences)

In [7]:
import itertools as it
# Preview the first ten lemmatized reviews, each followed by a blank line.
for sample_tokens in it.islice(unigram_sentences_parser, 10):
    print(u' '.join(sample_tokens), u'', sep=u'\n')

NameError: name 'unigram_sentences_parser' is not defined

In [8]:
# File the bigram Phrases model is saved to and reloaded from.
bigram_model_filepath = os.path.join('bigram_model_all')

In [9]:
# Fit a gensim Phrases model on the unigram corpus and persist it so it can
# be restored with Phrases.load. Requires unigram_sentences_parser to exist.
bigram_model = Phrases(unigram_sentences_parser)
bigram_model.save(bigram_model_filepath)

NameError: name 'unigram_sentences_parser' is not defined

In [None]:
# Restore the bigram model from the file written by bigram_model.save().
bigram_model = Phrases.load(bigram_model_filepath)

In [None]:
# Destination file for the reviews after the bigram model has been applied.
bigram_sentences_filepath = os.path.join('bigram_sentences_all.txt')

In [None]:
# Manual switch: always true as written, so the bigram file is rewritten on
# every run; edit the condition by hand to skip this step.
if 1 == 1:
    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf-8') as bigram_file:
        for unigram_tokens in unigram_sentences_parser:
            bigram_tokens = bigram_model[unigram_tokens]
            bigram_file.write(u' '.join(bigram_tokens) + '\n')

In [None]:
# Iterable over the bigram file; used below for the preview, for training the
# trigram model and for writing the trigram sentences.
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [None]:
# Preview the first ten bigram-joined reviews, each followed by a blank line.
for sample_tokens in it.islice(bigram_sentences, 10):
    print(u' '.join(sample_tokens), u'', sep=u'\n')

In [None]:
# File the trigram Phrases model is saved to and reloaded from.
trigram_model_filepath = os.path.join('trigram_model_all')

In [None]:
# Fit a second Phrases model, this time on the bigram-joined corpus.
trigram_model = Phrases(bigram_sentences)

# Persist the fitted model to trigram_model_filepath.
trigram_model.save(trigram_model_filepath)

# Read it straight back so the rest of the notebook uses the on-disk copy.
trigram_model = Phrases.load(trigram_model_filepath)

In [None]:
# Destination file for the reviews after the trigram model has been applied.
trigram_sentences_filepath = os.path.join('trigram_sentences_all.txt')

In [None]:
# Apply the trigram model to every bigram-joined review and store the result,
# one review per line.
with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as trigram_file:
    for bigram_tokens in bigram_sentences:
        trigram_tokens = trigram_model[bigram_tokens]
        trigram_file.write(u' '.join(trigram_tokens) + '\n')

In [None]:
# Iterable over the trigram file; used by the preview loop that follows.
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [None]:
# Preview the first ten trigram-joined reviews, each followed by a blank line.
for sample_tokens in it.islice(trigram_sentences, 10):
    print(u' '.join(sample_tokens), '', sep='\n')

In [None]:
# Destination file for the fully transformed reviews; it is later re-read to
# build the dictionary and the bag-of-words corpus.
trigram_reviews_filepath = os.path.join('trigram_transformed_reviews.txt')

In [None]:
# Mark the token "'s" as a stop word in the shared vocabulary. This mutates
# `nlp` in place; presumably it is meant to make punct_space(), which checks
# token.is_stop, drop possessive clitics in the pass below.
nlp.vocab["'s"].is_stop = True

In [None]:
# Re-parse the raw reviews and push each one through the whole pipeline:
# lemmatize -> bigram phrases -> trigram phrases -> stop-word / short-term filter.
with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as reviews_out:
    parsed_reviews = nlp.pipe(line_review(review_text_path),
                              batch_size=10000, n_threads=3)

    for doc in parsed_reviews:
        lemmas = [tok.lemma_ for tok in doc if not punct_space(tok)]

        # chain both phrase models: unigrams -> bigrams -> trigrams
        phrased_terms = trigram_model[bigram_model[lemmas]]

        # discard leftover stop words and terms of two characters or fewer
        kept_terms = [term for term in phrased_terms
                      if term not in spacy.en.STOPWORDS and len(term) > 2]

        reviews_out.write(u' '.join(kept_terms) + '\n')

In [None]:
# Show the first raw review followed by the first transformed review.
print(u'Original:' + u'\n')
for raw_review in it.islice(line_review(review_text_path), 1):
    print(raw_review)

print(u'-------' + u'\n')
print(u'Transformed:' + u'\n')
with codecs.open(trigram_reviews_filepath, encoding='utf-8') as transformed_file:
    for transformed_review in it.islice(transformed_file, 1):
        print(transformed_review)

In [None]:
import warnings

# Import pyLDAvis with PendingDeprecationWarning silenced; the filter is
# restored when the `with` block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=PendingDeprecationWarning)
    import pyLDAvis
    import pyLDAvis.gensim

    # Indented, so this import also runs inside the warning-suppressed block.
    from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
import _pickle as pickle

In [None]:
import os
# File the gensim Dictionary is saved to and reloaded from.
trigram_dictionary_filepath = os.path.join('trigram_dict_all.dict')

In [None]:

# Tokens to strip from the dictionary: stray punctuation marks plus
# contraction fragments, in both straight and curly apostrophe spellings.
stop = {
    # punctuation and the empty token
    '.', ',', '(', ')', "'", '"', ':', '', '...', '-', '``', ';', ";'", '&',
    # contraction fragments
    "'s", "’s",
    "n't", "n’t",
    "there_'", "there_’",
    "they_'re", "they_’re",
    "he_'", "he_’",
    "it_’", "it_'",
}

In [None]:
# Iterable over the transformed reviews file.
trigram_reviews = LineSentence(trigram_reviews_filepath)

# Build the token <-> id mapping from the corpus.
trigram_dictionary = Dictionary(trigram_reviews)

# Keep only terms that occur in at least 10 reviews and in no more than 40%
# of all reviews.
trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)

# Dictionary.filter_tokens() expects token *ids*, not token strings. Passing
# the strings themselves matches no id and silently leaves every stop token
# in the dictionary, so translate the stop tokens that are present to ids.
stop_ids = [trigram_dictionary.token2id[token]
            for token in stop
            if token in trigram_dictionary.token2id]
trigram_dictionary.filter_tokens(bad_ids=stop_ids)

# Reassign ids so they are contiguous after the removals.
trigram_dictionary.compactify()

# Persist the dictionary, then reload it so later cells use the on-disk copy.
trigram_dictionary.save(trigram_dictionary_filepath)

trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

In [None]:
# File the bag-of-words corpus is serialized to by MmCorpus.
trigram_bow_filepath = os.path.join('trigram_bow_corpus_all.mm')

In [None]:
def trigram_bow_generator(filepath):
    """
    generator function that reads reviews from `filepath` through
    LineSentence and yields each one as a bag-of-words vector produced
    by the module-level trigram_dictionary
    """
    yield from (trigram_dictionary.doc2bow(tokens)
                for tokens in LineSentence(filepath))

In [None]:
# Stream the bag-of-words vectors of every transformed review to disk.
MmCorpus.serialize(trigram_bow_filepath, trigram_bow_generator(trigram_reviews_filepath))

# Open the serialized corpus; this is the corpus the LDA model is trained on.
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

In [None]:
# File the trained LDA model is saved to and reloaded from.
lda_model_filepath = os.path.join('lda_model_all')

In [None]:
# Train a 50-topic LDA model on the bag-of-words corpus using 3 workers.
# Every warning category is silenced, but only while the model is trained.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda = LdaMulticore(corpus=trigram_bow_corpus, num_topics=50,
                       id2word=trigram_dictionary, workers=3)

# Round-trip the model through disk so later cells use the saved copy.
lda.save(lda_model_filepath)
lda = LdaMulticore.load(lda_model_filepath)

In [None]:
def explore_topic(topic_number, topn=25):
    """
    Print the top terms of one LDA topic as a two-column table.

    topic_number -- id of the topic to display, passed to lda.show_topic
    topn         -- how many terms to list (default 25)

    Writes a header line, a blank line and then one "term  frequency" row
    per term to stdout, with the frequency shown to three decimals.
    Returns nothing. Reads the module-level `lda` model.
    """

    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    # Forward the caller's topn: it used to be hard-coded to 25 here, which
    # turned the parameter into a no-op.
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))



In [None]:
# Inspect topic 1 using explore_topic's default topn.
explore_topic(topic_number=1)

In [None]:
# Bare expression: shows the dictionary file path as this cell's output.
trigram_dictionary_filepath