In [3]:
import matplotlib.pyplot as plt
from collections import Counter
import gensim
import numpy as np
import spacy
import math

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim

import nltk
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

import os, re, operator, warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
def clean(text):
    '''
    Return ``text`` with every non-ASCII character replaced by a single space.

    Characters whose code point is below 128 are kept unchanged; every other
    character is swapped for ' ', one for one.
    '''
    ascii_only = []
    for char in text:
        ascii_only.append(char if ord(char) < 128 else ' ')
    return str(''.join(ascii_only))

# Locate 'lee_background.cor' under the installed gensim package's test/test_data folder
test_data_dir = os.sep.join([gensim.__path__[0], 'test', 'test_data'])
lee_train_file = os.sep.join([test_data_dir, 'lee_background.cor'])

In [None]:
# Read the whole corpus file. The context manager closes the file handle as
# soon as the read finishes instead of leaving it open.
# NOTE: the name `text` shadows the `text` module imported from
# sklearn.feature_extraction in the imports cell.
with open(lee_train_file) as corpus_file:
    text = corpus_file.read()

# spaCy pipeline used for tokenising/lemmatising, and the NLTK English stop-word list
nlp = spacy.load('en_core_web_lg')
stop_words = stopwords.words('english')

In [None]:
# Flag every NLTK stop word as a stop word in the spaCy vocabulary,
# so that `token.is_stop` is True for it later on
for stop_word in stop_words:
    nlp.vocab[stop_word].is_stop = True

In [None]:
# Replace non-ASCII characters with spaces, then run the spaCy pipeline over the result
ascii_text = clean(text)
doc = nlp(ascii_text)

In [None]:
# Split the parsed corpus into articles: a '\n' token is treated as the end of
# an article, and each article is stored as a list of lemmas.
texts, article = [], []
for w in doc:
    if w.text == '\n':
        # Newline token: close the current article and start a new one
        texts.append(article)
        article = []
    elif not (w.is_stop or w.is_punct or w.like_num):
        # Keep the lemma of tokens that are not stop words, punctuation or number-like
        article.append(w.lemma_)

# If the text does not end with a '\n' token, the last article is still pending;
# keep it instead of silently dropping it.
if article:
    texts.append(article)

In [None]:
# Find bigrams and merge them into single tokens.
# Phrases has to be trained on the tokenised articles (`texts`); the raw string
# `text` would be iterated character by character and yield no word bigrams.
bigram = gensim.models.Phrases(texts)
texts = [bigram[line] for line in texts]

In [None]:
# Build the token dictionary from the articles, then convert every article
# to its bag-of-words representation
dictionary = Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# LSI: fit a 10-topic model on the bag-of-words corpus and show 5 of its topics
lsimodel = LsiModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
)
lsimodel.show_topics(num_topics=5)

In [None]:
# LDA: fit a 10-topic model on the bag-of-words corpus and show 5 of its topics.
# random_state pins the seed so that Restart & Run All reproduces the same topics.
ldamodel = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    random_state=42,
)
ldamodel.show_topics(num_topics=5)

In [None]:
# HDP: fit on the bag-of-words corpus (no topic count is passed here).
# random_state pins the seed so that Restart & Run All reproduces the same topics.
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary, random_state=42)
hdpmodel.show_topics()

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

# Call prepare through the `gensimvis` alias imported above; the previous
# `pyLDAvis.gensim.prepare` spelling referred to a different module name than
# the one this cell imports.
gensimvis.prepare(ldamodel, corpus, dictionary)

In [None]:
def _topic_words(model):
    """Return one list of words per topic yielded by ``model.show_topics(formatted=False)``."""
    return [
        [word for word, _ in word_probs]
        for _, word_probs in model.show_topics(formatted=False)
    ]


# Keep only the words (dropping the weights) of each model's topics
lsitopics = _topic_words(lsimodel)
hdptopics = _topic_words(hdpmodel)
ldatopics = _topic_words(ldamodel)


In [None]:
def _coherence_of(topics):
    """Return the CoherenceModel score of the first 10 entries of ``topics`` against ``texts``."""
    scorer = CoherenceModel(
        topics=topics[:10],
        texts=texts,
        dictionary=dictionary,
        window_size=10,
    )
    return scorer.get_coherence()


# One coherence score per model, computed with identical settings
lsi_coherence = _coherence_of(lsitopics)
hdp_coherence = _coherence_of(hdptopics)
lda_coherence = _coherence_of(ldatopics)


In [None]:
def evaluate_bar_graph(coherences, indices):
    """
    Draw a bar chart of coherence values with pyplot.

    coherences: bar heights, one per model.
    indices: tick labels for the bars; must have the same length as ``coherences``.

    Raises AssertionError when the two sequences differ in length.
    """
    assert len(coherences) == len(indices)
    positions = np.arange(len(coherences))
    plt.bar(
        positions,
        coherences,
        width=0.2,
        tick_label=indices,
        align='center',
    )
    plt.xlabel('Models')
    plt.ylabel('Coherence Value')


In [1]:
# Create the VADER sentiment analyser; `sid` is reused by the scoring loop in a later cell.
# This cell needs the imports cell to have been run first.
sid = SentimentIntensityAnalyzer()

# Score one sample sentence as a quick check.
sid.polarity_scores('I am happy.')

NameError: name 'SentimentIntensityAnalyzer' is not defined

In [None]:
# Join each article's tokens back into one space-separated string for the sentiment scorers
texts_transformed = list(map(' '.join, texts))
texts_transformed

In [None]:
# Print every article followed by its VADER polarity scores
for article_text in texts_transformed:
    print(article_text + ': ')
    print(sid.polarity_scores(article_text))

In [None]:
from textblob import TextBlob

# Single-sentence check; read .sentiment.subjectivity instead for the subjectivity score
TextBlob('I am bored.').sentiment.polarity

# Print every article followed by its TextBlob polarity.
for article_text in texts_transformed:
    print(article_text + ': ')
    # `.sentiment` is an attribute of the TextBlob object, not of the string,
    # so the TextBlob(...) call has to be closed before accessing it.
    scores = TextBlob(article_text).sentiment.polarity
    print(scores)