In [61]:
#Text analysis basics in Python
# Bigram/trigrams and topic modeling

In [62]:
# corpus = [
# 'Great course. Love the professor.',
# 'Great content. Textbook was great',
# 'This course has very hard assignments. Great content.',
# 'Love the professor.',
# 'Hard assignments though',
# 'Hard to understand.'
# ]
corpus = [
'Good nachos. Love the cheese',
'Great texture. Nice and crispy',
'The olives on the nachos were nasty',
'Loved the salsa though.',
'Very expensive,'
'Very delicious'
]

In [63]:
import pandas as pd
df = pd.DataFrame(corpus)
df.columns = ['reviews']

In [64]:
#Next, we can explore some word associations. N-grams analyses are often used to see which words often show up together.

In [65]:
#An n-gram is a contiguous sequence of n items from a given sample of text or speech.

In [66]:
from nltk.corpus import stopwords
stoplist = stopwords.words('english') + ['though']

In [67]:
#Now we can remove the stop words and work with some bigrams/trigrams. 
#The function CountVectorizer “convert a collection of text documents to a matrix of token counts”

In [68]:
from sklearn.feature_extraction.text import CountVectorizer
c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2,3))
# matrix of ngrams
ngrams = c_vec.fit_transform(df['reviews'])
# count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)

In [69]:
# list of ngrams
vocab = c_vec.vocabulary_

In [70]:
df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram/trigram'})

In [71]:
df_ngram

Unnamed: 0,frequency,bigram/trigram
0,1,texture nice crispy
1,1,texture nice
2,1,olives nachos nasty
3,1,olives nachos
4,1,nice crispy
5,1,nachos nasty
6,1,nachos love cheese
7,1,nachos love
8,1,loved salsa
9,1,love cheese


In [72]:
# Non-Negative Matrix Factorization (NMF) is a matrix decomposition method
# we can use to produce 3 topics and we showed 3 bigrams/trigrams in each topic. 
# How it actually does it takes some math, but don't worry about the details.

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline

In [74]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist, ngram_range=(2,3))
nmf = NMF(n_components=3)
pipe = make_pipeline(tfidf_vectorizer, nmf)
pipe.fit(df['reviews'])



Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(ngram_range=(2, 3),
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('nmf', NMF(n_components=3))])

In [75]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += ", ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [76]:
print_top_words(nmf, tfidf_vectorizer.get_feature_names(), n_top_words=3)

Topic #0: loved salsa, olives nachos nasty, olives nachos
Topic #1: texture nice crispy, texture nice, nice crispy
Topic #2: expensive delicious, olives nachos nasty, olives nachos





In [77]:
#Looks like topic 0 is about the professor and courses; 
#topic 1 is about the assignment,
#and topic 3 is about the textbook. 
#Note that we do not know what is the best number of topics here. We used 3 just because our sample size is very small. 

In [78]:
# LatentDirichletAllocation
# It is a topic model that is used for discovering abstract topics from a collection of documents.

In [79]:
from sklearn.decomposition import LatentDirichletAllocation
tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist, ngram_range=(2,3))
lda = LatentDirichletAllocation(n_components=3)
pipe = make_pipeline(tfidf_vectorizer, lda)
pipe.fit(df['reviews'])

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(ngram_range=(2, 3),
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('latentdirichletallocation',
                 LatentDirichletAllocation(n_components=3))])

In [80]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += ", ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [81]:
print_top_words(lda, tfidf_vectorizer.get_feature_names(), n_top_words=3)

Topic #0: loved salsa, expensive delicious, texture nice
Topic #1: olives nachos nasty, olives nachos, nachos nasty
Topic #2: nachos love cheese, nachos love, love cheese



In [82]:
# And now you might try it out with another corpus...