### **Session Prep**

In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install PyDrive, which the next cells use to download the input files from Google Drive
!pip install -U -q Pydrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and hand the application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials. get_application_default()
# NOTE: this rebinds the name `drive`, which the first cell imported as the
# google.colab.drive module; from here on `drive` is the PyDrive client.
drive = GoogleDrive(gauth)

In [None]:
# Download the reviews workbook from Google Drive into the working directory.
# Source: https://docs.google.com/spreadsheets/d/13g6pResyKlWomR7O-wiRnswIvVa5nGaU/edit?usp=sharing&ouid=108977082471473204318&rtpof=true&sd=true
# NOTE(review): `id` shadows the Python builtin of the same name.
id = "13g6pResyKlWomR7O-wiRnswIvVa5nGaU"
file = drive.CreateFile({'id':id})
file.GetContentFile("E-Commerce Reviews Dataset.xlsx")

In [None]:
# Download the text-normalization notebook from Google Drive and execute it in this kernel.
# Presumably it defines normalize_corpus() and tokenize_text(), which later cells
# call -- TODO confirm against the downloaded notebook.
# https://drive.google.com/file/d/1DEd0NGAZOz43u8TcCRooJQ4JZ-yP3yHS/view?usp=sharing
id = "1DEd0NGAZOz43u8TcCRooJQ4JZ-yP3yHS"
file_1 = drive.CreateFile({'id':id})
file_1.GetContentFile("Text_Normalization_Function.ipynb")
%run "Text_Normalization_Function.ipynb"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Original:   <p>The circus dog in a plissé skirt jumped over Python who wasn't that large, just 3 feet long.</p>
Processed:  ['<', 'p', '>', 'The', 'circus', 'dog', 'in', 'a', 'plissé', 'skirt', 'jumped', 'over', 'Python', 'who', 'was', "n't", 'that', 'large', ',', 'just', '3', 'feet', 'long.', '<', '/p', '>']
Original:   <p>The circus dog in a plissé skirt jumped over Python who wasn't that large, just 3 feet long.</p>
Processed:  <p>The circus dog in a plissé skirt jumped over Python who was not that large, just 3 feet long.</p>
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /ro

In [None]:
#the module 'sys' allows istalling module from inside Jupyter
import sys

!{sys.executable} -m pip install numpy
import numpy as np

!{sys.executable} -m pip install pandas
import pandas as pd

#Natrual Language ToolKit (NLTK)
!{sys.executable} -m pip install nltk
import nltk

!{sys.executable} -m pip install sklearn
from sklearn import metrics
#from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import  CountVectorizer #bag-of-words vectorizer 
from sklearn.decomposition import LatentDirichletAllocation #package for LDA

# Plotting tools

from pprint import pprint
!{sys.executable} -m pip install pyLDAvis #visualizing LDA
import pyLDAvis
import pyLDAvis.sklearn

import matplotlib.pyplot as plt
%matplotlib inline

#define text normalization function
# %run ./Text_Normalization_Function.ipynb #defining text normalization function

#ignore warnings about future changes in functions as they take too much space
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)



In [None]:
# Read the workbook downloaded in the session-prep cells into a DataFrame.
df = pd.read_excel("E-Commerce Reviews Dataset.xlsx")

In [None]:
df.columns

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

In [None]:
df.isnull().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               23486 non-null  float64
 1   Clothing ID              23486 non-null  float64
 2   Age                      23486 non-null  float64
 3   Title                    19676 non-null  object 
 4   Review Text              22641 non-null  object 
 5   Rating                   23486 non-null  float64
 6   Recommended IND          23486 non-null  float64
 7   Positive Feedback Count  23486 non-null  float64
 8   Division Name            23472 non-null  object 
 9   Department Name          23472 non-null  object 
 10  Class Name               23472 non-null  object 
dtypes: float64(6), object(5)
memory usage: 2.0+ MB


In [None]:
# Keep only the three columns used below and drop rows missing any of them.
# reset_index(drop=True) renumbers the surviving rows from 0, so df_review's
# index no longer lines up row-for-row with df's index.
df_review=df.loc[:,["Review Text","Age","Department Name"]].dropna().reset_index(drop=True)

In [None]:
len(df_review)

22628

In [None]:
# The selection masks must be computed from df_review itself. df_review was built
# with dropna().reset_index(drop=True), so a mask computed on `df` is aligned by
# label to different rows and .loc would silently return the wrong reviews.
def _reviews_for(age_mask, department):
    """Return the review texts of df_review rows that match age_mask and belong to department."""
    department_mask = df_review['Department Name'] == department
    return df_review.loc[age_mask & department_mask, "Review Text"].tolist()

_age = df_review['Age']
_is_GenZ = _age <= 25
_is_Millen = (_age >= 26) & (_age <= 41)
# NOTE(review): strictly greater than 41, so that 41-year-olds are counted once
# (as Millennials) rather than in both cohorts -- confirm the intended boundary.
_is_GenX = _age > 41

df_GenZ_tops = _reviews_for(_is_GenZ, "Tops")
df_GenZ_dress = _reviews_for(_is_GenZ, "Dresses")
df_GenZ_bottoms = _reviews_for(_is_GenZ, "Bottoms")

df_Millen_tops = _reviews_for(_is_Millen, "Tops")
df_Millen_dress = _reviews_for(_is_Millen, "Dresses")
df_Millen_bottoms = _reviews_for(_is_Millen, "Bottoms")

df_GenX_tops = _reviews_for(_is_GenX, "Tops")
df_GenX_dress = _reviews_for(_is_GenX, "Dresses")
df_GenX_bottoms = _reviews_for(_is_GenX, "Bottoms")

In [None]:
# df_review_list = df_review["Review Text"].tolist()

In [None]:
#len(df_review)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print, for every topic of a fitted model, its highest-weighted words.

    model         -- object exposing `components_` (one row of word weights per topic)
    feature_names -- sequence mapping a column position to its word
    no_top_words  -- how many words to print per topic
    """
    # slicing the ascending argsort backwards down to this bound yields the
    # positions of the `no_top_words` largest weights, largest first
    stop = -no_top_words - 1
    for number, weights in enumerate(model.components_):
        ranked_positions = weights.argsort()[:stop:-1]
        top_words = [feature_names[position] for position in ranked_positions]
        print("Topic %d:" % number)
        print(" ".join(top_words))
        
def get_topic_words(vectorizer, lda_model, n_words):
    """Return the n_words highest-weighted words of every topic.

    vectorizer -- fitted vectorizer exposing get_feature_names_out() or, on
                  scikit-learn releases before 1.0, get_feature_names()
    lda_model  -- fitted model exposing `components_` (one weight row per topic)
    n_words    -- number of words to return per topic

    Returns a list with one list of words per topic, highest weight first.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only when it does not exist.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    keywords = np.asarray(vocabulary)
    topic_words = []
    for topic_weights in lda_model.components_:
        # negate so that an ascending argsort lists the largest weights first
        top_word_locs = (-topic_weights).argsort()[:n_words]
        topic_words.append(keywords.take(top_word_locs).tolist())
    return topic_words

### **GenZ_tops**

In [None]:
#normalize data (normalize_corpus is not defined in the visible cells; presumably it
#comes from the %run of Text_Normalization_Function.ipynb -- TODO confirm)
#NOTE: the "_news" suffix is only a naming leftover; the corpus here is df_GenZ_tops
normalized_corpus_news = normalize_corpus(df_GenZ_tops)

#define a Bag-of-Words vectorizer limited to 1000 features
bow_vectorizer_news = CountVectorizer(max_features=1000)

#fit the vocabulary and convert the corpus into a document-term count matrix
bow_news_corpus = bow_vectorizer_news.fit_transform(normalized_corpus_news)

In [None]:
#fit a 5-topic LDA model; random_state fixes the seed so that the topics and all
#scores derived from them are the same on every run
lda_news = LatentDirichletAllocation(n_components=5, max_iter=100,
                                     doc_topic_prior = 0.25,
                                     topic_word_prior = 0.25,
                                     random_state = 42).fit(bow_news_corpus)

Display results with top 10 words for each topic:

In [None]:
no_top_words_news = 10
#get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
#whenever the installed version provides it
feature_names_news = (bow_vectorizer_news.get_feature_names_out()
                      if hasattr(bow_vectorizer_news, "get_feature_names_out")
                      else bow_vectorizer_news.get_feature_names())
display_topics(lda_news, feature_names_news, no_top_words_news)

Topic 0:
size top like look fit love order wear really color
Topic 1:
color look fit size like side top bra dress small
Topic 2:
color size small top fit love look sweater perfect wear
Topic 3:
dress love wear great fit like buy look well size
Topic 4:
size top wear love look dress shoulder fit fabric nice


Display **word vectors** (words are in alphabetical order) for each topic. Each column is a topic:

In [None]:
#divide every topic row by its total so that each topic's word weights sum to 1
word_weights = lda_news.components_ / lda_news.components_.sum(axis=1)[:, np.newaxis]
#get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
#whenever the installed version provides it
feature_names_news = (bow_vectorizer_news.get_feature_names_out()
                      if hasattr(bow_vectorizer_news, "get_feature_names_out")
                      else bow_vectorizer_news.get_feature_names())
#one column per fitted topic (taken from the matrix rather than a hard-coded 5)
word_weights_df = pd.DataFrame(word_weights.T,
                               index = feature_names_news,
                               columns = ["Topic_" + str(i) for i in range(word_weights.shape[0])])
word_weights_df.head(10)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4
0p,0.000444,0.000152,0.000119,7.5e-05,0.000171
110lbs,0.000448,0.000152,0.000119,7.2e-05,0.000166
12p,0.000248,0.000762,0.000119,7.2e-05,0.000166
30dd,5.6e-05,0.000751,0.000583,7.2e-05,0.000171
32d,5e-05,0.000152,0.000593,7.2e-05,0.00083
34aa,5.2e-05,0.001364,0.000119,7.2e-05,0.000166
34b,5.2e-05,0.00109,0.000808,7.2e-05,0.000166
34c,5e-05,0.000174,0.000119,0.00064,0.000166
34d,0.000248,0.000762,0.000594,7.2e-05,0.000166
34dd,0.000644,0.000152,0.000124,8.3e-05,0.000806


Now, **sort by word weights in Topic 0** (descending order) and see the weights of the 10 highest-weighted words in Topic 0:

In [None]:
word_weights_df.sort_values(by='Topic_0',ascending=False).head(10)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4
size,0.022213,0.011798,0.022721,0.012255,0.021895
top,0.021259,0.010164,0.017172,0.004444,0.018694
like,0.019795,0.011273,0.002535,0.014537,0.007059
look,0.01971,0.01332,0.015692,0.014115,0.014237
fit,0.018727,0.012213,0.016217,0.021485,0.012356
love,0.016005,0.00842,0.015694,0.026922,0.014487
order,0.014266,0.005811,0.006285,0.003325,0.000173
wear,0.013798,0.007175,0.013121,0.025052,0.015832
really,0.012971,0.002915,0.000124,0.004386,0.007195
color,0.012467,0.013598,0.025503,0.01183,0.003072


In [None]:
#prepare to display result in the Jupyter notebook
pyLDAvis.enable_notebook()

#run the visualization [mds is a function to use for visualizing the "distance" between topics]
pyLDAvis.sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')

In [None]:
lda_news_topic_weights = lda_news.transform(bow_news_corpus)

Let's convert lda_news_topic_weights into a nice-looking dataframe and have a look at the computed topic weights in documents:

In [None]:
#array of document "names" and topic "names" ("names" are just indices)
doc_names = ["Doc_" + str(i) for i in range(len(normalized_corpus_news))]
#one name per column of the document-topic matrix, so the labels stay correct
#if the number of topics in the model is changed
topic_names = ["Topic_" + str(i) for i in range(lda_news_topic_weights.shape[1])]

#convert to dataframe
df_document_topic = pd.DataFrame(np.round(lda_news_topic_weights, 5), columns=topic_names, index=doc_names)
df_document_topic.head(5)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4
Doc_0,0.96,0.00978,0.00995,0.01028,0.01
Doc_1,0.42946,0.00821,0.00827,0.54601,0.00805
Doc_2,0.96965,0.00728,0.00775,0.00768,0.00764
Doc_3,0.01344,0.01282,0.01338,0.28124,0.67912
Doc_4,0.01184,0.0115,0.95312,0.01189,0.01165


The topic with the highest weight in each document is a **dominant topic**. The weights across the 5 topics sum up to 1. Let's add a column that shows dominant topic for each document:

In [None]:
#vector of indices of the topic column with the highest weight in each row.
#Only the topic columns are searched: once 'dominant_topic' has been added,
#re-running this cell on the whole frame would let that integer column win the argmax.
dominant_topic = np.argmax(df_document_topic[topic_names].values, axis=1)

#add dominant_topic as a column to df_document_topic
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic.head(5)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,dominant_topic
Doc_0,0.96,0.00978,0.00995,0.01028,0.01,0
Doc_1,0.42946,0.00821,0.00827,0.54601,0.00805,3
Doc_2,0.96965,0.00728,0.00775,0.00768,0.00764,0
Doc_3,0.01344,0.01282,0.01338,0.28124,0.67912,4
Doc_4,0.01184,0.0115,0.95312,0.01189,0.01165,2


**Topic Model Evaluation: Log-likelihood, Perplexity and Coherence Scores**

Log-likelihood, Perplexity and Coherence Score are **measures of performance** for a topic model. They are used for comparing and discriminating between topic models estimated on the same data. Log-likelihood, perplexity and coherence scores **do not have** a baseline or a threshold values and therefore are useful only for comparing models. 

How do you specify different models? You can set **different number of topics** and also play with the **parameters of the Dirichlet distributions**. 

#### **Coherence Score**

We will use a function **CoherenceModel()** from the **gensim** module (you can also explore that package as it can be used to estimate an LDA model). The sklearn module does not have the functionality to compute the coherence score. Let's install the gensim package and the functions needed:

In [None]:
!{sys.executable} -m pip install gensim
import gensim

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary



The function CoherenceModel() needs as **inputs**:

**1. Dictionary of the corpus**<br>
**2. Corpus with each document represented as Bag-of-Words**<br>
**3. An array of top words for each topic: we'll have top 20 words for each topic** 
  
We will now create those objects:

In [None]:
#split every normalized document into its tokens
news_corpus_tokenized = [tokenize_text(document) for document in normalized_corpus_news]

#gensim Dictionary built from the tokenized corpus
news_dictionary = Dictionary(news_corpus_tokenized)

#every document expressed as a bag-of-words through that dictionary
news_corpus_bow = list(map(news_dictionary.doc2bow, news_corpus_tokenized))

#the 20 highest-weighted words of each topic (get_topic_words is defined in session prep)
topic_topwords = get_topic_words(bow_vectorizer_news, lda_news, n_words=20)

Now let's compute **the coherence score for the model overall**. We use one of the coherence metrics "u-mass" which measures semantic similarity of words in a topic, but there are other metrics as well.

*Note: You can check out different coherence metrics here if you are interested: https://dl.acm.org/doi/abs/10.1145/2684822.2685324*

In [None]:
cm = CoherenceModel(topics=topic_topwords, 
                    corpus = news_corpus_bow , 
                    dictionary = news_dictionary, coherence='u_mass')
print("Coherence score for the model: ", np.round(cm.get_coherence(), 5))  # get coherence value

Coherence score for the model:  -2.05573


You can also see **coherence scores by topic**:

In [None]:
print("Coherence score by topic (higher values are better): ", np.round(cm.get_coherence_per_topic(),5))

Coherence score by topic (higher values are better):  [-1.81083 -2.30274 -1.94405 -2.01139 -2.20964]


**Log-Likelihood Score**

To compute the log-likelihood score we call the **.score()** method of our fitted LDA model:

In [None]:
print("Log-Likelihood (higher values are better): ", lda_news.score(bow_news_corpus))

Log-Likelihood (higher values are better):  -79538.17305477532


**Perplexity Score**

To compute the Perplexity score we call the **.perplexity()** method of our fitted LDA model:

In [None]:
print("Perplexity (lower values are better): ", lda_news.perplexity(bow_news_corpus))

Perplexity (lower values are better):  583.2747027660083


<br>**NOTE:** Generally, you can write a simple script that selects the best topic model **automatically** based on a criterion for "best model" (log-likelihood, perplexity, or coherence score). The script can vary both parameters of the Dirichlet distributions and the number of topics, or just the number of topics.

### **GenZ_dress**

In [None]:
#normalize data (normalize_corpus is not defined in the visible cells; presumably it
#comes from the %run of Text_Normalization_Function.ipynb -- TODO confirm)
#NOTE: the "_news" suffix is only a naming leftover; the corpus here is df_GenZ_dress
normalized_corpus_news = normalize_corpus(df_GenZ_dress)

#define a Bag-of-Words vectorizer limited to 1000 features
bow_vectorizer_news = CountVectorizer(max_features=1000)

#fit the vocabulary and convert the corpus into a document-term count matrix
bow_news_corpus = bow_vectorizer_news.fit_transform(normalized_corpus_news)

In [None]:
#fit a 5-topic LDA model; random_state fixes the seed so that the topics and all
#scores derived from them are the same on every run
lda_news = LatentDirichletAllocation(n_components=5, max_iter=100,
                                     doc_topic_prior = 0.25,
                                     topic_word_prior = 0.25,
                                     random_state = 42).fit(bow_news_corpus)

Display results with top 10 words for each topic:

In [None]:
no_top_words_news = 10
#get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
#whenever the installed version provides it
feature_names_news = (bow_vectorizer_news.get_feature_names_out()
                      if hasattr(bow_vectorizer_news, "get_feature_names_out")
                      else bow_vectorizer_news.get_feature_names())
display_topics(lda_news, feature_names_news, no_top_words_news)

Topic 0:
dress look fit size like love top color wear small
Topic 1:
wear skirt size fit love look dress soft fall great
Topic 2:
dress great love fit size perfect wear color order large
Topic 3:
great fit top look quality wear shirt blouse material buy
Topic 4:
size dress like love fit look fabric even great well


Display **word vectors** (words are in alphabetical order) for each topic. Each column is a topic:

In [None]:
#divide every topic row by its total so that each topic's word weights sum to 1
word_weights = lda_news.components_ / lda_news.components_.sum(axis=1)[:, np.newaxis]
#get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
#whenever the installed version provides it
feature_names_news = (bow_vectorizer_news.get_feature_names_out()
                      if hasattr(bow_vectorizer_news, "get_feature_names_out")
                      else bow_vectorizer_news.get_feature_names())
#one column per fitted topic (taken from the matrix rather than a hard-coded 5)
word_weights_df = pd.DataFrame(word_weights.T,
                               index = feature_names_news,
                               columns = ["Topic_" + str(i) for i in range(word_weights.shape[0])])
word_weights_df.head(10)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4
34b,0.000884,0.000149,0.00015,0.000213,0.000248
34c,0.000612,0.000149,0.00015,0.000213,0.000248
34d,7.1e-05,0.000745,0.00015,0.001052,0.001243
36c,0.000339,0.000149,0.00015,0.000213,0.001245
4p,0.000612,0.000149,0.00015,0.000213,0.000248
able,0.000927,0.000149,0.001249,0.001063,0.003236
absolute,6.8e-05,0.000745,0.000167,0.001039,0.000248
absolutely,0.001932,0.001334,0.000837,0.002778,0.000248
across,0.000611,0.000744,0.002551,0.000217,0.000248
actually,0.001574,0.002089,0.000873,0.000217,0.000248


Now, **sort by word weights in Topic 0** (descending order) and see the weights of the 10 highest-weighted words in Topic 0:

In [None]:
word_weights_df.sort_values(by='Topic_0',ascending=False).head(10)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4
dress,0.033344,0.010553,0.027919,0.0068,0.019327
look,0.022567,0.012627,0.007476,0.016579,0.011093
fit,0.020077,0.019382,0.015376,0.017858,0.012228
size,0.019429,0.019597,0.0143,0.002418,0.024102
like,0.017396,0.007289,0.005911,0.005471,0.015688
love,0.017011,0.016835,0.017832,0.009648,0.013316
top,0.016652,0.007307,0.002288,0.017349,0.005544
color,0.01419,0.006704,0.011964,0.000215,0.004622
wear,0.011739,0.02711,0.013131,0.013954,0.004287
small,0.010917,0.005789,0.003862,0.007112,0.004584


In [None]:
#prepare to display result in the Jupyter notebook
pyLDAvis.enable_notebook()

#run the visualization [mds is a function to use for visualizing the "distance" between topics]
pyLDAvis.sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')

In [None]:
lda_news_topic_weights = lda_news.transform(bow_news_corpus)

Let's convert lda_news_topic_weights into a nice-looking dataframe and have a look at the computed topic weights in documents:

In [None]:
#array of document "names" and topic "names" ("names" are just indices)
doc_names = ["Doc_" + str(i) for i in range(len(normalized_corpus_news))]
#one name per column of the document-topic matrix, so the labels stay correct
#if the number of topics in the model is changed
topic_names = ["Topic_" + str(i) for i in range(lda_news_topic_weights.shape[1])]

#convert to dataframe
df_document_topic = pd.DataFrame(np.round(lda_news_topic_weights, 5), columns=topic_names, index=doc_names)
df_document_topic.head(5)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4
Doc_0,0.93357,0.01668,0.01683,0.01659,0.01634
Doc_1,0.03655,0.03736,0.0377,0.85327,0.03512
Doc_2,0.9671,0.00853,0.00824,0.00798,0.00815
Doc_3,0.96244,0.00936,0.00951,0.00933,0.00936
Doc_4,0.00735,0.0072,0.00704,0.97124,0.00717


The topic with the highest weight in each document is a **dominant topic**. The weights across the 5 topics sum up to 1. Let's add a column that shows dominant topic for each document:

In [None]:
#vector of indices of the topic column with the highest weight in each row.
#Only the topic columns are searched: once 'dominant_topic' has been added,
#re-running this cell on the whole frame would let that integer column win the argmax.
dominant_topic = np.argmax(df_document_topic[topic_names].values, axis=1)

#add dominant_topic as a column to df_document_topic
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic.head(5)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,dominant_topic
Doc_0,0.93357,0.01668,0.01683,0.01659,0.01634,0
Doc_1,0.03655,0.03736,0.0377,0.85327,0.03512,3
Doc_2,0.9671,0.00853,0.00824,0.00798,0.00815,0
Doc_3,0.96244,0.00936,0.00951,0.00933,0.00936,0
Doc_4,0.00735,0.0072,0.00704,0.97124,0.00717,3


**Topic Model Evaluation: Log-likelihood, Perplexity and Coherence Scores**

Log-likelihood, Perplexity and Coherence Score are **measures of performance** for a topic model. They are used for comparing and discriminating between topic models estimated on the same data. Log-likelihood, perplexity and coherence scores **do not have** a baseline or a threshold values and therefore are useful only for comparing models. 

How do you specify different models? You can set **different number of topics** and also play with the **parameters of the Dirichlet distributions**. 

#### **Coherence Score**

We will use a function **CoherenceModel()** from the **gensim** module (you can also explore that package as it can be used to estimate an LDA model). The sklearn module does not have the functionality to compute the coherence score. Let's install the gensim package and the functions needed:

In [None]:
!{sys.executable} -m pip install gensim
import gensim

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary



The function CoherenceModel() needs as **inputs**:

**1. Dictionary of the corpus**<br>
**2. Corpus with each document represented as Bag-of-Words**<br>
**3. An array of top words for each topic: we'll have top 20 words for each topic** 
  
We will now create those objects:

In [None]:
#split every normalized document into its tokens
news_corpus_tokenized = [tokenize_text(document) for document in normalized_corpus_news]

#gensim Dictionary built from the tokenized corpus
news_dictionary = Dictionary(news_corpus_tokenized)

#every document expressed as a bag-of-words through that dictionary
news_corpus_bow = list(map(news_dictionary.doc2bow, news_corpus_tokenized))

#the 20 highest-weighted words of each topic (get_topic_words is defined in session prep)
topic_topwords = get_topic_words(bow_vectorizer_news, lda_news, n_words=20)

Now let's compute **the coherence score for the model overall**. We use one of the coherence metrics "u-mass" which measures semantic similarity of words in a topic, but there are other metrics as well.

*Note: You can check out different coherence metrics here if you are interested: https://dl.acm.org/doi/abs/10.1145/2684822.2685324*

In [None]:
cm = CoherenceModel(topics=topic_topwords, 
                    corpus = news_corpus_bow , 
                    dictionary = news_dictionary, coherence='u_mass')
print("Coherence score for the model: ", np.round(cm.get_coherence(), 5))  # get coherence value

Coherence score for the model:  -2.5455


You can also see **coherence scores by topic**:

In [None]:
print("Coherence score by topic (higher values are better): ", np.round(cm.get_coherence_per_topic(),5))

Coherence score by topic (higher values are better):  [-1.80807 -2.4549  -1.71623 -2.51033 -4.23798]


**Log-Likelihood Score**

To compute the log-likelihood score we call the **.score()** method of our fitted LDA model:

In [None]:
print("Log-Likelihood (higher values are better): ", lda_news.score(bow_news_corpus))

Log-Likelihood (higher values are better):  -51603.49809964042


**Perplexity Score**

To compute the Perplexity score we call the **.perplexity()** method of our fitted LDA model:

In [None]:
print("Perplexity (lower values are better): ", lda_news.perplexity(bow_news_corpus))

Perplexity (lower values are better):  657.5741530459667


<br>**NOTE:** Generally, you can write a simple script that selects the best topic model **automatically** based on a criterion for "best model" (log-likelihood, perplexity, or coherence score). The script can vary both parameters of the Dirichlet distributions and the number of topics, or just the number of topics.

### **GenZ_bottoms**

In [None]:
#normalize data (normalize_corpus is not defined in the visible cells; presumably it
#comes from the %run of Text_Normalization_Function.ipynb -- TODO confirm)
#NOTE: the "_news" suffix is only a naming leftover; the corpus here is df_GenZ_bottoms
normalized_corpus_news = normalize_corpus(df_GenZ_bottoms)

#define a Bag-of-Words vectorizer limited to 1000 features
bow_vectorizer_news = CountVectorizer(max_features=1000)

#fit the vocabulary and convert the corpus into a document-term count matrix
bow_news_corpus = bow_vectorizer_news.fit_transform(normalized_corpus_news)

In [None]:
#fit a 5-topic LDA model; random_state fixes the seed so that the topics and all
#scores derived from them are the same on every run
lda_news = LatentDirichletAllocation(n_components=5, max_iter=100,
                                     doc_topic_prior = 0.25,
                                     topic_word_prior = 0.25,
                                     random_state = 42).fit(bow_news_corpus)

Display results with top 10 words for each topic:

In [None]:
no_top_words_news = 10
#get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
#whenever the installed version provides it
feature_names_news = (bow_vectorizer_news.get_feature_names_out()
                      if hasattr(bow_vectorizer_news, "get_feature_names_out")
                      else bow_vectorizer_news.get_feature_names())
display_topics(lda_news, feature_names_news, no_top_words_news)

Topic 0:
top size wear dress fit work like great color look
Topic 1:
size shirt wear look like dress white small arm cut
Topic 2:
dress fit love wear size like top perfect look color
Topic 3:
look love like color wear dress fit great really sweater
Topic 4:
fit dress love length wear size bit like perfect well


Display **word vectors** (words are in alphabetical order) for each topic. Each column is a topic:

In [None]:
#divide every topic row by its total so that each topic's word weights sum to 1
word_weights = lda_news.components_ / lda_news.components_.sum(axis=1)[:, np.newaxis]
#get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
#whenever the installed version provides it
feature_names_news = (bow_vectorizer_news.get_feature_names_out()
                      if hasattr(bow_vectorizer_news, "get_feature_names_out")
                      else bow_vectorizer_news.get_feature_names())
#one column per fitted topic (taken from the matrix rather than a hard-coded 5)
word_weights_df = pd.DataFrame(word_weights.T,
                               index = feature_names_news,
                               columns = ["Topic_" + str(i) for i in range(word_weights.shape[0])])
word_weights_df.head(10)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4
0p,0.000176,0.000321,0.000992,0.000154,0.000344
120lbs,0.000176,0.000321,0.000198,0.000154,0.001718
128lbs,0.000176,0.000321,0.000198,0.00077,0.000344
130lbs,0.000176,0.000321,0.000198,0.000154,0.001718
140lbs,0.000176,0.001607,0.000991,0.000154,0.000344
155lbs,0.000176,0.000321,0.000198,0.00077,0.000344
20s,0.000879,0.000321,0.000198,0.000154,0.000344
25p,0.000176,0.000321,0.000198,0.001386,0.000344
26p,0.000176,0.000321,0.000198,0.001386,0.000344
32a,0.000176,0.000321,0.000198,0.000154,0.001718


Now, **sort by word weights in Topic 0** (descending order) and see the weights of the 10 highest-weighted words in Topic 0:

In [None]:
word_weights_df.sort_values(by='Topic_0',ascending=False).head(10)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4
top,0.02645,0.003203,0.013828,0.010465,0.005127
size,0.019982,0.015784,0.014108,0.008337,0.009953
wear,0.016738,0.012337,0.016775,0.014853,0.010446
dress,0.014766,0.00969,0.023051,0.014681,0.013517
fit,0.014228,0.006733,0.021235,0.014215,0.020538
work,0.011625,0.002891,0.001167,0.002312,0.004461
like,0.011125,0.009965,0.014087,0.018703,0.009028
great,0.011056,0.000322,0.007344,0.014031,0.004466
color,0.009319,0.004162,0.010527,0.014909,0.003157
look,0.009143,0.010168,0.012191,0.020428,0.007998


In [None]:
#prepare to display result in the Jupyter notebook
pyLDAvis.enable_notebook()

#run the visualization [mds is a function to use for visualizing the "distance" between topics]
pyLDAvis.sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')

In [None]:
lda_news_topic_weights = lda_news.transform(bow_news_corpus)

Let's convert lda_news_topic_weights into a nice-looking dataframe and have a look at the computed topic weights in documents:

In [None]:
#array of document "names" and topic "names" ("names" are just indices)
doc_names = ["Doc_" + str(i) for i in range(len(normalized_corpus_news))]
#one name per column of the document-topic matrix, so the labels stay correct
#if the number of topics in the model is changed
topic_names = ["Topic_" + str(i) for i in range(lda_news_topic_weights.shape[1])]

#convert to dataframe
df_document_topic = pd.DataFrame(np.round(lda_news_topic_weights, 5), columns=topic_names, index=doc_names)
df_document_topic.head(5)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4
Doc_0,0.02815,0.88765,0.02751,0.02819,0.0285
Doc_1,0.00927,0.00903,0.9631,0.00932,0.00928
Doc_2,0.02541,0.02495,0.02498,0.89951,0.02516
Doc_3,0.94904,0.01283,0.01277,0.01282,0.01254
Doc_4,0.93822,0.01524,0.01539,0.01623,0.01492


The topic with the highest weight in each document is a **dominant topic**. The weights across the 5 topics sum up to 1. Let's add a column that shows dominant topic for each document:

In [None]:
#vector of indices of the topic column with the highest weight in each row.
#Only the topic columns are searched: once 'dominant_topic' has been added,
#re-running this cell on the whole frame would let that integer column win the argmax.
dominant_topic = np.argmax(df_document_topic[topic_names].values, axis=1)

#add dominant_topic as a column to df_document_topic
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic.head(5)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,dominant_topic
Doc_0,0.02815,0.88765,0.02751,0.02819,0.0285,1
Doc_1,0.00927,0.00903,0.9631,0.00932,0.00928,2
Doc_2,0.02541,0.02495,0.02498,0.89951,0.02516,3
Doc_3,0.94904,0.01283,0.01277,0.01282,0.01254,0
Doc_4,0.93822,0.01524,0.01539,0.01623,0.01492,0


**Topic Model Evaluation: Log-likelihood, Perplexity and Coherence Scores**

Log-likelihood, Perplexity and Coherence Score are **measures of performance** for a topic model. They are used for comparing and discriminating between topic models estimated on the same data. Log-likelihood, perplexity and coherence scores **do not have** a baseline or a threshold values and therefore are useful only for comparing models. 

How do you specify different models? You can set **different number of topics** and also play with the **parameters of the Dirichlet distributions**. 

#### **Coherence Score**

We will use a function **CoherenceModel()** from the **gensim** module (you can also explore that package as it can be used to estimate an LDA model). The sklearn module does not have the functionality to compute the coherence score. Let's install the gensim package and the functions needed:

In [None]:
!{sys.executable} -m pip install gensim
import gensim

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary



The function CoherenceModel() needs as **inputs**:

**1. Dictionary of the corpus**<br>
**2. Corpus with each document represented as Bag-of-Words**<br>
**3. An array of top words for each topic: we'll have top 20 words for each topic** 
  
We will now create those objects:

In [None]:
#split every normalized document into its tokens
news_corpus_tokenized = [tokenize_text(document) for document in normalized_corpus_news]

#gensim Dictionary built from the tokenized corpus
news_dictionary = Dictionary(news_corpus_tokenized)

#every document expressed as a bag-of-words through that dictionary
news_corpus_bow = list(map(news_dictionary.doc2bow, news_corpus_tokenized))

#the 20 highest-weighted words of each topic (get_topic_words is defined in session prep)
topic_topwords = get_topic_words(bow_vectorizer_news, lda_news, n_words=20)

Now let's compute **the coherence score for the model overall**. We use one of the coherence metrics "u-mass" which measures semantic similarity of words in a topic, but there are other metrics as well.

*Note: You can check out different coherence metrics here if you are interested: https://dl.acm.org/doi/abs/10.1145/2684822.2685324*

In [None]:
cm = CoherenceModel(topics=topic_topwords, 
                    corpus = news_corpus_bow , 
                    dictionary = news_dictionary, coherence='u_mass')
print("Coherence score for the model: ", np.round(cm.get_coherence(), 5))  # get coherence value

Coherence score for the model:  -2.72741


You can also see **coherence scores by topic**:

In [None]:
print("Coherence score by topic (higher values are better): ", np.round(cm.get_coherence_per_topic(),5))

Coherence score by topic (higher values are better):  [-1.85277 -3.71361 -2.25444 -1.64265 -4.17359]


**Log-Likelihood Score**

To compute the log-likelihood score we call the **.score()** method of our fitted LDA model:

In [None]:
print("Log-Likelihood (higher values are better): ", lda_news.score(bow_news_corpus))

Log-Likelihood (higher values are better):  -30244.946527880245


**Perplexity Score**

To compute the Perplexity score we call the **.perplexity()** method of our fitted LDA model:

In [None]:
print("Perplexity (lower values are better): ", lda_news.perplexity(bow_news_corpus))

Perplexity (lower values are better):  758.3988993379553


<br>**NOTE:** Generally, you can write a simple script that selects the best topic model **automatically** based on a criterion for "best model" (log-likelihood, perplexity, or coherence score). The script can vary both parameters of the Dirichlet distributions and the number of topics, or just the number of topics.

### **Millen_tops**

In [None]:
#normalize data (normalize_corpus is not defined in the visible cells; presumably it
#comes from the %run of Text_Normalization_Function.ipynb -- TODO confirm)
#NOTE: the "_news" suffix is only a naming leftover; the corpus here is df_Millen_tops
normalized_corpus_news = normalize_corpus(df_Millen_tops)

#define a Bag-of-Words vectorizer limited to 1000 features
bow_vectorizer_news = CountVectorizer(max_features=1000)

#fit the vocabulary and convert the corpus into a document-term count matrix
bow_news_corpus = bow_vectorizer_news.fit_transform(normalized_corpus_news)

In [None]:
# Fit a 5-topic LDA model on the Bag-of-Words matrix built above.
# random_state pins the random initialisation so the topics (and every score
# computed from them below) are reproducible across Restart & Run All;
# without it each run yields different topics.
lda_news = LatentDirichletAllocation(n_components=5, max_iter=100,
                                     doc_topic_prior=0.25,   # Dirichlet prior on document-topic distributions
                                     topic_word_prior=0.25,  # Dirichlet prior on topic-word distributions
                                     random_state=42).fit(bow_news_corpus)

Display results with top 10 words for each topic:

In [None]:
# Number of top words to show for each topic.
no_top_words_news = 10
# display_topics is defined outside this cell (presumably in session prep — TODO confirm).
# NOTE(review): CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; switch to get_feature_names_out() when the environment is upgraded.
display_topics(lda_news, bow_vectorizer_news.get_feature_names(), no_top_words_news)

Topic 0:
color top look like love fabric shirt blue buy white
Topic 1:
size small top fit large run order look like big
Topic 2:
wear love great jean pant comfortable fit look color perfect
Topic 3:
sweater sleeve long look like soft little short length wear
Topic 4:
dress fit size petite wear perfect love length skirt great


Display **word vectors** (words are in alphabetical order) for each topic. Each column is a topic:

In [None]:
# Normalise each topic's row of components_ so that it sums to 1.
word_weights = lda_news.components_ / lda_news.components_.sum(axis=1, keepdims=True)

# One row per vocabulary word, one column per topic. The column labels are derived
# from the matrix itself instead of a hard-coded 5, so changing n_components in the
# LDA cell cannot cause a shape mismatch here.
word_weights_df = pd.DataFrame(word_weights.T,
                               index=bow_vectorizer_news.get_feature_names(),
                               columns=["Topic_" + str(i) for i in range(word_weights.shape[0])])
word_weights_df.head(10)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4
135lbs,1.3e-05,0.000599,1.3e-05,1.8e-05,0.000173
32c,1.2e-05,0.000475,1.3e-05,1.8e-05,1.2e-05
32d,1.2e-05,0.000415,1.4e-05,1.8e-05,0.000178
34b,1.3e-05,1e-05,1.3e-05,1.9e-05,0.001972
34c,1.3e-05,0.000873,1.4e-05,1.8e-05,0.000509
34d,1.3e-05,1e-05,1.3e-05,0.000753,0.000767
34dd,1.2e-05,0.000619,1.3e-05,1.8e-05,1.2e-05
36c,1.3e-05,0.000422,1.3e-05,1.8e-05,0.000307
36dd,0.000225,1e-05,1.3e-05,1.8e-05,0.000543
able,1.3e-05,0.000806,0.000498,1.9e-05,0.002315


Now, **sort by word weights in Topic 0** (descending order) and see the weights of the 10 highest-weighted words in Topic 0:

In [None]:
# Sort the vocabulary by its Topic_0 weight (largest first) and show the top 10 rows.
word_weights_df.sort_values(by='Topic_0',ascending=False).head(10)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4
color,0.039157,0.002286,0.016418,0.005686,0.004945
top,0.023666,0.030651,0.015509,1.9e-05,1.2e-05
look,0.023471,0.017409,0.018121,0.02182,0.007608
like,0.021781,0.016779,0.006103,0.01816,0.009092
love,0.020716,0.013353,0.039344,0.008792,0.0146
fabric,0.01639,0.01153,0.003992,0.001298,0.006783
shirt,0.014597,0.0056,0.003601,0.005922,1.2e-05
blue,0.01248,9e-06,1.4e-05,1.9e-05,1.2e-05
buy,0.011869,0.003032,0.01364,1.9e-05,0.009937
white,0.011147,1e-05,0.003399,1.9e-05,1.2e-05


In [None]:
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()

# Build the interactive topic visualization; mds='tsne' selects t-SNE as the
# method used to place topics by their "distance" on the 2-D plot.
# NOTE(review): pyLDAvis.sklearn appears to have been renamed pyLDAvis.lda_model in
# pyLDAvis 3.4.0 — confirm the installed version before upgrading.
pyLDAvis.sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')

In [None]:
# Document-topic weight matrix: one row per document, one column per topic.
lda_news_topic_weights = lda_news.transform(bow_news_corpus)

Let's convert lda_news_topic_weights into a nice-looking dataframe and have a look at the computed topic weights in documents:

In [None]:
# Labels for the rows (documents) and columns (topics); the "names" are just indices.
doc_names = ["Doc_" + str(i) for i in range(len(normalized_corpus_news))]
# The topic count is read from the weight matrix instead of a hard-coded 5, so the
# labels keep matching if n_components is changed in the LDA cell.
topic_names = ["Topic_" + str(i) for i in range(lda_news_topic_weights.shape[1])]

# Convert to a DataFrame with weights rounded to 5 decimals.
df_document_topic = pd.DataFrame(np.round(lda_news_topic_weights, 5), columns=topic_names, index=doc_names)
df_document_topic.head(5)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4
Doc_0,0.26743,0.1443,0.0827,0.00972,0.49586
Doc_1,0.00677,0.1531,0.38276,0.13342,0.32395
Doc_2,0.21194,0.18288,0.27501,0.15484,0.17533
Doc_3,0.01311,0.1964,0.0125,0.397,0.38098
Doc_4,0.007,0.18169,0.00679,0.63231,0.17221


The topic with the highest weight in each document is the **dominant topic**. The weights across the 5 topics sum up to 1. Let's add a column that shows the dominant topic for each document:

In [None]:
# Index of the highest-weighted topic for each document. Only the topic-weight
# columns are passed to argmax: when this cell is re-run, df_document_topic already
# contains the integer 'dominant_topic' column, and including it would make argmax
# return that column's position (index 5) instead of a real topic.
dominant_topic = np.argmax(df_document_topic[topic_names].values, axis=1)

# Add dominant_topic as a column to df_document_topic.
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic.head(5)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,dominant_topic
Doc_0,0.26743,0.1443,0.0827,0.00972,0.49586,4
Doc_1,0.00677,0.1531,0.38276,0.13342,0.32395,2
Doc_2,0.21194,0.18288,0.27501,0.15484,0.17533,2
Doc_3,0.01311,0.1964,0.0125,0.397,0.38098,3
Doc_4,0.007,0.18169,0.00679,0.63231,0.17221,3


**Topic Model Evaluation: Log-likelihood, Perplexity and Coherence Scores**

Log-likelihood, Perplexity and Coherence Score are **measures of performance** for a topic model. They are used for comparing and discriminating between topic models estimated on the same data. Log-likelihood, perplexity and coherence scores **do not have** a baseline or a threshold values and therefore are useful only for comparing models. 

How do you specify different models? You can set **different number of topics** and also play with the **parameters of the Dirichlet distributions**. 

#### **Coherence Score**

We will use a function **CoherenceModel()** from the **gensim** module (you can also explore that package as it can be used to estimate an LDA model). The sklearn module does not have the functionality to compute the coherence score. Let's install the gensim package and the functions needed:

In [None]:
!{sys.executable} -m pip install gensim
import gensim

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary



The function CoherenceModel() needs as **inputs**:

**1. Dictionary of the corpus**<br>
**2. Corpus with each document represented as Bag-of-Words**<br>
**3. An array of top words for each topic: we'll have top 20 words for each topic** 
  
We will now create those objects:

In [None]:
# Tokenize every normalized document (tokenize_text is defined outside this cell).
# Iterating over the corpus directly replaces the range(len(...)) index loop.
news_corpus_tokenized = [tokenize_text(doc) for doc in normalized_corpus_news]

# gensim Dictionary built from the tokenized corpus.
news_dictionary = Dictionary(news_corpus_tokenized)

# Bag-of-Words representation of each document, as produced by Dictionary.doc2bow.
news_corpus_bow = [news_dictionary.doc2bow(doc) for doc in news_corpus_tokenized]

# Top 20 words for each topic (using the function defined in session prep).
topic_topwords = get_topic_words(vectorizer = bow_vectorizer_news, lda_model = lda_news, n_words=20)

Now let's compute **the coherence score for the model overall**. We use one of the coherence metrics "u-mass" which measures semantic similarity of words in a topic, but there are other metrics as well.

*Note: You can check out different coherence metrics here if you are interested: https://dl.acm.org/doi/abs/10.1145/2684822.2685324*

In [None]:
# Build a gensim CoherenceModel from the per-topic top words, the gensim
# Bag-of-Words corpus and its dictionary, using the 'u_mass' metric.
cm = CoherenceModel(topics=topic_topwords, 
                    corpus = news_corpus_bow , 
                    dictionary = news_dictionary, coherence='u_mass')
# Overall coherence of the model, rounded to 5 decimals.
print("Coherence score for the model: ", np.round(cm.get_coherence(), 5))  # get coherence value

Coherence score for the model:  -1.99131


You can also see **coherence scores by topic**:

In [None]:
# Per-topic coherence from the CoherenceModel built above, rounded to 5 decimals.
print("Coherence score by topic (higher values are better): ", np.round(cm.get_coherence_per_topic(),5))

Coherence score by topic (higher values are better):  [-2.19056 -1.78003 -2.19029 -1.99035 -1.80531]


**Log-Likelihood Score**

To compute the log-likelihood score we call the **.score()** method of our fitted LDA model:

In [None]:
# score() returns scikit-learn's approximate log-likelihood of the data under the fitted model.
print("Log-Likelihood (higher values are better): ", lda_news.score(bow_news_corpus))

Log-Likelihood (higher values are better):  -622660.6536874948


**Perplexity Score**

To compute the perplexity score we call the **.perplexity()** method of our fitted LDA model:

In [None]:
# perplexity() returns scikit-learn's approximate perplexity of the data under the fitted model.
print("Perplexity (lower values are better): ", lda_news.perplexity(bow_news_corpus))

Perplexity (lower values are better):  444.14405115823604


<br>**NOTE:** Generally, you can write a simple script that selects the best topic model **automatically** based on a criterion for "best model" (log-likelihood, perplexity, or coherence score). The script can vary both parameters of the Dirichlet distributions and the number of topics, or just the number of topics.

### **Millen_dress**

In [None]:
# Normalize the df_Millen_dress data (normalize_corpus is defined outside this cell).
# NOTE: the *_news names below are reassigned by every section of this notebook,
# so run each section top to bottom before reading its results.
normalized_corpus_news = normalize_corpus(df_Millen_dress)

# Define a Bag-of-Words vectorizer limited to the 1000 most frequent terms.
bow_vectorizer_news = CountVectorizer(max_features=1000)

# Learn the vocabulary and vectorize the data into a document-term count matrix.
bow_news_corpus = bow_vectorizer_news.fit_transform(normalized_corpus_news)

In [None]:
# Fit a 5-topic LDA model on the Bag-of-Words matrix built above.
# random_state pins the random initialisation so the topics (and every score
# computed from them below) are reproducible across Restart & Run All;
# without it each run yields different topics.
lda_news = LatentDirichletAllocation(n_components=5, max_iter=100,
                                     doc_topic_prior=0.25,   # Dirichlet prior on document-topic distributions
                                     topic_word_prior=0.25,  # Dirichlet prior on topic-word distributions
                                     random_state=42).fit(bow_news_corpus)

Display results with top 10 words for each topic:

In [None]:
# Number of top words to show for each topic.
no_top_words_news = 10
# display_topics is defined outside this cell (presumably in session prep — TODO confirm).
# NOTE(review): CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; switch to get_feature_names_out() when the environment is upgraded.
display_topics(lda_news, bow_vectorizer_news.get_feature_names(), no_top_words_news)

Display **word vectors** (words are in alphabetical order) for each topic. Each column is a topic:

In [None]:
# Normalise each topic's row of components_ so that it sums to 1.
word_weights = lda_news.components_ / lda_news.components_.sum(axis=1, keepdims=True)

# One row per vocabulary word, one column per topic. The column labels are derived
# from the matrix itself instead of a hard-coded 5, so changing n_components in the
# LDA cell cannot cause a shape mismatch here.
word_weights_df = pd.DataFrame(word_weights.T,
                               index=bow_vectorizer_news.get_feature_names(),
                               columns=["Topic_" + str(i) for i in range(word_weights.shape[0])])
word_weights_df.head(10)

Now, **sort by word weights in Topic 0** (descending order) and see the weights of the 10 highest-weighted words in Topic 0:

In [None]:
# Sort the vocabulary by its Topic_0 weight (largest first) and show the top 10 rows.
word_weights_df.sort_values(by='Topic_0',ascending=False).head(10)

In [None]:
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()

# Build the interactive topic visualization; mds='tsne' selects t-SNE as the
# method used to place topics by their "distance" on the 2-D plot.
# NOTE(review): pyLDAvis.sklearn appears to have been renamed pyLDAvis.lda_model in
# pyLDAvis 3.4.0 — confirm the installed version before upgrading.
pyLDAvis.sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')

In [None]:
# Document-topic weight matrix: one row per document, one column per topic.
lda_news_topic_weights = lda_news.transform(bow_news_corpus)

Let's convert lda_news_topic_weights into a nice-looking dataframe and have a look at the computed topic weights in documents:

In [None]:
# Labels for the rows (documents) and columns (topics); the "names" are just indices.
doc_names = ["Doc_" + str(i) for i in range(len(normalized_corpus_news))]
# The topic count is read from the weight matrix instead of a hard-coded 5, so the
# labels keep matching if n_components is changed in the LDA cell.
topic_names = ["Topic_" + str(i) for i in range(lda_news_topic_weights.shape[1])]

# Convert to a DataFrame with weights rounded to 5 decimals.
df_document_topic = pd.DataFrame(np.round(lda_news_topic_weights, 5), columns=topic_names, index=doc_names)
df_document_topic.head(5)

The topic with the highest weight in each document is the **dominant topic**. The weights across the 5 topics sum up to 1. Let's add a column that shows the dominant topic for each document:

In [None]:
# Index of the highest-weighted topic for each document. Only the topic-weight
# columns are passed to argmax: when this cell is re-run, df_document_topic already
# contains the integer 'dominant_topic' column, and including it would make argmax
# return that column's position (index 5) instead of a real topic.
dominant_topic = np.argmax(df_document_topic[topic_names].values, axis=1)

# Add dominant_topic as a column to df_document_topic.
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic.head(5)

**Topic Model Evaluation: Log-likelihood, Perplexity and Coherence Scores**

Log-likelihood, Perplexity and Coherence Score are **measures of performance** for a topic model. They are used for comparing and discriminating between topic models estimated on the same data. Log-likelihood, perplexity and coherence scores **do not have** a baseline or a threshold values and therefore are useful only for comparing models. 

How do you specify different models? You can set **different number of topics** and also play with the **parameters of the Dirichlet distributions**. 

#### **Coherence Score**

We will use a function **CoherenceModel()** from the **gensim** module (you can also explore that package as it can be used to estimate an LDA model). The sklearn module does not have the functionality to compute the coherence score. Let's install the gensim package and the functions needed:

In [None]:
!{sys.executable} -m pip install gensim
import gensim

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

The function CoherenceModel() needs as **inputs**:

**1. Dictionary of the corpus**<br>
**2. Corpus with each document represented as Bag-of-Words**<br>
**3. An array of top words for each topic: we'll have top 20 words for each topic** 
  
We will now create those objects:

In [None]:
# Tokenize every normalized document (tokenize_text is defined outside this cell).
# Iterating over the corpus directly replaces the range(len(...)) index loop.
news_corpus_tokenized = [tokenize_text(doc) for doc in normalized_corpus_news]

# gensim Dictionary built from the tokenized corpus.
news_dictionary = Dictionary(news_corpus_tokenized)

# Bag-of-Words representation of each document, as produced by Dictionary.doc2bow.
news_corpus_bow = [news_dictionary.doc2bow(doc) for doc in news_corpus_tokenized]

# Top 20 words for each topic (using the function defined in session prep).
topic_topwords = get_topic_words(vectorizer = bow_vectorizer_news, lda_model = lda_news, n_words=20)

Now let's compute **the coherence score for the model overall**. We use one of the coherence metrics "u-mass" which measures semantic similarity of words in a topic, but there are other metrics as well.

*Note: You can check out different coherence metrics here if you are interested: https://dl.acm.org/doi/abs/10.1145/2684822.2685324*

In [None]:
# Build a gensim CoherenceModel from the per-topic top words, the gensim
# Bag-of-Words corpus and its dictionary, using the 'u_mass' metric.
cm = CoherenceModel(topics=topic_topwords, 
                    corpus = news_corpus_bow , 
                    dictionary = news_dictionary, coherence='u_mass')
# Overall coherence of the model, rounded to 5 decimals.
print("Coherence score for the model: ", np.round(cm.get_coherence(), 5))  # get coherence value

You can also see **coherence scores by topic**:

In [None]:
# Per-topic coherence from the CoherenceModel built above, rounded to 5 decimals.
print("Coherence score by topic (higher values are better): ", np.round(cm.get_coherence_per_topic(),5))

**Log-Likelihood Score**

To compute the log-likelihood score we call the **.score()** method of our fitted LDA model:

In [None]:
# score() returns scikit-learn's approximate log-likelihood of the data under the fitted model.
print("Log-Likelihood (higher values are better): ", lda_news.score(bow_news_corpus))

**Perplexity Score**

To compute the perplexity score we call the **.perplexity()** method of our fitted LDA model:

In [None]:
# perplexity() returns scikit-learn's approximate perplexity of the data under the fitted model.
print("Perplexity (lower values are better): ", lda_news.perplexity(bow_news_corpus))

<br>**NOTE:** Generally, you can write a simple script that selects the best topic model **automatically** based on a criterion for "best model" (log-likelihood, perplexity, or coherence score). The script can vary both parameters of the Dirichlet distributions and the number of topics, or just the number of topics.

### **Millen_bottoms**

In [None]:
# Normalize the df_Millen_bottoms data (normalize_corpus is defined outside this cell).
# NOTE: the *_news names below are reassigned by every section of this notebook,
# so run each section top to bottom before reading its results.
normalized_corpus_news = normalize_corpus(df_Millen_bottoms)

# Define a Bag-of-Words vectorizer limited to the 1000 most frequent terms.
bow_vectorizer_news = CountVectorizer(max_features=1000)

# Learn the vocabulary and vectorize the data into a document-term count matrix.
bow_news_corpus = bow_vectorizer_news.fit_transform(normalized_corpus_news)

In [None]:
# Fit a 5-topic LDA model on the Bag-of-Words matrix built above.
# random_state pins the random initialisation so the topics (and every score
# computed from them below) are reproducible across Restart & Run All;
# without it each run yields different topics.
lda_news = LatentDirichletAllocation(n_components=5, max_iter=100,
                                     doc_topic_prior=0.25,   # Dirichlet prior on document-topic distributions
                                     topic_word_prior=0.25,  # Dirichlet prior on topic-word distributions
                                     random_state=42).fit(bow_news_corpus)

Display results with top 10 words for each topic:

In [None]:
# Number of top words to show for each topic.
no_top_words_news = 10
# display_topics is defined outside this cell (presumably in session prep — TODO confirm).
# NOTE(review): CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; switch to get_feature_names_out() when the environment is upgraded.
display_topics(lda_news, bow_vectorizer_news.get_feature_names(), no_top_words_news)

Display **word vectors** (words are in alphabetical order) for each topic. Each column is a topic:

In [None]:
# Normalise each topic's row of components_ so that it sums to 1.
word_weights = lda_news.components_ / lda_news.components_.sum(axis=1, keepdims=True)

# One row per vocabulary word, one column per topic. The column labels are derived
# from the matrix itself instead of a hard-coded 5, so changing n_components in the
# LDA cell cannot cause a shape mismatch here.
word_weights_df = pd.DataFrame(word_weights.T,
                               index=bow_vectorizer_news.get_feature_names(),
                               columns=["Topic_" + str(i) for i in range(word_weights.shape[0])])
word_weights_df.head(10)

Now, **sort by word weights in Topic 0** (descending order) and see the weights of the 10 highest-weighted words in Topic 0:

In [None]:
# Sort the vocabulary by its Topic_0 weight (largest first) and show the top 10 rows.
word_weights_df.sort_values(by='Topic_0',ascending=False).head(10)

In [None]:
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()

# Build the interactive topic visualization; mds='tsne' selects t-SNE as the
# method used to place topics by their "distance" on the 2-D plot.
# NOTE(review): pyLDAvis.sklearn appears to have been renamed pyLDAvis.lda_model in
# pyLDAvis 3.4.0 — confirm the installed version before upgrading.
pyLDAvis.sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')

In [None]:
# Document-topic weight matrix: one row per document, one column per topic.
lda_news_topic_weights = lda_news.transform(bow_news_corpus)

Let's convert lda_news_topic_weights into a nice-looking dataframe and have a look at the computed topic weights in documents:

In [None]:
# Labels for the rows (documents) and columns (topics); the "names" are just indices.
doc_names = ["Doc_" + str(i) for i in range(len(normalized_corpus_news))]
# The topic count is read from the weight matrix instead of a hard-coded 5, so the
# labels keep matching if n_components is changed in the LDA cell.
topic_names = ["Topic_" + str(i) for i in range(lda_news_topic_weights.shape[1])]

# Convert to a DataFrame with weights rounded to 5 decimals.
df_document_topic = pd.DataFrame(np.round(lda_news_topic_weights, 5), columns=topic_names, index=doc_names)
df_document_topic.head(5)

The topic with the highest weight in each document is the **dominant topic**. The weights across the 5 topics sum up to 1. Let's add a column that shows the dominant topic for each document:

In [None]:
# Index of the highest-weighted topic for each document. Only the topic-weight
# columns are passed to argmax: when this cell is re-run, df_document_topic already
# contains the integer 'dominant_topic' column, and including it would make argmax
# return that column's position (index 5) instead of a real topic.
dominant_topic = np.argmax(df_document_topic[topic_names].values, axis=1)

# Add dominant_topic as a column to df_document_topic.
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic.head(5)

**Topic Model Evaluation: Log-likelihood, Perplexity and Coherence Scores**

Log-likelihood, Perplexity and Coherence Score are **measures of performance** for a topic model. They are used for comparing and discriminating between topic models estimated on the same data. Log-likelihood, perplexity and coherence scores **do not have** a baseline or a threshold values and therefore are useful only for comparing models. 

How do you specify different models? You can set **different number of topics** and also play with the **parameters of the Dirichlet distributions**. 

#### **Coherence Score**

We will use a function **CoherenceModel()** from the **gensim** module (you can also explore that package as it can be used to estimate an LDA model). The sklearn module does not have the functionality to compute the coherence score. Let's install the gensim package and the functions needed:

In [None]:
!{sys.executable} -m pip install gensim
import gensim

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

The function CoherenceModel() needs as **inputs**:

**1. Dictionary of the corpus**<br>
**2. Corpus with each document represented as Bag-of-Words**<br>
**3. An array of top words for each topic: we'll have top 20 words for each topic** 
  
We will now create those objects:

In [None]:
# Tokenize every normalized document (tokenize_text is defined outside this cell).
# Iterating over the corpus directly replaces the range(len(...)) index loop.
news_corpus_tokenized = [tokenize_text(doc) for doc in normalized_corpus_news]

# gensim Dictionary built from the tokenized corpus.
news_dictionary = Dictionary(news_corpus_tokenized)

# Bag-of-Words representation of each document, as produced by Dictionary.doc2bow.
news_corpus_bow = [news_dictionary.doc2bow(doc) for doc in news_corpus_tokenized]

# Top 20 words for each topic (using the function defined in session prep).
topic_topwords = get_topic_words(vectorizer = bow_vectorizer_news, lda_model = lda_news, n_words=20)

Now let's compute **the coherence score for the model overall**. We use one of the coherence metrics "u-mass" which measures semantic similarity of words in a topic, but there are other metrics as well.

*Note: You can check out different coherence metrics here if you are interested: https://dl.acm.org/doi/abs/10.1145/2684822.2685324*

In [None]:
# Build a gensim CoherenceModel from the per-topic top words, the gensim
# Bag-of-Words corpus and its dictionary, using the 'u_mass' metric.
cm = CoherenceModel(topics=topic_topwords, 
                    corpus = news_corpus_bow , 
                    dictionary = news_dictionary, coherence='u_mass')
# Overall coherence of the model, rounded to 5 decimals.
print("Coherence score for the model: ", np.round(cm.get_coherence(), 5))  # get coherence value

You can also see **coherence scores by topic**:

In [None]:
# Per-topic coherence from the CoherenceModel built above, rounded to 5 decimals.
print("Coherence score by topic (higher values are better): ", np.round(cm.get_coherence_per_topic(),5))

**Log-Likelihood Score**

To compute the log-likelihood score we call the **.score()** method of our fitted LDA model:

In [None]:
# score() returns scikit-learn's approximate log-likelihood of the data under the fitted model.
print("Log-Likelihood (higher values are better): ", lda_news.score(bow_news_corpus))

**Perplexity Score**

To compute the perplexity score we call the **.perplexity()** method of our fitted LDA model:

In [None]:
# perplexity() returns scikit-learn's approximate perplexity of the data under the fitted model.
print("Perplexity (lower values are better): ", lda_news.perplexity(bow_news_corpus))

<br>**NOTE:** Generally, you can write a simple script that selects the best topic model **automatically** based on a criterion for "best model" (log-likelihood, perplexity, or coherence score). The script can vary both parameters of the Dirichlet distributions and the number of topics, or just the number of topics.

### **GenX_tops**

In [None]:
# Normalize the df_GenX_tops data (normalize_corpus is defined outside this cell).
# NOTE: the *_news names below are reassigned by every section of this notebook,
# so run each section top to bottom before reading its results.
normalized_corpus_news = normalize_corpus(df_GenX_tops)

# Define a Bag-of-Words vectorizer limited to the 1000 most frequent terms.
bow_vectorizer_news = CountVectorizer(max_features=1000)

# Learn the vocabulary and vectorize the data into a document-term count matrix.
bow_news_corpus = bow_vectorizer_news.fit_transform(normalized_corpus_news)

In [None]:
# Fit a 5-topic LDA model on the Bag-of-Words matrix built above.
# random_state pins the random initialisation so the topics (and every score
# computed from them below) are reproducible across Restart & Run All;
# without it each run yields different topics.
lda_news = LatentDirichletAllocation(n_components=5, max_iter=100,
                                     doc_topic_prior=0.25,   # Dirichlet prior on document-topic distributions
                                     topic_word_prior=0.25,  # Dirichlet prior on topic-word distributions
                                     random_state=42).fit(bow_news_corpus)

Display results with top 10 words for each topic:

In [None]:
# Number of top words to show for each topic.
no_top_words_news = 10
# display_topics is defined outside this cell (presumably in session prep — TODO confirm).
# NOTE(review): CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; switch to get_feature_names_out() when the environment is upgraded.
display_topics(lda_news, bow_vectorizer_news.get_feature_names(), no_top_words_news)

Display **word vectors** (words are in alphabetical order) for each topic. Each column is a topic:

In [None]:
# Normalise each topic's row of components_ so that it sums to 1.
word_weights = lda_news.components_ / lda_news.components_.sum(axis=1, keepdims=True)

# One row per vocabulary word, one column per topic. The column labels are derived
# from the matrix itself instead of a hard-coded 5, so changing n_components in the
# LDA cell cannot cause a shape mismatch here.
word_weights_df = pd.DataFrame(word_weights.T,
                               index=bow_vectorizer_news.get_feature_names(),
                               columns=["Topic_" + str(i) for i in range(word_weights.shape[0])])
word_weights_df.head(10)

Now, **sort by word weights in Topic 0** (descending order) and see the weights of the 10 highest-weighted words in Topic 0:

In [None]:
# Sort the vocabulary by its Topic_0 weight (largest first) and show the top 10 rows.
word_weights_df.sort_values(by='Topic_0',ascending=False).head(10)

In [None]:
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()

# Build the interactive topic visualization; mds='tsne' selects t-SNE as the
# method used to place topics by their "distance" on the 2-D plot.
# NOTE(review): pyLDAvis.sklearn appears to have been renamed pyLDAvis.lda_model in
# pyLDAvis 3.4.0 — confirm the installed version before upgrading.
pyLDAvis.sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')

In [None]:
# Document-topic weight matrix: one row per document, one column per topic.
lda_news_topic_weights = lda_news.transform(bow_news_corpus)

Let's convert lda_news_topic_weights into a nice-looking dataframe and have a look at the computed topic weights in documents:

In [None]:
# Labels for the rows (documents) and columns (topics); the "names" are just indices.
doc_names = ["Doc_" + str(i) for i in range(len(normalized_corpus_news))]
# The topic count is read from the weight matrix instead of a hard-coded 5, so the
# labels keep matching if n_components is changed in the LDA cell.
topic_names = ["Topic_" + str(i) for i in range(lda_news_topic_weights.shape[1])]

# Convert to a DataFrame with weights rounded to 5 decimals.
df_document_topic = pd.DataFrame(np.round(lda_news_topic_weights, 5), columns=topic_names, index=doc_names)
df_document_topic.head(5)

The topic with the highest weight in each document is the **dominant topic**. The weights across the 5 topics sum up to 1. Let's add a column that shows the dominant topic for each document:

In [None]:
# Index of the highest-weighted topic for each document. Only the topic-weight
# columns are passed to argmax: when this cell is re-run, df_document_topic already
# contains the integer 'dominant_topic' column, and including it would make argmax
# return that column's position (index 5) instead of a real topic.
dominant_topic = np.argmax(df_document_topic[topic_names].values, axis=1)

# Add dominant_topic as a column to df_document_topic.
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic.head(5)

**Topic Model Evaluation: Log-likelihood, Perplexity and Coherence Scores**

Log-likelihood, Perplexity and Coherence Score are **measures of performance** for a topic model. They are used for comparing and discriminating between topic models estimated on the same data. Log-likelihood, perplexity and coherence scores **do not have** a baseline or a threshold values and therefore are useful only for comparing models. 

How do you specify different models? You can set **different number of topics** and also play with the **parameters of the Dirichlet distributions**. 

#### **Coherence Score**

We will use a function **CoherenceModel()** from the **gensim** module (you can also explore that package as it can be used to estimate an LDA model). The sklearn module does not have the functionality to compute the coherence score. Let's install the gensim package and the functions needed:

In [None]:
!{sys.executable} -m pip install gensim
import gensim

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

The function CoherenceModel() needs as **inputs**:

**1. Dictionary of the corpus**<br>
**2. Corpus with each document represented as Bag-of-Words**<br>
**3. An array of top words for each topic: we'll have top 20 words for each topic** 
  
We will now create those objects:

In [None]:
# Tokenize every normalized document (tokenize_text is defined outside this cell).
# Iterating over the corpus directly replaces the range(len(...)) index loop.
news_corpus_tokenized = [tokenize_text(doc) for doc in normalized_corpus_news]

# gensim Dictionary built from the tokenized corpus.
news_dictionary = Dictionary(news_corpus_tokenized)

# Bag-of-Words representation of each document, as produced by Dictionary.doc2bow.
news_corpus_bow = [news_dictionary.doc2bow(doc) for doc in news_corpus_tokenized]

# Top 20 words for each topic (using the function defined in session prep).
topic_topwords = get_topic_words(vectorizer = bow_vectorizer_news, lda_model = lda_news, n_words=20)

Now let's compute **the coherence score for the model overall**. We use one of the coherence metrics "u-mass" which measures semantic similarity of words in a topic, but there are other metrics as well.

*Note: You can check out different coherence metrics here if you are interested: https://dl.acm.org/doi/abs/10.1145/2684822.2685324*

In [None]:
# Build a gensim CoherenceModel from the per-topic top words, the gensim
# Bag-of-Words corpus and its dictionary, using the 'u_mass' metric.
cm = CoherenceModel(topics=topic_topwords, 
                    corpus = news_corpus_bow , 
                    dictionary = news_dictionary, coherence='u_mass')
# Overall coherence of the model, rounded to 5 decimals.
print("Coherence score for the model: ", np.round(cm.get_coherence(), 5))  # get coherence value

You can also see **coherence scores by topic**:

In [None]:
# Per-topic coherence from the CoherenceModel built above, rounded to 5 decimals.
print("Coherence score by topic (higher values are better): ", np.round(cm.get_coherence_per_topic(),5))

**Log-Likelihood Score**

To compute the log-likelihood score we call the **.score()** method of our fitted LDA model:

In [None]:
# score() returns scikit-learn's approximate log-likelihood of the data under the fitted model.
print("Log-Likelihood (higher values are better): ", lda_news.score(bow_news_corpus))

**Perplexity Score**

To compute the perplexity score we call the **.perplexity()** method of our fitted LDA model:

In [None]:
# perplexity() returns scikit-learn's approximate perplexity of the data under the fitted model.
print("Perplexity (lower values are better): ", lda_news.perplexity(bow_news_corpus))

<br>**NOTE:** Generally, you can write a simple script that selects the best topic model **automatically** based on a criterion for "best model" (log-likelihood, perplexity, or coherence score). The script can vary both parameters of the Dirichlet distributions and the number of topics, or just the number of topics.

### **GenX_dress**

In [None]:
# Normalize the df_GenX_dress data (normalize_corpus is defined outside this cell).
# NOTE: the *_news names below are reassigned by every section of this notebook,
# so run each section top to bottom before reading its results.
normalized_corpus_news = normalize_corpus(df_GenX_dress)

# Define a Bag-of-Words vectorizer limited to the 1000 most frequent terms.
bow_vectorizer_news = CountVectorizer(max_features=1000)

# Learn the vocabulary and vectorize the data into a document-term count matrix.
bow_news_corpus = bow_vectorizer_news.fit_transform(normalized_corpus_news)

In [None]:
# Fit a 5-topic LDA model on the Bag-of-Words matrix built above.
# random_state pins the random initialisation so the topics (and every score
# computed from them below) are reproducible across Restart & Run All;
# without it each run yields different topics.
lda_news = LatentDirichletAllocation(n_components=5, max_iter=100,
                                     doc_topic_prior=0.25,   # Dirichlet prior on document-topic distributions
                                     topic_word_prior=0.25,  # Dirichlet prior on topic-word distributions
                                     random_state=42).fit(bow_news_corpus)

Display results with top 10 words for each topic:

In [None]:
# Number of top words to show for each topic.
no_top_words_news = 10
# display_topics is defined outside this cell (presumably in session prep — TODO confirm).
# NOTE(review): CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; switch to get_feature_names_out() when the environment is upgraded.
display_topics(lda_news, bow_vectorizer_news.get_feature_names(), no_top_words_news)

Display **word vectors** (words are in alphabetical order) for each topic. Each column is a topic:

In [None]:
# Normalise each topic's row of components_ so that it sums to 1.
word_weights = lda_news.components_ / lda_news.components_.sum(axis=1, keepdims=True)

# One row per vocabulary word, one column per topic. The column labels are derived
# from the matrix itself instead of a hard-coded 5, so changing n_components in the
# LDA cell cannot cause a shape mismatch here.
word_weights_df = pd.DataFrame(word_weights.T,
                               index=bow_vectorizer_news.get_feature_names(),
                               columns=["Topic_" + str(i) for i in range(word_weights.shape[0])])
word_weights_df.head(10)

Now, **sort by word weights in Topic 0** (descending order) and see the weights of the 10 highest-weighted words in Topic 0:

In [None]:
# Sort the vocabulary by its Topic_0 weight (largest first) and show the top 10 rows.
word_weights_df.sort_values(by='Topic_0',ascending=False).head(10)

In [None]:
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()

# Build the interactive topic visualization; mds='tsne' selects t-SNE as the
# method used to place topics by their "distance" on the 2-D plot.
# NOTE(review): pyLDAvis.sklearn appears to have been renamed pyLDAvis.lda_model in
# pyLDAvis 3.4.0 — confirm the installed version before upgrading.
pyLDAvis.sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')

In [None]:
# Document-topic weight matrix: one row per document, one column per topic.
lda_news_topic_weights = lda_news.transform(bow_news_corpus)

Let's convert lda_news_topic_weights into a nice-looking dataframe and have a look at the computed topic weights in documents:

In [None]:
# Labels for the rows (documents) and columns (topics); the "names" are just indices.
doc_names = ["Doc_" + str(i) for i in range(len(normalized_corpus_news))]
# The topic count is read from the weight matrix instead of a hard-coded 5, so the
# labels keep matching if n_components is changed in the LDA cell.
topic_names = ["Topic_" + str(i) for i in range(lda_news_topic_weights.shape[1])]

# Convert to a DataFrame with weights rounded to 5 decimals.
df_document_topic = pd.DataFrame(np.round(lda_news_topic_weights, 5), columns=topic_names, index=doc_names)
df_document_topic.head(5)

The topic with the highest weight in each document is the **dominant topic**. The weights across the 5 topics sum up to 1. Let's add a column that shows the dominant topic for each document:

In [None]:
# Index of the highest-weighted topic for each document. Only the topic-weight
# columns are passed to argmax: when this cell is re-run, df_document_topic already
# contains the integer 'dominant_topic' column, and including it would make argmax
# return that column's position (index 5) instead of a real topic.
dominant_topic = np.argmax(df_document_topic[topic_names].values, axis=1)

# Add dominant_topic as a column to df_document_topic.
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic.head(5)

**Topic Model Evaluation: Log-likelihood, Perplexity and Coherence Scores**

Log-likelihood, Perplexity and Coherence Score are **measures of performance** for a topic model. They are used for comparing and discriminating between topic models estimated on the same data. Log-likelihood, perplexity and coherence scores **do not have** a baseline or a threshold values and therefore are useful only for comparing models. 

How do you specify different models? You can set **different number of topics** and also play with the **parameters of the Dirichlet distributions**. 

#### **Coherence Score**

We will use a function **CoherenceModel()** from the **gensim** module (you can also explore that package as it can be used to estimate an LDA model). The sklearn module does not have the functionality to compute the coherence score. Let's install the gensim package and the functions needed:

In [None]:
!{sys.executable} -m pip install gensim
import gensim

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

The function CoherenceModel() needs as **inputs**:

**1. Dictionary of the corpus**<br>
**2. Corpus with each document represented as Bag-of-Words**<br>
**3. An array of top words for each topic: we'll have top 20 words for each topic** 
  
We will now create those objects:

In [None]:
# Tokenize every normalized document (tokenize_text is defined outside this cell).
# Iterating over the corpus directly replaces the range(len(...)) index loop.
news_corpus_tokenized = [tokenize_text(doc) for doc in normalized_corpus_news]

# gensim Dictionary built from the tokenized corpus.
news_dictionary = Dictionary(news_corpus_tokenized)

# Bag-of-Words representation of each document, as produced by Dictionary.doc2bow.
news_corpus_bow = [news_dictionary.doc2bow(doc) for doc in news_corpus_tokenized]

# Top 20 words for each topic (using the function defined in session prep).
topic_topwords = get_topic_words(vectorizer = bow_vectorizer_news, lda_model = lda_news, n_words=20)

Now let's compute **the coherence score for the model overall**. We use one of the coherence metrics "u-mass" which measures semantic similarity of words in a topic, but there are other metrics as well.

*Note: You can check out different coherence metrics here if you are interested: https://dl.acm.org/doi/abs/10.1145/2684822.2685324*

In [None]:
# Build a gensim CoherenceModel from the per-topic top words, the gensim
# Bag-of-Words corpus and its dictionary, using the 'u_mass' metric.
cm = CoherenceModel(topics=topic_topwords, 
                    corpus = news_corpus_bow , 
                    dictionary = news_dictionary, coherence='u_mass')
# Overall coherence of the model, rounded to 5 decimals.
print("Coherence score for the model: ", np.round(cm.get_coherence(), 5))  # get coherence value

You can also see **coherence scores by topic**:

In [None]:
# Per-topic coherence from the CoherenceModel built above, rounded to 5 decimals.
print("Coherence score by topic (higher values are better): ", np.round(cm.get_coherence_per_topic(),5))

**Log-Likelihood Score**

To compute the log-likelihood score we call the **.score()** method of our fitted LDA model:

In [None]:
# score() returns scikit-learn's approximate log-likelihood of the data under the fitted model.
print("Log-Likelihood (higher values are better): ", lda_news.score(bow_news_corpus))

**Perplexity Score**

To compute the perplexity score we call the **.perplexity()** method of our fitted LDA model:

In [None]:
# perplexity() returns scikit-learn's approximate perplexity of the data under the fitted model.
print("Perplexity (lower values are better): ", lda_news.perplexity(bow_news_corpus))

<br>**NOTE:** Generally, you can write a simple script that selects the best topic model **automatically** based on a criterion for "best model" (log-likelihood, perplexity, or coherence score). The script can vary both parameters of the Dirichlet distributions and the number of topics, or just the number of topics.

### **GenX_bottoms**

In [None]:
# Bag-of-Words vectorizer capped at 1000 vocabulary features
bow_vectorizer_news = CountVectorizer(max_features=1000)

# normalize the GenX_bottoms data with the session-prep helper
normalized_corpus_news = normalize_corpus(df_GenX_bottoms)

# fit the vectorizer on the normalized corpus and build the document-term matrix
bow_news_corpus = bow_vectorizer_news.fit_transform(normalized_corpus_news)

In [None]:
# Fit a 5-topic LDA model on the Bag-of-Words matrix.
# doc_topic_prior and topic_word_prior are the two Dirichlet prior parameters.
# random_state is fixed so that the fitted topics (and therefore the topic numbering,
# top words and evaluation scores shown below) are reproducible across runs.
lda_news = LatentDirichletAllocation(
    n_components=5,
    max_iter=100,
    doc_topic_prior=0.25,
    topic_word_prior=0.25,
    random_state=42,
).fit(bow_news_corpus)

Display results with top 10 words for each topic:

In [None]:
no_top_words_news = 10

# Vocabulary terms of the fitted vectorizer. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and fall
# back to the old method only on older installations.
if hasattr(bow_vectorizer_news, "get_feature_names_out"):
    feature_names_news = list(bow_vectorizer_news.get_feature_names_out())
else:
    feature_names_news = bow_vectorizer_news.get_feature_names()

display_topics(lda_news, feature_names_news, no_top_words_news)

Display **word vectors** (words are in alphabetical order) for each topic. Each column is a topic:

In [None]:
# Vocabulary terms of the fitted vectorizer. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() and fall back on old versions.
if hasattr(bow_vectorizer_news, "get_feature_names_out"):
    vocabulary_terms = list(bow_vectorizer_news.get_feature_names_out())
else:
    vocabulary_terms = bow_vectorizer_news.get_feature_names()

# Divide each topic's row of components_ by its row sum so the word weights of a topic sum to 1
word_weights = lda_news.components_ / lda_news.components_.sum(axis=1)[:, np.newaxis]

# One row per word, one column per topic; the number of topic columns is taken from
# the fitted model instead of being hard-coded.
word_weights_df = pd.DataFrame(word_weights.T,
                               index = vocabulary_terms,
                               columns = ["Topic_" + str(i) for i in range(word_weights.shape[0])])
word_weights_df.head(10)

Now, **sort by word weights in Topic 0** (descending order) and look at the 10 words with the highest weights in Topic 0:

In [None]:
# order the words by their Topic_0 weight, largest first, and show the first 10 rows
topic0_sorted = word_weights_df.sort_values(by="Topic_0", ascending=False)
topic0_sorted.head(10)

In [None]:
# pyLDAvis 3.4 renamed its scikit-learn adapter from pyLDAvis.sklearn to
# pyLDAvis.lda_model; import whichever one this installation provides.
try:
    import pyLDAvis.lda_model as pyldavis_adapter
except ImportError:
    import pyLDAvis.sklearn as pyldavis_adapter

# prepare to display the result inside the notebook
pyLDAvis.enable_notebook()

# run the visualization [mds selects the method used to lay out the "distance" between topics]
pyldavis_adapter.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')

In [None]:
# topic weights of every document, as inferred by the fitted LDA model
lda_news_topic_weights = lda_news.transform(X=bow_news_corpus)

Let's convert lda_news_topic_weights into a nice-looking dataframe and have a look at the computed topic weights in documents:

In [None]:
# Document and topic "names" are just indices. Both counts are read from the
# weight matrix itself (rows = documents, columns = topics) so the labels stay
# correct if the number of topics in the model is changed.
n_docs, n_topics = lda_news_topic_weights.shape
doc_names = ["Doc_" + str(i) for i in range(n_docs)]
topic_names = ["Topic_" + str(i) for i in range(n_topics)]

# convert to a dataframe with weights rounded to 5 decimals
df_document_topic = pd.DataFrame(np.round(lda_news_topic_weights, 5), columns=topic_names, index=doc_names)
df_document_topic.head(5)

The topic with the highest weight in each document is the **dominant topic**. The weights across the 5 topics sum to 1. Let's add a column that shows the dominant topic for each document:

In [None]:
# Column index of the highest topic weight in each row. Only the topic-weight
# columns are passed to argmax, so re-running this cell (after 'dominant_topic'
# has already been added to the dataframe) still gives the same result.
dominant_topic = np.argmax(df_document_topic[topic_names].values, axis=1)

# add dominant_topic as a column to df_document_topic
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic.head(5)

**Topic Model Evaluation: Log-likelihood, Perplexity and Coherence Scores**

Log-likelihood, perplexity and coherence score are **measures of performance** for a topic model. They are used for comparing and discriminating between topic models estimated on the same data. These scores **do not have** a baseline or threshold value, and are therefore useful only for comparing models.

How do you specify different models? You can set a **different number of topics** and also experiment with the **parameters of the Dirichlet distributions**.

#### **Coherence Score**

We will use a function **CoherenceModel()** from the **gensim** module (you can also explore that package as it can be used to estimate an LDA model). The sklearn module does not have the functionality to compute the coherence score. Let's install the gensim package and the functions needed:

In [None]:
!{sys.executable} -m pip install gensim
import gensim

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

The function CoherenceModel() needs as **inputs**:

**1. Dictionary of the corpus**<br>
**2. Corpus with each document represented as Bag-of-Words**<br>
**3. An array of top words for each topic: we'll have top 20 words for each topic** 
  
We will now create those objects:

In [None]:
# Tokenize every normalized document. The corpus is iterated directly instead of
# being indexed by position, which also works when the corpus is not positionally
# indexed (e.g. a pandas Series that kept a non-default index).
news_corpus_tokenized = [tokenize_text(doc) for doc in normalized_corpus_news]

# gensim Dictionary built from the tokenized corpus
news_dictionary = Dictionary(news_corpus_tokenized)

# Bag-of-words representation of each document, encoded with that dictionary
news_corpus_bow = [news_dictionary.doc2bow(doc) for doc in news_corpus_tokenized]

# Top 20 words for each topic (get_topic_words is the helper defined in session prep)
topic_topwords = get_topic_words(vectorizer = bow_vectorizer_news, lda_model = lda_news, n_words=20)

Now let's compute **the coherence score for the model overall**. We use one of the coherence metrics "u-mass" which measures semantic similarity of words in a topic, but there are other metrics as well.

*Note: You can check out different coherence metrics here if you are interested: https://dl.acm.org/doi/abs/10.1145/2684822.2685324*

In [None]:
# u_mass coherence model built from the topics' top words, the gensim
# bag-of-words corpus and its dictionary (`cm` is reused by the per-topic cell below)
cm = CoherenceModel(
    topics=topic_topwords,
    corpus=news_corpus_bow,
    dictionary=news_dictionary,
    coherence='u_mass',
)

# overall coherence of the model, rounded to 5 decimals
model_coherence = np.round(cm.get_coherence(), 5)
print("Coherence score for the model: ", model_coherence)

You can also see **coherence scores by topic**:

In [None]:
# coherence value of each individual topic, rounded to 5 decimals
per_topic_coherence = np.round(cm.get_coherence_per_topic(), 5)
print("Coherence score by topic (higher values are better): ", per_topic_coherence)

**Log-Likelihood Score**

To compute the log-likelihood score we call the **.score()** method of our fitted LDA model:

In [None]:
# score of the Bag-of-Words corpus under the fitted LDA model
log_likelihood_news = lda_news.score(bow_news_corpus)
print("Log-Likelihood (higher values are better): ", log_likelihood_news)

**Perplexity Score**

To compute the perplexity score we call the **.perplexity()** method of our fitted LDA model:

In [None]:
# perplexity of the Bag-of-Words corpus under the fitted LDA model
perplexity_news = lda_news.perplexity(bow_news_corpus)
print("Perplexity (lower values are better): ", perplexity_news)

<br>**NOTE:** Generally, you can write a simple script that selects the best topic model **automatically** based on a criterion for "best model" (log-likelihood, perplexity, or coherence score). The script can vary both parameters of the Dirichlet distributions and the number of topics, or just the number of topics.