# Visualizing Topics

The visualization will allow us to quickly see words that are most relevant to a topic and the distances between topics.

In [3]:
#!pip install --upgrade jupyter_client


In [1]:
#%pip install pyLDAvis

IMPORT LIBRARIES

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import pyLDAvis.lda_model

nltk.download("stopwords")
nltk.download("wordnet")

import warnings
warnings.filterwarnings("ignore", message=r"datetime\.datetime\.utcnow\(\) is deprecated")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


LOAD THE DATASET

In [4]:
document = pd.read_csv("/content/bbc-text.csv")
document.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


TEXT PROCESSING

In [5]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def process_text(text):
  text = text.lower()
  tokens = text.split()
  tokens = [word for word in tokens if word.isalpha()]
  tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
  return " ".join(tokens)

document["clean_text"] = document["text"].astype(str).apply(process_text)

DOCUMENT-TERM MATRIX

In [7]:
vectorizer = CountVectorizer(
    max_df = 0.95,
    min_df = 5
)

dtm = vectorizer.fit_transform(document["clean_text"])

TRAIN LDA TOPIC MODEL

In [8]:
num_topics = 5

lda_model = LatentDirichletAllocation(
    n_components = num_topics,
    random_state = 42,
    learning_method = "batch"
)

lda_model.fit(dtm)

DISPLAY TOPICS

In [10]:
feature_names = vectorizer.get_feature_names_out()

def display_topics(model, feature_names, n_words = 10):
  for idx, topic in enumerate(model.components_):
    print(f"\nTopic {idx + 1}:")
    print(" ".join([feature_names[i]
                    for i in topic.argsort()[:-n_words - 1:-1]]))

display_topics(lda_model, feature_names)


Topic 1:
game england player club would said wale back side first

Topic 2:
said year company firm market also bank would sale new

Topic 3:
film best year said one award also first last world

Topic 4:
said people game mobile technology phone new also one service

Topic 5:
mr said would government labour people party minister say blair


VISUALIZE TOPICS

In [12]:
pyLDAvis.enable_notebook()

lda_vis = pyLDAvis.lda_model.prepare(
    lda_model,
    dtm,
    vectorizer
)

lda_vis