# Latent Dirichlet Allocation
---
  
**LDA**  
- Generative Probabilistic Topic Model. 
- Assumes documents are a mixture of topics and that each word in the document is attributable to a topic  
- The topic probabilities provide an explicit representation of a document

## Env Preparation

In [1]:
RUNTIMES = ['cluster', 'local'] # Modify the paths from the coonfig.yaml
RUNTIME = RUNTIMES[0]

In [2]:
import os
import sys
# os.chdir('/Users/pabloruizruiz/OneDrive/Courses/NLP_Stanford/Complementary_Courses/Document_Clustering')
os.chdir('/app/')
print('Workdir: ', os.getcwd())
from os.path import join as JP
sys.path.append(os.getcwd())
sys.path.append(JP(os.getcwd(),'utils'))
sys.path.append(JP(os.getcwd(),'scripts'))

Workdir:  /app


In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import plotly.express as px
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

tqdm.pandas()
%matplotlib notebook
np.set_printoptions(precision=3)
pd.options.display.float_format = '{:,.3f}'.format

In [5]:
import pickle
from pprint import pprint
from collections import defaultdict

from utils.nlp_utils import preproces
from utils.general import parse_yaml, ensure_directories

from scripts.catalog import (
    Catalog, Document, Corpus,
    load_catalog, load_corpus)

config = parse_yaml('config.yaml')
paths = config['paths']
ensure_directories(paths)
paths

{'results': 'results/',
 'catalog': 'data/catalog',
 'checkpoints': 'checkpoints/',
 'stopwords': 'data/stopwords/',
 'data': 'data/',
 'images': 'images/'}

In [8]:
from nltk.corpus import stopwords
SW = stopwords.words('english')
# with open(JP(paths['stopwords'],'nltk_stopwords.pkl'), 'rb') as f:
#     SW = pickle.load(f)

## Data

In [9]:
name = 'bbc-text-processed.csv'

In [10]:
data = pd.read_csv(JP('data',name), index_col=0)
data.head()

Unnamed: 0,category,text,lenght,processed
0,tech,tv future in the hands of viewers with home th...,806,tv future hand viewer system plasma high defin...
1,business,worldcom boss left books alone former worldc...,332,left book ebber accuse oversee 11bn fraud acco...
2,sport,tigers wary of farrell gamble leicester say ...,270,wary farrell gamble rush make bid decide switc...
3,sport,yeading face newcastle in fa cup premiership s...,390,yeade face premiership face trip leader yeade ...
4,entertainment,ocean s twelve raids box office ocean s twelve...,287,raid ocean crime go number office chart take w...


## TFIDF

In [11]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
EMBED_SIZE = 10000 
NUM_CLUSTERS = data['category'].nunique()
WORDS_PER_CLUSTER = None
print(NUM_CLUSTERS)

5


In [13]:
documents = [Document() for i in range(data.shape[0])]
for d in range(len(documents)):
    documents[d].processed_text = data['processed'][d]

In [14]:
catalog = Catalog()
catalog.documents = documents

In [15]:
vectorizer = TfidfVectorizer(
    min_df=.05,
    max_df=.8,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    max_features=EMBED_SIZE,
    ngram_range=(1,3),
    lowercase=True,
    stop_words=SW)

In [16]:
_ = catalog.collect_corpus(attr='processed_text', form=list)
tfidf = catalog.to_matrix(
    vectorizer=vectorizer,
    modelname='TFIDF',
    max_docs=None)
print(tfidf.representation.shape)
tfidf.representation.head()

(2225, 460)


Unnamed: 0,able,accept,access,accord,account,accuse,act,action,add,admit,...,week,weekend,win,winner,woman,work,world,write,year,young
0,0.0,0.0,0.0,0.037,0.0,0.0,0.0,0.0,0.027,0.0,...,0.0,0.0,0.0,0.0,0.0,0.029,0.0,0.0,0.038,0.0
1,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.239,0.066,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.115,0.0,0.0,0.0,0.0,0.0,0.0,...,0.094,0.316,0.087,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Sklearn Topic Modelling

- 1: Latent Dirichlet Allocation
- 2: Non Negative Matrix Factorization

In [17]:
NUM_TOPICS = NUM_CLUSTERS

In [18]:
def selected_topics(model, vectorizer, top_n=10):
    ''' Functions for printing keywords for each topic '''
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([vectorizer.get_feature_names()[i] for i in topic.argsort()[:-top_n - 1:-1]]) 

### Latente Dirichlet Allocation

In [17]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
lda = LatentDirichletAllocation(
    n_components=NUM_TOPICS, 
    learning_method='batch', 
    max_iter=100, 
    batch_size=128, 
    random_state=2019, 
    verbose=0)
lda

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=100,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=2019,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

**Load Pretrained Model**

In [19]:
with open(JP(paths['checkpoints'], 'lda_sklearn.pkl'), 'rb') as obj:
    lda = pickle.load(obj)
lda

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=True)

**Train Model**

In [32]:
print('Latent Semantion Allocation')
data_lda = lda.fit_transform(tfidf.representation)

Latent Semantion Allocation
iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100
iteration: 15 of max_iter: 100
iteration: 16 of max_iter: 100
iteration: 17 of max_iter: 100
iteration: 18 of max_iter: 100
iteration: 19 of max_iter: 100
iteration: 20 of max_iter: 100
iteration: 21 of max_iter: 100
iteration: 22 of max_iter: 100
iteration: 23 of max_iter: 100
iteration: 24 of max_iter: 100
iteration: 25 of max_iter: 100
iteration: 26 of max_iter: 100
iteration: 27 of max_iter: 100
iteration: 28 of max_iter: 100
iteration: 29 of max_iter: 100
iteration: 30 of max_iter: 100
iteration: 31 of max_iter: 100
iteration: 32 of max

In [33]:
# Keywords for topics clustered by Latent Dirichlet Allocation
print("LDA Model:")
selected_topics(lda, vectorizer, 8)

LDA Model:
Topic 0:
['game', 'win', 'play', 'player', 'match', 'team', 'club', 'injury']
Topic 1:
['government', 'election', 'party', 'labour', 'people', 'plan', 'tory', 'law']
Topic 2:
['technology', 'people', 'mobile', 'phone', 'user', 'computer', 'service', 'software']
Topic 3:
['film', 'award', 'star', 'music', 'include', 'year', 'win', 'release']
Topic 4:
['company', 'sale', 'share', 'firm', 'market', 'price', 'year', 'rise']


**Save Model**

In [34]:
with open(JP(paths['checkpoints'], 'lda_sklearn.pkl'), 'wb') as obj:
    pickle.dump(lda,obj)

#### Visualization

In [20]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [27]:
pyLDAvis.sklearn.prepare(lda, tfidf.dtm_sparse, tfidf.mapping, mds='tsne')

## Non-Negative Matrix Factorization

In [37]:
nmf = NMF(n_components=NUM_TOPICS)
nmf

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [28]:
print('Non-Negative Matrix Factorization')
data_nmf = nmf.fit_transform(tfidf.representation)

Non-Negative Matrix Factorization


In [29]:
# Keywords for topics clustered by Latent Semantic Indexing
print("NMF Model:")
selected_topics(nmf, vectorizer, 8)

NMF Model:
Topic 0:
['sale', 'year', 'rise', 'company', 'market', 'growth', 'share', 'price']
Topic 1:
['game', 'win', 'play', 'player', 'match', 'team', 'final', 'injury']
Topic 2:
['film', 'award', 'star', 'win', 'movie', 'include', 'director', 'year']
Topic 3:
['election', 'party', 'labour', 'government', 'tory', 'plan', 'tax', 'leader']
Topic 4:
['people', 'mobile', 'phone', 'technology', 'user', 'music', 'service', 'computer']


**Save Model**

In [30]:
with open(JP(paths['checkpoints'], 'NNMF.pkl'), 'wb') as obj:
    pickle.dump(nmf,obj)

**Load Model**

In [38]:
with open(JP(paths['checkpoints'], 'NNMF.pkl'), 'rb') as obj:
    nmf = pickle.load(obj)
nmf

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)