# LDA
- LDA is a probabilistic model where each word is assigned to a topic and each document is assigned to a topic based on its content.


In [1]:
!kaggle datasets download -d beridzeg45/guardian-environment-related-news

Dataset URL: https://www.kaggle.com/datasets/beridzeg45/guardian-environment-related-news
License(s): other
Downloading guardian-environment-related-news.zip to e:\nlp




  0%|          | 0.00/57.3M [00:00<?, ?B/s]
  2%|▏         | 1.00M/57.3M [00:00<00:39, 1.49MB/s]
  5%|▌         | 3.00M/57.3M [00:00<00:13, 4.22MB/s]
  9%|▊         | 5.00M/57.3M [00:01<00:08, 6.28MB/s]
 10%|█         | 6.00M/57.3M [00:01<00:09, 5.46MB/s]
 12%|█▏        | 7.00M/57.3M [00:01<00:08, 6.12MB/s]
 14%|█▍        | 8.00M/57.3M [00:01<00:10, 5.03MB/s]
 16%|█▌        | 9.00M/57.3M [00:01<00:09, 5.32MB/s]
 17%|█▋        | 10.0M/57.3M [00:02<00:09, 5.35MB/s]
 19%|█▉        | 11.0M/57.3M [00:02<00:09, 5.34MB/s]
 21%|██        | 12.0M/57.3M [00:02<00:08, 5.40MB/s]
 23%|██▎       | 13.0M/57.3M [00:02<00:08, 5.41MB/s]
 24%|██▍       | 14.0M/57.3M [00:02<00:08, 5.42MB/s]
 26%|██▌       | 15.0M/57.3M [00:03<00:08, 5.40MB/s]
 28%|██▊       | 16.0M/57.3M [00:03<00:07, 5.43MB/s]
 30%|██▉       | 17.0M/57.3M [00:03<00:07, 5.40MB/s]
 31%|███▏      | 18.0M/57.3M [00:03<00:08, 4.96MB/s]
 33%|███▎      | 19.0M/57.3M [00:03<00:07, 5.57MB/s]
 35%|███▍      | 20.0M/57.3M [00:04<00:07, 5.42MB/s]
 

In [2]:
!unzip guardian-environment-related-news

Archive:  guardian-environment-related-news.zip
  inflating: guardian_environment_news.csv  


In [4]:
import pandas as pd

data = pd.read_csv('guardian_environment_news.csv')
data_text = data[['Article Text']]
data_text['index'] = data_text.index
documents = data_text
documents = documents.dropna()
documents.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text['index'] = data_text.index


Article Text    0
index           0
dtype: int64

In [5]:
print(len(documents))
documents.columns = ['article_text', 'index']
documents.head()

29691


Unnamed: 0,article_text,index
0,Liz Truss will sign off on a push for more oil...,0
1,It is an area so tranquil that the notion of b...,1
2,"Visits to parks, community gardens and other u...",2
3,"I devised today’s nut roast for Oddbox, a veg ...",3
4,‘Constant companions to our gardening’A peacoc...,4


# Data Preprocessing

- `Tokenization`
- Remove `stopwords` and whose len < 3
- words are `lemmatized` : third person -> first & verbs -> present
- words are `stemmed` - words are reduces to their root form

### Using `gensim` & `nltk`

In [17]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import PorterStemmer
import numpy as np

import nltk
nltk.download('wordnet')
nltk.download('corpora/wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dasha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Error loading corpora/wordnet: Package 'corpora/wordnet'
[nltk_data]     not found in index


False

In [18]:
import zipfile
import os

zip_path = str(nltk.find('corpora/wordnet.zip'))
print(zip_path)
# Directory to extract to
extract_to = 'C:/Users/dasha/AppData/Roaming/nltk_data/corpora/'

# Check if the zip file exists
if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print('Extraction successful!')
else:
    print('Zip file not found!')


C:\Users\dasha\AppData\Roaming\nltk_data\corpora\wordnet.zip
Extraction successful!


In [19]:
stemmer = SnowballStemmer("english")
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [20]:
# checking the preprocessing on an random doc
import random
rand_idx = random.randint(0, len(documents))

doc_sample = documents[documents['index'] == rand_idx].values[0][0]

words = []
for word in doc_sample.split():
    words.append(word)
print("Original document: \n", words[:20])
print("\nTokenized document: \n", preprocess(doc_sample)[:20])

Original document: 
 ['The', 'US', 'is', 'set', 'to', 'impose', 'new', 'carbon', 'pollution', 'standards', 'upon', 'its', 'coal-', 'and', 'gas-fired', 'power', 'plants,', 'in', 'a', 'move']

Tokenized document: 
 ['impos', 'carbon', 'pollut', 'standard', 'coal', 'fire', 'power', 'plant', 'biden', 'administr', 'hail', 'major', 'step', 'confront', 'climat', 'crisi', 'rule', 'forward', 'environment', 'protect']


In [21]:
%%time

processed_docs = documents['article_text'].map(preprocess)
processed_docs[:5]

CPU times: total: 25.9 s
Wall time: 2min 51s


0    [truss, sign, push, drill, north, win, conserv...
1    [area, tranquil, notion, bitter, disput, huge,...
2    [visit, park, communiti, garden, urban, green,...
3    [devis, today, roast, oddbox, outfit, support,...
4    [constant, companion, garden, peacock, butterf...
Name: article_text, dtype: object

# Bag of Words 
- keeping track of occurance of a word in the docs

In [38]:
dictionary = gensim.corpora.Dictionary(processed_docs) # type: ignore

for i, (k, v) in enumerate(dictionary.iteritems()):
    print(v, "-", k)
    if i > 10:
        break

accord - 0
acquiesc - 1
action - 2
add - 3
advis - 4
alongsid - 5
amid - 6
astronom - 7
averag - 8
backdrop - 9
begin - 10
bill - 11


In [23]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

### Gensim doc2bow
- here for each article_text we create a list containing the tuple -> (word, count)

In [24]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[rand_idx][:5] # showing the first 5 of the article_text at rand_index

[(3, 1), (8, 1), (14, 2), (17, 2), (25, 1)]

- previewing the Bag Of Words for this random article_text

In [25]:
bow_doc_rand_idx = bow_corpus[rand_idx]

for i in range(len(bow_doc_rand_idx)):
    print(f"Word {bow_doc_rand_idx[i][0]} ('{dictionary[bow_doc_rand_idx[i][0]]}') appears {bow_doc_rand_idx[i][1]} times")
    # showing the first 20
    if i == 20:
        break

Word 3 ('add') appears 1 times
Word 8 ('averag') appears 1 times
Word 14 ('bring') appears 2 times
Word 17 ('campaign') appears 2 times
Word 25 ('compani') appears 1 times
Word 26 ('conserv') appears 1 times
Word 29 ('crisi') appears 1 times
Word 31 ('daili') appears 2 times
Word 43 ('email') appears 3 times
Word 45 ('environment') appears 1 times
Word 59 ('follow') appears 1 times
Word 60 ('fossil') appears 4 times
Word 63 ('fuel') appears 4 times
Word 68 ('global') appears 2 times
Word 70 ('guardian') appears 2 times
Word 72 ('help') appears 1 times
Word 82 ('issu') appears 1 times
Word 91 ('make') appears 3 times
Word 93 ('minist') appears 14 times
Word 97 ('near') appears 1 times
Word 105 ('plan') appears 3 times


# TF-IDF
- measuring the importance of each terms wrt to the docs

In [26]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint

for doc in corpus_tfidf:
#     pprint(doc)
    for i in range(len(doc)):
        print(dictionary[doc[i][0]], doc[i][1]) # word : significance
    break

accord 0.021720469344865333
acquiesc 0.13370066637362965
action 0.022747440597037094
add 0.02555825704747701
advis 0.04687115007599941
alongsid 0.05060409589593413
amid 0.10154038427481077
astronom 0.1066579402076301
averag 0.03826728129436504
backdrop 0.08911723157013195
begin 0.02740707839979547
bill 0.21957816352952608
billion 0.046908926527445174
brexit 0.05873605543784116
bring 0.04997892807397803
busi 0.07531999745073897
call 0.03891221274467663
campaign 0.027461320698540632
cash 0.06501048142775517
centuri 0.03701860482255549
chief 0.034108586641713026
choke 0.08291800622706784
closur 0.14115708934988128
coast 0.041013976278716446
committe 0.04954980567719998
compani 0.02360565663405371
conserv 0.028331041491428476
contin 0.06357514999565907
countri 0.0150928292604786
crisi 0.04547793320996474
critic 0.03324952940011084
daili 0.0351666233941124
decad 0.024597704669070155
depend 0.04234001409531421
discuss 0.1320128215012223
dismay 0.09238506911016088
domest 0.051353683719497153


# Running LDA with Bag Of Words

In [27]:
%%time
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=10,
                                       id2word=dictionary,
                                       passes=2,
                                       workers=2)

CPU times: total: 4.94 s
Wall time: 29 s


In [28]:
for idx, article in lda_model.print_topics(-1):
    print(f"Topic: {idx} Word: {article}") # index -> weight*word

Topic: 0 Word: 0.015*"energi" + 0.011*"emiss" + 0.008*"australia" + 0.008*"power" + 0.007*"coal" + 0.007*"carbon" + 0.007*"plan" + 0.006*"electr" + 0.006*"project" + 0.006*"industri"
Topic: 1 Word: 0.011*"fish" + 0.010*"ocean" + 0.009*"research" + 0.008*"whale" + 0.007*"studi" + 0.007*"water" + 0.006*"scientist" + 0.005*"univers" + 0.005*"marin" + 0.004*"temperatur"
Topic: 2 Word: 0.012*"speci" + 0.010*"farm" + 0.009*"anim" + 0.008*"protect" + 0.007*"farmer" + 0.007*"natur" + 0.006*"conserv" + 0.006*"wildlif" + 0.006*"land" + 0.006*"environ"
Topic: 3 Word: 0.013*"countri" + 0.010*"global" + 0.007*"emiss" + 0.005*"meat" + 0.005*"nation" + 0.005*"report" + 0.005*"food" + 0.004*"crisi" + 0.004*"australia" + 0.004*"develop"
Topic: 4 Word: 0.016*"plastic" + 0.009*"wast" + 0.008*"recycl" + 0.007*"compani" + 0.005*"product" + 0.005*"environment" + 0.005*"food" + 0.004*"work" + 0.004*"industri" + 0.003*"mine"
Topic: 5 Word: 0.005*"tree" + 0.005*"photograph" + 0.005*"bird" + 0.004*"plant" + 0.0

# Running LDA using TF-IDF
- TF-IDF is a measure of how important a word is to a document in a collection or corpus. its a frequency based representation. TF-IDF = TF(term frequency) * IDF(Inverse document frequency)


In [33]:
%%time
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf,
                                             num_topics=10,
                                             id2word=dictionary,
                                             passes=20,
                                             workers=2,
                                             iterations=400)

CPU times: total: 3min 55s
Wall time: 10min 40s


In [34]:
for idx, article in lda_model_tfidf.print_topics(0):
    print(f"Topic: {idx} Word: {article}") # index -> weight*word

Topic: 2 Word: 0.003*"emiss" + 0.003*"energi" + 0.003*"australia" + 0.002*"carbon" + 0.002*"compani" + 0.002*"coal" + 0.002*"electr" + 0.002*"fuel" + 0.002*"plan" + 0.002*"power"
Topic: 7 Word: 0.005*"ridd" + 0.005*"bernhardt" + 0.003*"accc" + 0.003*"sunscreen" + 0.003*"beurden" + 0.003*"britishvolt" + 0.002*"mekong" + 0.002*"jenrick" + 0.002*"contrail" + 0.002*"fishmeal"
Topic: 3 Word: 0.008*"zink" + 0.005*"stork" + 0.004*"natal" + 0.003*"everglad" + 0.003*"formosa" + 0.003*"ipb" + 0.003*"wolverin" + 0.003*"kakadu" + 0.003*"hairstreak" + 0.003*"yanomami"
Topic: 5 Word: 0.005*"cácere" + 0.005*"whitehaven" + 0.004*"mauritius" + 0.004*"flemish" + 0.004*"virunga" + 0.004*"sandpip" + 0.003*"interconnector" + 0.003*"royc" + 0.003*"hitachi" + 0.002*"denka"
Topic: 9 Word: 0.037*"frack" + 0.020*"cartoon" + 0.018*"moon" + 0.017*"shale" + 0.014*"cuadrilla" + 0.007*"ineo" + 0.007*"earthquak" + 0.006*"preston" + 0.006*"tremor" + 0.006*"glencor"
Topic: 6 Word: 0.028*"protest" + 0.017*"polic" + 0.01

# Testing

In [35]:
unseen_doc = """
    Now, emerging evidence suggests reindeer may play a fundamental role in helping to preserve this entire ecosystem, including the snow cover, the open forest with its low-growing berry bushes, mosses and lichen an organism formed by a close association of fungi and algae and even the cold winter climate.
"""

bow_vector = dictionary.doc2bow(preprocess(unseen_doc))

In [36]:
# LDA TF-IDF
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.6993319392204285	 
Topic: 0.003*"speci" + 0.002*"fish" + 0.002*"bird" + 0.002*"tree" + 0.002*"anim" + 0.002*"park" + 0.002*"whale" + 0.002*"wildlif" + 0.002*"water" + 0.002*"photograph"

Score: 0.2730673849582672	 
Topic: 0.005*"bird" + 0.005*"flower" + 0.004*"tree" + 0.004*"wood" + 0.004*"wing" + 0.004*"butterfli" + 0.003*"diari" + 0.003*"winter" + 0.003*"spring" + 0.003*"white"


In [37]:
# LDA Bag Of Words
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.6745323538780212	 
Topic: 0.005*"tree" + 0.005*"photograph" + 0.005*"bird" + 0.004*"plant" + 0.004*"look" + 0.003*"natur" + 0.003*"leav" + 0.003*"work" + 0.003*"live" + 0.003*"know"

Score: 0.29786381125450134	 
Topic: 0.011*"fish" + 0.010*"ocean" + 0.009*"research" + 0.008*"whale" + 0.007*"studi" + 0.007*"water" + 0.006*"scientist" + 0.005*"univers" + 0.005*"marin" + 0.004*"temperatur"
