# Latent Dirichlet Allocation (LDA)

In [8]:
# !pip install nltk

In [1]:
import nltk

# Fetch the NLTK resources this notebook relies on: the stop-word list, the
# tokenizer models (presumably what `word_tokenize` needs), and the WordNet data
# (presumably what `WordNetLemmatizer` needs). `quiet=True` is passed to every call.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)      # For nltk<3.9.0
nltk.download('punkt_tab', quiet=True)  # For nltk>=3.9.0
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

True

In [2]:
import pandas as pd
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# The CSV has no header row, so read it raw and label its single column 'text'.
data = (
    pd.read_csv("../data/nlp/lda_data.csv", header=None)
    .set_axis(['text'], axis="columns")
)
data.head()

Unnamed: 0,text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(1199, 1)

## (1) Preprocessing

In [5]:
# Translation table that turns every ASCII punctuation mark into a space.
# Built once instead of looping over `string.punctuation` for each document.
_PUNCTUATION_TO_SPACE = str.maketrans({mark: ' ' for mark in string.punctuation})

# Lazily filled cache for the objects that are identical for every document.
_PREPROCESSING_RESOURCES = {}


def _load_preprocessing_resources():
    """Return ``(stop_words, lemmatizer)``, creating them on first use only."""
    if not _PREPROCESSING_RESOURCES:
        # `stopwords.words` is called once here rather than once per document.
        _PREPROCESSING_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESSING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return (_PREPROCESSING_RESOURCES['stop_words'],
            _PREPROCESSING_RESOURCES['lemmatizer'])


def preprocessing(text):
    """Clean one raw document for bag-of-words modelling.

    Steps, in order: replace punctuation with spaces, lowercase, tokenize,
    keep purely alphabetic tokens, drop English stop words, lemmatize, and
    join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Space-separated cleaned tokens (empty string if nothing survives).
    """
    stop_words, lemmatizer = _load_preprocessing_resources()

    without_punctuation = text.translate(_PUNCTUATION_TO_SPACE)  # Remove punctuation
    tokenized = word_tokenize(without_punctuation.lower())       # Lower case + tokenize

    lemmatized = [
        lemmatizer.lemmatize(word)
        for word in tokenized
        # isalpha() drops numbers and mixed alphanumeric tokens.
        if word.isalpha() and word not in stop_words
    ]
    return ' '.join(lemmatized)  # Join back to a string

# Clean every document and store the result next to the raw text.
data["clean_text"] = data["text"].apply(preprocessing)
data.head()

Unnamed: 0,text,clean_text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...,gld cunixb cc columbia edu gary l dare subject...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...,atterlep vela ac oakland edu cardinal ximenez ...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...,miner kuhub cc ukans edu subject ancient book ...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...,atterlep vela ac oakland edu cardinal ximenez ...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...,vzhivov superior carleton ca vladimir zhivov s...


## (2) Latent Dirichlet Allocation model

In [6]:
# Bag-of-words representation of the cleaned documents (term counts per document).
vectorizer = CountVectorizer()

data_vectorized = vectorizer.fit_transform(data['clean_text'])

# LDA fitting is stochastic: without a fixed seed the topic order and weights
# change on every run, so the outputs below could not be reproduced.
lda_model = LatentDirichletAllocation(n_components=2, random_state=42)

# Document-topic mixture of the training corpus.
lda_vectors = lda_model.fit_transform(data_vectorized)

## (3) Visualize potential topics

In [7]:
def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model
        Fitted topic model exposing ``components_`` (one row of word weights
        per topic).
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()``; its vocabulary
        order must match the columns of ``model.components_``.
    n_top_words : int, default 10
        Number of ``(word, weight)`` pairs printed per topic.
    """
    # Look the vocabulary up once; the original rebuilt it for every printed word.
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        # argsort is ascending, so reverse it to get the largest weights first.
        top_indices = topic.argsort()[::-1][:n_top_words]
        print("Topic %d:" % (idx))
        print([(feature_names[i], topic[i]) for i in top_indices])

In [8]:
# Show the highest-weighted words of each fitted topic.
print_topics(lda_model, vectorizer)

Topic 0:
[('edu', np.float64(1112.7546814263246)), ('team', np.float64(958.471618477281)), ('game', np.float64(951.0308573666011)), ('line', np.float64(726.1948359443907)), ('ca', np.float64(682.3228033582553)), ('hockey', np.float64(649.4886959343773)), ('subject', np.float64(641.102028749697)), ('organization', np.float64(621.9631850551654)), ('player', np.float64(529.4166041543274)), ('play', np.float64(521.8453251425728))]
Topic 1:
[('god', np.float64(1525.4536590443647)), ('edu', np.float64(1015.2453185736221)), ('one', np.float64(842.8832041691405)), ('christian', np.float64(817.0412980297596)), ('would', np.float64(803.3720757016636)), ('people', np.float64(695.0243179589836)), ('subject', np.float64(665.8979712502531)), ('line', np.float64(630.8051640555591)), ('jesus', np.float64(626.4875541757301)), ('organization', np.float64(550.0368149447843))]


## (4) Predict the document-topic mixture of a new text

In [9]:
example = ["My team performed poorly last season. Their best player was out injured and only played one game"]

# New text goes through the same cleaning as the training corpus so that its
# tokens line up with the vocabulary the vectorizer was fitted on.
example_clean = [preprocessing(text) for text in example]
example_vectorized = vectorizer.transform(example_clean)

# Stored under its own name so the training document-topic matrix kept in
# `lda_vectors` is not overwritten by this single-example result.
example_topic_mixture = lda_model.transform(example_vectorized)

# One line per topic, whatever `n_components` the model was fitted with.
for topic_idx, weight in enumerate(example_topic_mixture[0]):
    print("topic %d :" % topic_idx, weight)

topic 0 : 0.9559150708165862
topic 1 : 0.044084929183413786
