In [0]:
import pandas as pd

data = pd.read_csv('data', sep=",", header=None)

data.columns = ['text']

data.head()

Unnamed: 0,text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...


## Base LDA

- Vectorize the data with a default TfidfVectorizer
- Train an 2 component LDA with your vectorized data
- Visualize your LDA

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = TfidfVectorizer()

data_vectorized = vectorizer.fit_transform(data['text'])

lda_model = LatentDirichletAllocation(n_components=2)

lda_vectors = lda_model.fit_transform(data_vectorized)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('gak', 1.0178950282305435), ('gilligan', 0.8413902912842923), ('howell', 0.8413902911581117), ('dee', 0.7918388420532556), ('wrs', 0.7916063155676158), ('qtr', 0.7381305359607561), ('ddf', 0.7274768532911369), ('196', 0.7227882283184413), ('168', 0.7073526812381591), ('172', 0.7009889040740068)]
Topic 1:
[('the', 180.37107555430543), ('to', 92.15906155311907), ('of', 87.90942611325751), ('and', 70.98849382408052), ('in', 70.57304743469027), ('that', 65.85997167550374), ('is', 64.04727363800576), ('it', 46.80275845082672), ('you', 44.435606975314975), ('edu', 39.12297201556561)]


## Optimize LDA

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = TfidfVectorizer(stop_words = 'english',min_df = 0.05, max_df = 0.75, lowercase=True)

data_vectorized = vectorizer.fit_transform(data['text'])

lda_model = LatentDirichletAllocation(n_components=2)

lda_vectors = lda_model.fit_transform(data_vectorized)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('edu', 63.160981871020525), ('ca', 62.90683141208215), ('team', 46.1298386945226), ('game', 44.16088609423095), ('hockey', 43.79535728081677), ('university', 34.07219536492976), ('nhl', 32.0771530754927), ('play', 31.206843171587217), ('posting', 29.637395355032805), ('year', 29.478196457721385)]
Topic 1:
[('god', 78.67696277433242), ('edu', 51.625914075645845), ('people', 44.61910751084179), ('jesus', 40.60872452915141), ('church', 34.97325023824148), ('com', 33.101125936826506), ('christians', 31.762193525483482), ('christian', 31.352561246203496), ('believe', 30.48790429983994), ('know', 29.766393357662892)]


## Predict topic of new text

- Vectorize the example
- Pass the vectors in the lda model using `transform`
- Use the code to print the predictions the LDA model

In [0]:
example = ["My team performed poorly last season. Their best player was out injured and only played one game"]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

topic 0 : 0.8506869761179621
topic 1 : 0.14931302388203793


## Tag the texts

In [0]:
def tag_topics(text):
    text = [" ".join(text)]
    vectorized = vectorizer.transform(text)
    lda_vectors = lda_model.transform(vectorized)
    sport = lda_vectors[0][0]
    religion = lda_vectors[0][1]
    if sport > religion:
        return "sport"
    else:
        return "religion"

In [0]:
tag_topics(example)

'sport'

In [0]:
data['tag'] = data.text.apply(tag_topics)

data.tail()

Unnamed: 0,text,tag
1194,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...,religion
1195,From: golchowy@alchemy.chem.utoronto.ca (Geral...,religion
1196,From: jayne@mmalt.guild.org (Jayne Kulikauskas...,religion
1197,From: sclark@epas.utoronto.ca (Susan Clark)\nS...,religion
1198,From: lmvec@westminster.ac.uk (William Hargrea...,religion


In [0]:
print(data.text.iloc[1194])

From: jerryb@eskimo.com (Jerry Kaufman)
Subject: Re: prayers and advice requested on family problem
Organization: -> ESKIMO NORTH (206) For-Ever <-
Lines: 11

Cloak yourself in God's sustaining and abiding love. Pray, pray, pray.
Pray for your brother, that he will assume the Godly role that is his.
Pray for your sister-in-law, the what ever is driving her to separate
your brother and herself from the the rest of the family will be healed.
Pray for God to give you the peace in the knowledge that you may not be
able to 'fix' it. From your description it would appear that it will
require devine intervention, and the realization by your brother as to
what his responsibilities are. Seek Godly counsel from your pastor, or
other spiritually mature believer. Know always that He is akways there
as a conforter, and will give you wisdon and direction as you call on
Him.

