In [0]:
import pandas as pd

# Read the corpus from the local file 'data' as a comma-separated table
# without a header row.
data = pd.read_csv('data', sep=",", header=None)

# Name the column 'text'. Assigning a one-element list raises unless the
# file was parsed into exactly one column.
data.columns = ['text']

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...


In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts: lowercase the text, drop English stop words, and keep
# only terms found in at least 5% and at most 65% of the documents.
vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=0.05, max_df=0.65)

data_vectorized = vectorizer.fit_transform(data['text'])

# Seed the model: the fit is randomly initialised, so without random_state the
# topic order can differ between runs, and tag_topics below hard-codes which
# topic index means what. Re-check the printed topics if the seed is changed.
lda_model = LatentDirichletAllocation(n_components=2, random_state=0)

# Document-topic matrix for the whole corpus (one row per document).
lda_vectors = lda_model.fit_transform(data_vectorized)

def print_topics(model, vectorizer, n_words=10):
    """Print the highest-weighted vocabulary terms of every topic in ``model``.

    Args:
        model: Fitted topic model exposing ``components_``, iterated as one
            row of per-term weights for each topic.
        vectorizer: Fitted vectorizer whose feature names are indexed the
            same way as the columns of ``model.components_``.
        n_words: Number of top terms to print per topic (default 10).

    For each topic this prints a ``Topic <idx>:`` line followed by a list of
    ``(term, weight)`` tuples ordered by descending weight.
    """
    # Look the vocabulary up once rather than rebuilding it for every printed
    # term. get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(), so prefer the new name and fall back to the old.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; reverse it and keep the first n_words indices.
        top_indices = topic.argsort()[::-1][:n_words]
        print([(feature_names[i], topic[i]) for i in top_indices])
        

# Show the top-weighted terms of each fitted topic to see what the topics are about.
print_topics(lda_model, vectorizer)

Topic 0:
[('god', 1499.8807100618449), ('people', 635.5029826856959), ('jesus', 623.4651703673374), ('10', 470.9634767031545), ('believe', 448.65167790320146), ('church', 432.729569338021), ('christians', 427.3020382540818), ('does', 421.3128536264205), ('think', 416.51941115098043), ('25', 392.53408888939964)]
Topic 1:
[('ca', 764.2899899128447), ('team', 705.2971382983733), ('hockey', 647.7372537861646), ('game', 609.4414080944247), ('university', 547.052160219599), ('writes', 498.59599431553323), ('play', 468.2162241674823), ('article', 420.59630960262314), ('nhl', 417.96991018248985), ('season', 396.23661911743676)]


In [0]:
# Score a single hand-written sentence against the fitted topics.
example = ["god plays sports at university"]

example_vectorized = vectorizer.transform(example)

# Keep the example's topic mix under its own name so the corpus-wide
# lda_vectors from the fitting cell is not overwritten.
example_topics = lda_model.transform(example_vectorized)

print("topic 0 :", example_topics[0][0])
print("topic 1 :", example_topics[0][1])

topic 0 : 0.23055395815969196
topic 1 : 0.769446041840308


In [0]:
# Re-display the first rows of the corpus before the 'tag' column is added.
data.head()

Unnamed: 0,text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...


In [0]:
def tag_topics(text):
    """Label one document with its dominant LDA topic.

    Uses the notebook-level ``vectorizer`` and ``lda_model``.

    The index-to-label mapping is hard-coded from the printed topic terms:
    topic 0 is the religion topic ('god', 'jesus', 'church', ...) and topic 1
    is the sport topic ('team', 'hockey', 'game', ...). Re-check the mapping
    whenever the model is refitted, because the topic order depends on the
    random initialisation of the fit.

    Args:
        text: Raw document text.

    Returns:
        "sport" if topic 1 has strictly more weight than topic 0,
        otherwise "religion" (ties included).
    """
    vectorized = vectorizer.transform([text])
    topic_weights = lda_model.transform(vectorized)[0]
    # Name each weight after the topic it actually belongs to.
    religion = topic_weights[0]
    sport = topic_weights[1]
    if sport > religion:
        return "sport"
    else:
        return "religion"

# Tag every document with its dominant topic, one tag_topics call per row.
data['tag'] = data['text'].map(tag_topics)

In [0]:
# Preview the frame again, now including the 'tag' column assigned above.
data.head()

Unnamed: 0,text,tag
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...,sport
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...,religion
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...,religion
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...,religion
4,From: vzhivov@superior.carleton.ca (Vladimir Z...,sport
