## LSA and LDA to news group

In [1]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
import pandas as pd
import numpy as np



In [2]:
with open('20ng-train-all-terms.txt') as f:
    train = f.read()
with open('20ng-test-all-terms.txt') as f:
    test = f.read()

train = train[:-1]
test = test[:-1]
topic_train = []
topic_test = []
news_train = []
news_test = []
for new in train.split('\n'):
    topic_train.append(new.split('\t')[0])
    news_train.append(new.split('\t')[1])
print('number of training news: ' + str(len(news_train)))
for new in test.split('\n'):
    topic_test.append(new.split('\t')[0])
    news_test.append(new.split('\t')[1])
print('number of test news: ' + str(len(news_test)))

number of training news: 11293
number of test news: 7528


In [3]:
# train

In [4]:
y_test = np.array(topic_test)
X_test = np.array(news_test)

In [5]:
print('number of news: ' + str(len(news_train)))
print('number of topics: ' + str(len(topic_test)))

number of news: 11293
number of topics: 7528


In [6]:
# use CountVectorizer to find three letter tokens, remove stop_words,
# remove tokens that don't appear in at least 20 documents,
# remove tokens that appear in more than 20% of the documents
vect = CountVectorizer(min_df = 20, max_df = 0.2, stop_words = 'english', token_pattern = '(?u)\\b\\w\\w\\w+\\b', 
                       ngram_range = (1, 2))
X = vect.fit_transform(news_train)

# Convert sparse matrix to gensim corpus
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns = False)
id_map = dict((v, k) for k, v in vect.vocabulary_.items())


In [7]:
%%time
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 20, id2word = id_map, passes = 25)
# ldamodel = gensim.models.LdaMulticore(corpus, num_topics = 6, id2word = id_map, passes = 25, random_state = 0)

Wall time: 2min 24s


In [8]:
ldamodel.print_topics(20)

[(0,
  '0.018*"government" + 0.013*"president" + 0.009*"states" + 0.009*"state" + 0.009*"american" + 0.007*"health" + 0.007*"rights" + 0.007*"clinton" + 0.006*"country" + 0.006*"anti"'),
 (1,
  '0.014*"state" + 0.012*"article apr" + 0.012*"pitt" + 0.011*"ohio" + 0.011*"john" + 0.010*"writes article" + 0.009*"amendment" + 0.009*"ohio state" + 0.009*"pitt edu" + 0.008*"org"'),
 (2,
  '0.027*"gun" + 0.014*"law" + 0.013*"crime" + 0.013*"police" + 0.012*"turks" + 0.011*"uiuc" + 0.010*"uiuc edu" + 0.010*"weapons" + 0.009*"tax" + 0.008*"court"'),
 (3,
  '0.015*"arms" + 0.015*"gay" + 0.013*"men" + 0.012*"objective" + 0.012*"frank" + 0.011*"virginia" + 0.011*"writes article" + 0.011*"sex" + 0.010*"morality" + 0.009*"sexual"'),
 (4,
  '0.023*"article apr" + 0.018*"colorado" + 0.018*"isc" + 0.015*"henrik" + 0.015*"boston" + 0.014*"henry" + 0.014*"colorado edu" + 0.014*"writes article" + 0.013*"toronto" + 0.012*"rochester"'),
 (5,
  '0.009*"believe" + 0.007*"evidence" + 0.007*"question" + 0.007*"t

In [9]:
X_vect = vect.transform(news_test)
corpus = gensim.matutils.Sparse2Corpus(X_vect, documents_columns = False)
topic_tuple_list = list(ldamodel[corpus])

In [10]:
# len(topic_tuple_list)
y_predict = list()
for topic_tuple in topic_tuple_list:
    y_predict.append(sorted(topic_tuple, key = lambda x: x[1], reverse = True)[0][0])
y_predict = np.array(y_predict)
unique, counts = np.unique(y_predict, return_counts = True)
dict(zip(unique, counts))

{0: 207,
 1: 92,
 2: 106,
 3: 113,
 4: 56,
 5: 467,
 6: 46,
 7: 1762,
 8: 693,
 9: 556,
 10: 74,
 11: 80,
 12: 209,
 13: 111,
 14: 285,
 15: 676,
 16: 372,
 17: 277,
 18: 153,
 19: 1193}

In [11]:
%%time
lsamodel = gensim.models.LsiModel(corpus, num_topics = 6, id2word = id_map)

Wall time: 636 ms


In [12]:
lsamodel.print_topics(6)

[(0,
  '0.685*"dos" + 0.279*"windows" + 0.205*"jpeg" + 0.140*"image" + 0.135*"microsoft" + 0.134*"software" + 0.133*"graphics" + 0.131*"microsoft windows" + 0.124*"file" + 0.117*"version"'),
 (1,
  '0.478*"jpeg" + -0.451*"dos" + 0.293*"image" + 0.215*"file" + 0.196*"gif" + 0.146*"images" + 0.132*"format" + -0.130*"windows" + 0.117*"files" + 0.112*"color"'),
 (2,
  '-0.387*"god" + -0.362*"jehovah" + -0.289*"lord" + 0.248*"jpeg" + -0.178*"christ" + -0.177*"said" + -0.139*"father" + -0.133*"jesus" + -0.103*"say" + -0.102*"did"'),
 (3,
  '-0.406*"jpeg" + -0.304*"jehovah" + -0.241*"god" + -0.231*"lord" + 0.153*"data" + -0.153*"gif" + 0.150*"ftp" + 0.149*"graphics" + -0.129*"christ" + 0.128*"pub"'),
 (4,
  '0.244*"jehovah" + 0.222*"graphics" + -0.203*"president" + 0.173*"lord" + 0.169*"pub" + -0.166*"jpeg" + 0.157*"ftp" + -0.143*"said" + 0.130*"data" + -0.130*"myers"'),
 (5,
  '-0.279*"planet" + -0.272*"earth" + 0.225*"openwindows" + -0.215*"spacecraft" + -0.210*"solar" + -0.207*"venus" + -0

In [13]:
X_vect = vect.transform(news_test)
corpus = gensim.matutils.Sparse2Corpus(X_vect, documents_columns = False)
topic_tuple_list = list(lsamodel[corpus])

In [14]:
y_predict = list()
for topic_tuple in topic_tuple_list:
    y_predict.append(sorted(topic_tuple, key = lambda x: x[1], reverse = True)[0])
y_predict
# y_predict = np.array(y_predict)
# unique, counts = np.unique(y_predict, return_counts = True)
# dict(zip(unique, counts))

IndexError: list index out of range