## LSA and LDA to news group

In [1]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
import pandas as pd
import numpy as np



In [2]:
with open('20ng-train-all-terms.txt') as f:
    train = f.read()
with open('20ng-test-all-terms.txt') as f:
    test = f.read()

train = train[:-1]
test = test[:-1]
topic_train = []
topic_test = []
news_train = []
news_test = []
for new in train.split('\n'):
    topic_train.append(new.split('\t')[0])
    news_train.append(new.split('\t')[1])
print('number of training news: ' + str(len(news_train)))
for new in test.split('\n'):
    topic_test.append(new.split('\t')[0])
    news_test.append(new.split('\t')[1])
print('number of test news: ' + str(len(news_test)))

number of training news: 11293
number of test news: 7528


In [3]:
# train

In [4]:
y_test = np.array(topic_test)
X_test = np.array(news_test)

In [5]:
print('number of news: ' + str(len(news_train)))
print('number of topics: ' + str(len(topic_test)))

number of news: 11293
number of topics: 7528


In [6]:
# use CountVectorizer to find three letter tokens, remove stop_words,
# remove tokens that don't appear in at least 20 documents,
# remove tokens that appear in more than 20% of the documents
# vect = CountVectorizer(min_df = 20, max_df = 0.2, stop_words = 'english', token_pattern = '(?u)\\b\\w\\w\\w+\\b', 
#                        ngram_range = (1, 2))
vect = CountVectorizer(min_df = 20, max_df = 0.2, stop_words = 'english', token_pattern = '(?u)\\b\\w\\w\\w+\\b')
X = vect.fit_transform(news_train)

# Convert sparse matrix to gensim corpus
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns = False)
id_map = dict((v, k) for k, v in vect.vocabulary_.items())


In [7]:
%%time
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 20, id2word = id_map, passes = 25, random_state = 0)
# ldamodel = gensim.models.LdaMulticore(corpus, num_topics = 6, id2word = id_map, passes = 25, random_state = 0)

Wall time: 2min 25s


In [8]:
ldamodel.print_topics(20)

[(0,
  '0.028*"apple" + 0.027*"sandvik" + 0.024*"kent" + 0.014*"newton" + 0.013*"engine" + 0.013*"gary" + 0.013*"saturn" + 0.011*"tank" + 0.011*"gld" + 0.010*"picture"'),
 (1,
  '0.029*"drive" + 0.023*"card" + 0.018*"scsi" + 0.014*"mac" + 0.012*"disk" + 0.012*"video" + 0.012*"bit" + 0.011*"speed" + 0.011*"drivers" + 0.010*"hard"'),
 (2,
  '0.099*"israel" + 0.033*"arab" + 0.032*"jewish" + 0.026*"land" + 0.019*"arabs" + 0.016*"lebanese" + 0.016*"lebanon" + 0.014*"israelis" + 0.013*"palestine" + 0.013*"center"'),
 (3,
  '0.019*"russian" + 0.019*"hockey" + 0.017*"season" + 0.014*"players" + 0.013*"team" + 0.012*"league" + 0.011*"nhl" + 0.011*"los" + 0.011*"division" + 0.010*"angeles"'),
 (4,
  '0.018*"information" + 0.010*"public" + 0.009*"security" + 0.008*"use" + 0.008*"mail" + 0.008*"list" + 0.008*"new" + 0.008*"data" + 0.008*"technology" + 0.007*"available"'),
 (5,
  '0.009*"uiuc" + 0.008*"news" + 0.007*"really" + 0.006*"cso" + 0.006*"baseball" + 0.006*"mike" + 0.006*"day" + 0.006*"fra

In [9]:
X_vect = vect.transform(news_test)
corpus = gensim.matutils.Sparse2Corpus(X_vect, documents_columns = False)
topic_tuple_list = list(ldamodel[corpus])

In [10]:
# len(topic_tuple_list)
y_predict = list()
for topic_tuple in topic_tuple_list:
    y_predict.append(sorted(topic_tuple, key = lambda x: x[1], reverse = True)[0][0])
y_predict = np.array(y_predict)
unique, counts = np.unique(y_predict, return_counts = True)
dict(zip(unique, counts))

{0: 35,
 1: 508,
 2: 32,
 3: 111,
 4: 265,
 5: 559,
 6: 440,
 7: 256,
 8: 171,
 9: 58,
 10: 742,
 11: 132,
 12: 879,
 13: 97,
 14: 114,
 15: 1499,
 16: 577,
 17: 30,
 18: 197,
 19: 826}

In [11]:
%%time
lsamodel = gensim.models.LsiModel(corpus, num_topics = 20, id2word = id_map)

Wall time: 601 ms


In [12]:
lsamodel.print_topics(20)

[(0,
  '-0.681*"dos" + -0.279*"windows" + -0.221*"jpeg" + -0.150*"image" + -0.136*"software" + -0.136*"graphics" + -0.135*"microsoft" + -0.132*"file" + -0.120*"version" + -0.108*"color"'),
 (1,
  '0.481*"dos" + -0.476*"jpeg" + -0.290*"image" + -0.212*"file" + -0.195*"gif" + -0.145*"images" + 0.141*"windows" + -0.131*"format" + -0.114*"files" + -0.110*"color"'),
 (2,
  '-0.391*"god" + -0.367*"jehovah" + -0.293*"lord" + 0.242*"jpeg" + -0.180*"christ" + -0.178*"said" + -0.141*"father" + -0.134*"jesus" + -0.103*"say" + -0.102*"did"'),
 (3,
  '-0.405*"jpeg" + -0.304*"jehovah" + -0.238*"god" + -0.230*"lord" + 0.154*"data" + -0.152*"gif" + 0.149*"ftp" + 0.146*"graphics" + -0.133*"dos" + -0.128*"christ"'),
 (4,
  '0.240*"jehovah" + 0.228*"graphics" + -0.201*"president" + 0.174*"pub" + -0.174*"jpeg" + 0.170*"lord" + 0.163*"ftp" + -0.146*"said" + 0.136*"data" + -0.130*"going"'),
 (5,
  '-0.291*"planet" + -0.284*"earth" + -0.225*"spacecraft" + -0.219*"solar" + -0.215*"venus" + 0.205*"openwindows"

In [13]:
X_vect = vect.transform(news_test)
corpus = gensim.matutils.Sparse2Corpus(X_vect, documents_columns = False)
topic_tuple_list = list(lsamodel[corpus])

In [14]:
y_predict = list()
for topic_tuple in topic_tuple_list:
    y_predict.append(sorted(topic_tuple, key = lambda x: x[1], reverse = True)[0])
y_predict
# y_predict = np.array(y_predict)
# unique, counts = np.unique(y_predict, return_counts = True)
# dict(zip(unique, counts))

IndexError: list index out of range