## LSA and LDA to news group

In [1]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
import pandas as pd
import numpy as np



In [2]:
with open('20ng-train-all-terms.txt') as f:
    train = f.read()
with open('20ng-test-all-terms.txt') as f:
    test = f.read()

train = train[:-1]
test = test[:-1]
topic_train = []
topic_test = []
news_train = []
news_test = []
for new in train.split('\n'):
    topic_train.append(new.split('\t')[0])
    news_train.append(new.split('\t')[1])
print('number of training news: ' + str(len(news_train)))
for new in test.split('\n'):
    topic_test.append(new.split('\t')[0])
    news_test.append(new.split('\t')[1])
print('number of test news: ' + str(len(news_test)))

number of training news: 11293
number of test news: 7528


In [3]:
# train

In [4]:
y_test = np.array(topic_test)
X_test = np.array(news_test)

In [5]:
print('number of news: ' + str(len(news_train)))
print('number of topics: ' + str(len(topic_test)))

number of news: 11293
number of topics: 7528


In [6]:
# use CountVectorizer to find three letter tokens, remove stop_words,
# remove tokens that don't appear in at least 20 documents,
# remove tokens that appear in more than 20% of the documents
vect = CountVectorizer(min_df = 20, max_df = 0.2, stop_words = 'english', token_pattern = '(?u)\\b\\w\\w\\w+\\b')
X = vect.fit_transform(news_train)

# Convert sparse matrix to gensim corpus
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns = False)
id_map = dict((v, k) for k, v in vect.vocabulary_.items())


In [7]:
%%time
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 6, id2word = id_map, passes = 25, random_state = 0)

Wall time: 2min 23s


In [8]:
ldamodel.print_topics(6)

[(0,
  '0.008*"year" + 0.007*"car" + 0.007*"team" + 0.006*"game" + 0.005*"new" + 0.004*"play" + 0.004*"games" + 0.004*"better" + 0.004*"got" + 0.004*"hockey"'),
 (1,
  '0.009*"use" + 0.008*"drive" + 0.008*"windows" + 0.007*"thanks" + 0.007*"problem" + 0.006*"need" + 0.006*"card" + 0.005*"work" + 0.005*"bit" + 0.005*"used"'),
 (2,
  '0.019*"turkish" + 0.017*"armenian" + 0.015*"armenians" + 0.010*"armenia" + 0.009*"turkey" + 0.009*"said" + 0.009*"turks" + 0.007*"went" + 0.006*"soviet" + 0.006*"greek"'),
 (3,
  '0.007*"government" + 0.006*"use" + 0.006*"key" + 0.004*"make" + 0.004*"law" + 0.004*"public" + 0.004*"used" + 0.003*"encryption" + 0.003*"say" + 0.003*"right"'),
 (4,
  '0.012*"space" + 0.010*"file" + 0.009*"program" + 0.007*"information" + 0.007*"nasa" + 0.006*"available" + 0.005*"mail" + 0.005*"data" + 0.005*"use" + 0.005*"list"'),
 (5,
  '0.008*"god" + 0.005*"did" + 0.004*"said" + 0.004*"say" + 0.004*"israel" + 0.004*"believe" + 0.004*"jesus" + 0.004*"way" + 0.004*"jews" + 0.00

In [9]:
X_vect = vect.transform(news_test)
corpus = gensim.matutils.Sparse2Corpus(X_vect, documents_columns = False)
topic_tuple_list = list(ldamodel[corpus])

In [10]:
# len(topic_tuple_list)
y_predict = list()
for topic_tuple in topic_tuple_list:
    y_predict.append(sorted(topic_tuple, key = lambda x: x[1], reverse = True)[0][0])
y_predict = np.array(y_predict)
unique, counts = np.unique(y_predict, return_counts = True)
dict(zip(unique, counts))

{0: 1485, 1: 2067, 2: 71, 3: 902, 4: 1017, 5: 1986}

In [11]:
%%time
lsamodel = gensim.models.LsiModel(corpus, num_topics = 6, id2word = id_map)

Wall time: 550 ms


In [12]:
lsamodel.print_topics(6)

[(0,
  '0.681*"dos" + 0.279*"windows" + 0.221*"jpeg" + 0.150*"image" + 0.136*"software" + 0.136*"graphics" + 0.135*"microsoft" + 0.132*"file" + 0.120*"version" + 0.108*"color"'),
 (1,
  '-0.481*"dos" + 0.476*"jpeg" + 0.290*"image" + 0.212*"file" + 0.195*"gif" + 0.145*"images" + -0.141*"windows" + 0.131*"format" + 0.114*"files" + 0.110*"color"'),
 (2,
  '0.391*"god" + 0.367*"jehovah" + 0.293*"lord" + -0.242*"jpeg" + 0.180*"christ" + 0.178*"said" + 0.141*"father" + 0.134*"jesus" + 0.103*"say" + 0.102*"did"'),
 (3,
  '0.405*"jpeg" + 0.304*"jehovah" + 0.238*"god" + 0.230*"lord" + -0.154*"data" + 0.152*"gif" + -0.149*"ftp" + -0.146*"graphics" + 0.133*"dos" + 0.128*"christ"'),
 (4,
  '-0.240*"jehovah" + -0.228*"graphics" + 0.201*"president" + -0.174*"pub" + 0.174*"jpeg" + -0.170*"lord" + -0.163*"ftp" + 0.146*"said" + -0.136*"data" + 0.130*"going"'),
 (5,
  '0.291*"planet" + 0.284*"earth" + 0.225*"spacecraft" + 0.219*"solar" + 0.215*"venus" + -0.205*"openwindows" + 0.182*"surface" + 0.143*"at

In [13]:
X_vect = vect.transform(news_test)
corpus = gensim.matutils.Sparse2Corpus(X_vect, documents_columns = False)
topic_tuple_list = list(lsamodel[corpus])

In [14]:
y_predict = list()
for topic_tuple in topic_tuple_list:
    y_predict.append(sorted(topic_tuple, key = lambda x: x[1], reverse = True)[0])
y_predict
# y_predict = np.array(y_predict)
# unique, counts = np.unique(y_predict, return_counts = True)
# dict(zip(unique, counts))

IndexError: list index out of range