## LSA and LDA to news group

In [1]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
import pandas as pd
import numpy as np



In [2]:
with open('20ng-train-all-terms.txt') as f:
    train = f.read()
with open('20ng-test-all-terms.txt') as f:
    test = f.read()

train = train[:-1]
test = test[:-1]
topic_train = []
topic_test = []
news_train = []
news_test = []
for new in train.split('\n'):
    topic_train.append(new.split('\t')[0])
    news_train.append(new.split('\t')[1])
print('number of training news: ' + str(len(news_train)))
for new in test.split('\n'):
    topic_test.append(new.split('\t')[0])
    news_test.append(new.split('\t')[1])
print('number of test news: ' + str(len(news_test)))

number of training news: 11293
number of test news: 7528


In [3]:
# train

In [4]:
y_test = np.array(topic_test)
X_test = np.array(news_test)

In [5]:
print('number of news: ' + str(len(news_train)))
print('number of topics: ' + str(len(topic_test)))

number of news: 11293
number of topics: 7528


In [6]:
# use CountVectorizer to find three letter tokens, remove stop_words,
# remove tokens that don't appear in at least 20 documents,
# remove tokens that appear in more than 20% of the documents
vect = TfidfVectorizer()
X = vect.fit_transform(news_train)

# Convert sparse matrix to gensim corpus
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns = False)
id_map = dict((v, k) for k, v in vect.vocabulary_.items())


In [7]:
%%time
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 20, id2word = id_map, passes = 25, random_state = 0)

Wall time: 2min 3s


In [8]:
ldamodel.print_topics(20)

[(0,
  '0.004*"team" + 0.003*"war" + 0.003*"sale" + 0.003*"population" + 0.003*"police" + 0.003*"study" + 0.002*"paul" + 0.002*"batf" + 0.002*"food" + 0.002*"rutgers"'),
 (1,
  '0.002*"francis" + 0.001*"amounts" + 0.001*"skin" + 0.001*"nyc" + 0.001*"gamma" + 0.001*"joslin" + 0.001*"dm" + 0.001*"reed" + 0.001*"protestant" + 0.001*"ot"'),
 (2,
  '0.003*"wings" + 0.002*"ha" + 0.002*"rider" + 0.002*"fpu" + 0.002*"pat" + 0.001*"doctor" + 0.001*"baalke" + 0.001*"rf" + 0.001*"dennis" + 0.001*"train"'),
 (3,
  '0.003*"jobs" + 0.002*"sad" + 0.002*"harvard" + 0.002*"wright" + 0.002*"wife" + 0.002*"massacre" + 0.002*"broadcast" + 0.001*"cultural" + 0.001*"husc" + 0.001*"signed"'),
 (4,
  '0.005*"fbi" + 0.003*"pitt" + 0.002*"relations" + 0.002*"uga" + 0.002*"udel" + 0.002*"zoo" + 0.002*"intelligence" + 0.002*"training" + 0.002*"staff" + 0.002*"authorities"'),
 (5,
  '0.003*"jake" + 0.003*"pat" + 0.002*"gld" + 0.002*"gary" + 0.002*"assault" + 0.002*"officer" + 0.002*"bomb" + 0.002*"mars" + 0.001*"v

In [9]:
X_vect = vect.transform(news_test)
corpus = gensim.matutils.Sparse2Corpus(X_vect, documents_columns = False)
topic_tuple_list = list(ldamodel[corpus])

In [10]:
# len(topic_tuple_list)
y_predict = list()
for topic_tuple in topic_tuple_list:
    y_predict.append(sorted(topic_tuple, key = lambda x: x[1], reverse = True)[0][0])
y_predict = np.array(y_predict)
unique, counts = np.unique(y_predict, return_counts = True)
dict(zip(unique, counts))

{0: 6,
 2: 1,
 3: 1,
 4: 1,
 5: 2,
 7: 1,
 8: 7501,
 9: 4,
 10: 1,
 12: 3,
 13: 2,
 14: 2,
 15: 1,
 18: 1,
 19: 1}

In [11]:
%%time
lsamodel = gensim.models.LsiModel(corpus, num_topics = 20, id2word = id_map)

Wall time: 1.9 s


In [12]:
lsamodel.print_topics(20)

[(0,
  '0.500*"the" + 0.278*"to" + 0.241*"of" + 0.210*"and" + 0.186*"in" + 0.182*"that" + 0.175*"is" + 0.155*"it" + 0.147*"you" + 0.114*"for"'),
 (1,
  '0.338*"the" + -0.191*"windows" + 0.166*"of" + -0.155*"for" + -0.139*"you" + -0.134*"it" + 0.133*"he" + 0.129*"was" + -0.121*"drive" + -0.117*"dos"'),
 (2,
  '-0.477*"the" + 0.392*"you" + 0.205*"god" + 0.185*"that" + 0.134*"your" + 0.122*"not" + 0.114*"to" + 0.102*"do" + -0.098*"drive" + 0.096*"is"'),
 (3,
  '-0.327*"he" + -0.270*"game" + 0.265*"of" + -0.168*"was" + -0.144*"games" + -0.142*"edu" + -0.134*"espn" + -0.117*"his" + -0.110*"team" + 0.105*"is"'),
 (4,
  '-0.351*"he" + -0.274*"god" + 0.265*"you" + -0.176*"his" + 0.170*"com" + 0.166*"they" + 0.154*"edu" + 0.152*"fbi" + -0.143*"is" + 0.124*"writes"'),
 (5,
  '-0.260*"edu" + 0.244*"drive" + -0.242*"of" + -0.232*"in" + 0.194*"scsi" + 0.150*"to" + 0.146*"ide" + 0.144*"they" + -0.130*"for" + 0.124*"that"'),
 (6,
  '0.222*"to" + -0.202*"drive" + 0.201*"game" + -0.198*"he" + -0.182*"s

In [13]:
X_vect = vect.transform(news_test)
corpus = gensim.matutils.Sparse2Corpus(X_vect, documents_columns = False)
topic_tuple_list = list(lsamodel[corpus])

In [14]:
y_predict = list()
for topic_tuple in topic_tuple_list:
    y_predict.append(sorted(topic_tuple, key = lambda x: x[1], reverse = True)[0])
y_predict
# y_predict = np.array(y_predict)
# unique, counts = np.unique(y_predict, return_counts = True)
# dict(zip(unique, counts))

[(0, 0.3013383299441404),
 (0, 0.4156816565943711),
 (0, 0.33209962953396993),
 (0, 0.40469455734883863),
 (0, 0.34278516628100386),
 (0, 0.37493965811096563),
 (0, 0.3451997675108307),
 (0, 0.11719784713624204),
 (0, 0.1392352942451515),
 (0, 0.14476041980599544),
 (0, 0.3137006812439913),
 (0, 0.4169006234674813),
 (0, 0.36680227479854455),
 (0, 0.21074635610931408),
 (0, 0.1811683488957189),
 (0, 0.2577693744619761),
 (0, 0.2700028081677375),
 (0, 0.2542839573962169),
 (0, 0.2792260145110377),
 (0, 0.15186007516552363),
 (0, 0.331346352502283),
 (0, 0.3705591650172031),
 (0, 0.06433765507569486),
 (0, 0.44688894518363914),
 (0, 0.5250827924487756),
 (0, 0.4237899082660869),
 (0, 0.3989154445026732),
 (0, 0.3891264190457616),
 (0, 0.5097259020566497),
 (0, 0.3041717748871301),
 (0, 0.2689123873896978),
 (0, 0.5126715584503092),
 (0, 0.29994200715544356),
 (0, 0.18063378960178636),
 (0, 0.13241546583346736),
 (0, 0.30582760007438825),
 (0, 0.18209034903445107),
 (0, 0.4977940249790519