In [1]:
from gensim import corpora, models, similarities
from collections import defaultdict
import re
import pandas as pd
import logging

# Configure the root logger to show timestamped records at INFO level and above,
# so progress messages emitted through `logging` by the libraries used below
# (presumably gensim's training/serialization output) appear in the notebook.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load the preprocessed documents. The cells below expect a DataFrame whose
# 'text' column holds one iterable of tokens per document.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it was produced by a trusted preprocessing step.
documents = pd.read_pickle('gensim/preprocessed_text.pkl')

In [3]:
# Drop tokens that occur exactly once across the whole collection.
# First pass: tally total occurrences of every token over all documents.
frequency = defaultdict(int)
for doc_tokens in documents['text']:
    for tok in doc_tokens:
        frequency[tok] += 1

# Second pass: rebuild each document keeping only tokens seen more than once,
# then write the pruned token lists back into the 'text' column.
pruned_texts = []
for doc_tokens in documents['text']:
    pruned_texts.append([tok for tok in doc_tokens if frequency[tok] > 1])
documents['text'] = pruned_texts

In [4]:
# Some web-address tokens survived preprocessing, so drop every token that
# contains 'http' or 'www'.
#
# The cleaned column is computed in a single pass and attached to a copy of
# `documents` (same index, same columns; `documents` itself is left untouched).
# This replaces growing the frame row by row with `DataFrame.append` inside
# `iterrows`, which re-copies the whole frame on every iteration (quadratic)
# and relies on a method that was removed in pandas 2.0.
# Plain substring tests are used because both patterns are literal strings.
documents_no_web = documents.copy()
documents_no_web['text'] = [
    [word for word in text if 'http' not in word and 'www' not in word]
    for text in documents['text']
]

In [5]:
# Persist the re-cleaned documents (web-address tokens removed) as a new
# pickle, leaving the original preprocessed file on disk unchanged.
documents_no_web.to_pickle('gensim/preprocessed_text_v2.pkl')

In [6]:
# Create the gensim Dictionary (the vocabulary used by doc2bow further down)
# from the cleaned token lists.
dictionary = corpora.Dictionary(documents_no_web['text'])
# Function-call form: prints the same summary on Python 2 and is also valid
# on Python 3, where the bare `print x` statement is a SyntaxError.
print(dictionary)

Dictionary(174016 unique tokens: [u'fawn', u'sticman', u'denber', u'lowkramen', u'fawk']...)


In [7]:
# Prune the vocabulary in place: drop tokens that appear in fewer than 5
# documents or in more than 50% of documents, then keep at most the 100000
# most frequent of the remaining tokens.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
# Function-call form: prints the same summary on Python 2 and is also valid
# on Python 3, where the bare `print x` statement is a SyntaxError.
print(dictionary)

Dictionary(51857 unique tokens: [u'fawn', u'fawl', u'circuitri', u'fawk', u'damfunk']...)


In [8]:
# Save the filtered dictionary to disk so it can be reloaded later without
# rebuilding it from the documents.
dictionary.save('gensim/dictionary.dict')

In [9]:
# Convert each document's token list to its bag-of-words representation
# using the filtered dictionary, collecting the results in document order.
corpus = []
for doc_tokens in documents_no_web['text']:
    corpus.append(dictionary.doc2bow(doc_tokens))

# Write the bag-of-words corpus to disk in Matrix Market format.
corpora.MmCorpus.serialize('gensim/corpus.mm', corpus)

In [11]:
# Fit a tf-idf model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)
# Apply the fitted model to the same corpus to obtain its tf-idf weighted form.
corpus_tfidf = tfidf[corpus]
# Write the tf-idf corpus to disk in Matrix Market format.
corpora.MmCorpus.serialize('gensim/corpus_tfidf.mm', corpus_tfidf)

In [12]:
# Train a 100-topic LSI model on the tf-idf corpus; id2word=dictionary gives the
# model the id -> token mapping. `%time` is an IPython magic that reports how
# long the statement took (see the captured output below).
%time lsi = models.lsimodel.LsiModel(corpus_tfidf, num_topics = 100, id2word=dictionary)
# Save the trained model to disk.
lsi.save('gensim/model.lsi')

CPU times: user 24.8 s, sys: 1.1 s, total: 25.9 s
Wall time: 20.6 s


In [13]:
# Build the similarity index over every document projected into LSI space.
# num_features is passed explicitly (the LSI vectors have at most
# lsi.num_topics dimensions): without it gensim has to scan -- and therefore
# re-project -- the whole corpus an extra time just to infer the vector width,
# and logs a warning asking for it to be set.
# NOTE(review): MatrixSimilarity keeps the full index in memory; if the corpus
# outgrows RAM, consider similarities.Similarity instead.
index = similarities.MatrixSimilarity(lsi[corpus_tfidf], num_features=lsi.num_topics)
# Save the index to disk.
index.save('gensim/tfidf_lsi_similarities.index')

