In [10]:
# -*- coding: utf-8 -*-
import numpy as np
import lda
import csv
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from gensim import corpora, models

from stemming.porter2 import stem
from nltk.stem import *
import unicodecsv
import re
import os
# import pyLDAvis.gensim
import gensim
import argparse

# Experimental: restrict tokens to known dictionary words.
# PyEnchant is imported for spell-checking purposes.
import enchant

In [11]:
class data:
    """Plain attribute container.

    Every keyword argument given to the constructor becomes an instance
    attribute with the same name; further attributes can be assigned later.
    """
    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)
# Run configuration, gathered on one attribute container.
args = data(
    filename="paultan.csv",    # CSV file looked up inside datasets/
    stemmer="lemma",           # 'porter', 'porter2' or 'lemma' (see process_tokens)
    dictionary="automotive",   # prefix of the "<name>-english" word list; "none" skips loading it
    num_iter=20,               # used as the number of passes when training the "lda" model
    num_top_words=8,           # words shown per topic
    num_topics=10,
    model="dtm",               # "lda" or "dtm"
)

# Working directories, all resolved against the current working directory.
dir = os.getcwd()
dataset_dir = os.path.join(dir, 'datasets/')
dictionary_dir = os.path.join(dir, 'dictionaries/')
executable_dir = os.path.join(dir, 'executables/')
model_dir = os.path.join(dir, 'models/')

# Time-slice argument handed to the DTM model.
# NOTE(review): presumably the number of documents in each slice (sums to 2846) -- confirm.
my_timeslices = [500] * 5 + [346]

In [12]:
# Pattern matching a single digit. Written as a raw string so that "\d" is
# passed to the regex engine verbatim instead of being an invalid string escape.
_digits = re.compile(r'\d')
def contains_digits(d):
    """Tell whether the string ``d`` contains at least one digit.

    :param d: string (token) to inspect
    :returns: True if a digit occurs anywhere in ``d``, otherwise False
    """
    return _digits.search(d) is not None


def get_dict_dir(s):
  """Return the path of the entry named ``s`` inside ``dictionary_dir``."""
  dict_path = os.path.join(dictionary_dir, s)
  return dict_path

def get_exec_dir(s):
  """Return the path of the entry named ``s`` inside ``executable_dir``."""
  exec_path = os.path.join(executable_dir, s)
  return exec_path
d = enchant.Dict("en_US")
# Or using the /usr/share/dict/british-english word list
if args.dictionary != "none":
  # Load the "<dictionary>-english" word list: one word per line, compared
  # case-insensitively.
  with open(get_dict_dir(args.dictionary + "-english")) as word_file:
    english_words = set(word.strip().lower() for word in word_file)
else:
  # No word list requested: None disables the vocabulary filter.
  english_words = None

def is_english_word(word):
  """Tell whether ``word`` is accepted by the configured word list.

  Defined unconditionally so that callers also work when
  ``args.dictionary == "none"``; in that case every word is accepted.

  :param word: token to check (compared in lower case)
  :returns: True if the word is in the loaded word list, or if no list is loaded
  """
  if english_words is None:
    return True
  return word.lower() in english_words

def process_tokens(tokens,stemmer):
  """Filter a tokenised document and normalise the tokens that remain.

  A token is dropped when it is a stop word (module-level ``en_stop``),
  contains a digit, or is rejected by ``is_english_word``. The remaining
  tokens are then stemmed or lemmatised according to ``stemmer``.

  :param tokens: iterable of token strings
  :param stemmer: 'porter' (NLTK PorterStemmer), 'porter2' (``stem`` from
      stemming.porter2) or 'lemma' (WordNetLemmatizer); any other value
      returns the filtered tokens unchanged
  :returns: list of processed tokens
  """
  # Build the stop-word set once per call so each membership test is a hash
  # lookup instead of a scan over the whole stop-word sequence.
  stop_words = frozenset(en_stop)
  tokens = [i for i in tokens
            if i not in stop_words and not contains_digits(i) and is_english_word(i)]
  if stemmer == 'porter':
    # Separate local name: the ``stemmer`` argument is no longer rebound.
    porter = PorterStemmer()
    tokens = [porter.stem(i) for i in tokens]
  elif stemmer == 'porter2':
    tokens = [stem(i) for i in tokens]
  elif stemmer == 'lemma':
    lemmatiser = WordNetLemmatizer()
    tokens = [lemmatiser.lemmatize(i) for i in tokens]
  return tokens


def get_model_with_arguments_filename():
  """Build the cache file name for the model described by ``args``.

  The name is the dataset file name up to its first '.', followed by the
  stemmer, iteration count, top-word count, topic count and model type,
  all joined with underscores.
  """
  dataset_stem = args.filename.split('.')[0]
  parts = [
    dataset_stem,
    args.stemmer,
    str(args.num_iter),
    str(args.num_top_words),
    str(args.num_topics),
    args.model,
  ]
  return "_".join(parts)



# Load the bundled Reuters sample, or -- for a real dataset -- try the cached
# model first and fall back to building the corpus from the CSV file.
if args.filename == "sample":
  X = lda.datasets.load_reuters()
  dictionary = lda.datasets.load_reuters_vocab()
  titles = lda.datasets.load_reuters_titles()
else:
  # X = np.zeros((len(contents), len(dictionary)), dtype=np.int)
  # for idx,i in enumerate(corpus):
  #   for j in i:
  #     X[idx][j[0]] = j[1]

  model_filename = os.path.join(model_dir, get_model_with_arguments_filename())
  print(model_filename)
  try:
    ldamodel = models.LdaModel.load(model_filename)
  except IOError:
    # No cached model: read the articles and build the bag-of-words corpus.
    # NOTE(review): texts/dictionary/corpus are only created on this path; the
    # later pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary) call needs
    # them even when the cached model loads successfully.
    dataset_filepath = os.path.join(dataset_dir, args.filename)
    # Binary mode is what unicodecsv expects, and the `with` block closes the
    # file, which the previous bare open() never did.
    with open(dataset_filepath, 'rb') as f:
      reader = unicodecsv.reader(f, encoding='utf-8')
      identifiers = next(reader)  # header row
      contents_idx = identifiers.index("contents")
      title_idx = identifiers.index("title")
      # Single pass over the file: keep only rows with non-empty contents.
      rows = [row for row in reader if row[contents_idx]]

    contents = [row[contents_idx] for row in rows]
    titles = [row[title_idx] for row in rows]

    texts = list()
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')  # read as a global by process_tokens
    for idx,i in enumerate(contents):
      if not idx % 10:
        # Progress message every 10th article.
        print("INFO: Tokenizing articles <{}> ".format(idx))
      raw = i.lower()
      tokens = tokenizer.tokenize(raw)
      texts.append(process_tokens(tokens, args.stemmer))

    print("[DEBUG] Length of Texts : {}".format(len(texts)))
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]


/Users/Hii/Projects/news_scraper/models/paultan_lemma_20_8_10_dtm


In [14]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
# Reuse the cached model when it exists; otherwise train the configured model
# type and cache it under model_filename.
try:
  ldamodel = models.LdaModel.load(model_filename)
except IOError:
  # Reject an unknown model type before doing any work.
  if args.model not in ("lda", "dtm"):
    raise ValueError('Unknown Model Type')
  # Make sure the target directory exists *before* training, so a long
  # training run is not lost to a failing save() at the very end.
  model_parent = os.path.dirname(model_filename)
  if model_parent and not os.path.isdir(model_parent):
    os.makedirs(model_parent)
  if args.model == "lda":
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=int(args.num_topics), id2word = dictionary, passes=int(args.num_iter))
  else:
    # "dtm": runs the external dtm-darwin64 binary from executables/.
    ldamodel = gensim.models.wrappers.DtmModel(get_exec_dir('dtm-darwin64'), corpus, my_timeslices, num_topics=int(args.num_topics), id2word=dictionary,initialize_lda=True)
  ldamodel.save(model_filename)


# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=20)
# pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)



In [15]:
  for idx, topic in enumerate(ldamodel.show_topics(topics=args.num_topics, topn=int(args.num_top_words),times=1, formatted=False)):
    print "Topic #" + str(idx) + " :",
    for word in topic:
      print word[1],
    print

Topic #0 : honda toyota nissan new type car year jaguar
Topic #1 : speed engine new feature wheel torque automatic control
Topic #2 : car vehicle driver kia hyundai feature safety new
Topic #3 : class highway new construction make bentley year turn
Topic #4 : bmw coupe new drive convertible car auto model
Topic #5 : ford mazda safety mitsubishi car fiat year mustang
Topic #6 : new engine fuel car sedan model design cylinder
Topic #7 : year new service increase suzuki price industry vehicle
Topic #8 : volkswagen audi porsche recall new scandal year model
Topic #9 : proton car racing race year ferrari formula design


In [13]:
# Build the pyLDAvis view from the model, the bag-of-words corpus and the
# dictionary produced by the earlier cells.
# NOTE(review): `corpus` is only assigned on the cache-miss path of the loading
# cell, and this cell's execution count (13) precedes the training cell (14),
# so it may not run on a fresh kernel -- confirm the intended order.
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)



  def _ipython_display_formatter_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _deferred_printers_default(self):
