# LDA as Baseline Model

__Classes:__
1. info
2. fact
3. opinion
4. exp
5. other

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from pprint import pprint

In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# Seed NumPy's global RNG so the randomised steps below are repeatable
# (presumably gensim's LDA initialisation draws from it — confirm).
np.random.seed(2019)

In [9]:
import nltk
# Fetch the WordNet corpus; the WordNetLemmatizer used during preprocessing
# needs it to be available locally.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Load in Parsed Training Data

In [3]:
# Load the parsed training set; later cells read its `sentence` column.
data_train = pd.read_csv('data/review_sentence_parsed_train.csv')

## Data Preprocessing

#### Define a few helper functions

In [5]:
# Build the stemmer and the lemmatizer once so every call shares them instead
# of re-creating a WordNetLemmatizer for each token.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    Args:
        text: One token (string) to normalise.

    Returns:
        The English Snowball stem of the token's WordNet verb lemma.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Turn raw text into a list of normalised, non-stopword tokens.

    The text is tokenized with gensim's ``simple_preprocess``; tokens found in
    gensim's ``STOPWORDS`` are dropped and each remaining token is passed
    through ``lemmatize_stemming``.

    Args:
        text: The raw document (string) to process.

    Returns:
        A list of processed tokens, in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    # The former len(token) >= 3 constraint was removed, so no extra length
    # filter is applied here. NOTE: simple_preprocess still applies its own
    # default token-length limits (see the gensim docs).
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if tok not in stopwords
    ]

In [6]:
# Use the first training sentence as a sample for eyeballing the preprocessing.
doc_sample = data_train['sentence'].iloc[0]

In [10]:
# Show the sample sentence before and after preprocessing.
print('original document: ')
# str.split already returns a list, so no element-by-element copy is needed.
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['There', 'are', 'a', 'number', 'of', 'crossings', 'where', 'pedestrians', 'do', 'not', 'cross', 'at-grade,', 'and', 'have', 'to', 'use', 'bridges', 'and', 'skywalks.']


 tokenized and lemmatized document: 
['number', 'cross', 'pedestrian', 'cross', 'grade', 'use', 'bridg', 'skywalk']


In [11]:
# Run the preprocessing pipeline over every training sentence, producing one
# token list per sentence.
processed_docs = data_train['sentence'].map(preprocess)

#### Build a dictionary

In [12]:
# Build the token <-> integer-id vocabulary from all preprocessed documents.
dictionary = gensim.corpora.Dictionary(processed_docs)

__Filter out extremes__

In [13]:
# Prune the vocabulary in place (threshold meanings per the gensim
# Dictionary.filter_extremes documentation):
#   no_below=15   -> drop tokens that appear in fewer than 15 documents
#   no_above=0.5  -> drop tokens that appear in more than 50% of the documents
#   keep_n=100000 -> afterwards keep only the 100000 most frequent tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000) # Might need slight tweaking

## Modeling using BagOfWords

In [14]:
# Convert every tokenized document into its bag-of-words representation using
# the filtered dictionary.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

#### Train an LDA model using the bag-of-words (BoW) corpus

In [15]:
# Fit LDA on the bag-of-words corpus: 5 topics (presumably one per class listed
# at the top of this notebook — confirm), 2 passes, 2 workers.
lda_model_bow = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    num_topics=5,
    id2word=dictionary,
    passes=2,
    workers=2,
)

In [16]:
# Print every topic of the BoW model together with its weighted top words.
topic_template = 'Topic: {} \nWords: {}'
for idx, topic in lda_model_bow.print_topics(num_topics=-1):
    print(topic_template.format(idx, topic))

Topic: 0 
Words: 0.035*"ticket" + 0.028*"line" + 0.026*"tour" + 0.020*"time" + 0.019*"wait" + 0.018*"queue" + 0.016*"pass" + 0.015*"worth" + 0.015*"buy" + 0.014*"pay"
Topic: 1 
Words: 0.033*"peopl" + 0.018*"walk" + 0.010*"photo" + 0.009*"lot" + 0.009*"take" + 0.008*"pictur" + 0.008*"park" + 0.008*"like" + 0.008*"crowd" + 0.007*"tourist"
Topic: 2 
Words: 0.027*"ride" + 0.022*"park" + 0.022*"time" + 0.019*"rid" + 0.015*"experi" + 0.015*"year" + 0.014*"spend" + 0.014*"hour" + 0.013*"day" + 0.013*"like"
Topic: 3 
Words: 0.021*"view" + 0.017*"place" + 0.016*"museum" + 0.014*"visit" + 0.014*"build" + 0.014*"great" + 0.013*"beauti" + 0.013*"amaz" + 0.011*"shop" + 0.011*"food"
Topic: 4 
Words: 0.029*"visit" + 0.026*"day" + 0.021*"go" + 0.017*"time" + 0.014*"park" + 0.013*"recommend" + 0.013*"open" + 0.012*"night" + 0.012*"walk" + 0.011*"crowd"


In [19]:
# Persist the trained BoW LDA model to disk so it can be reloaded later.
# NOTE(review): presumably the 'model/lda_bow/' directory must already exist — confirm.
lda_model_bow.save('model/lda_bow/lda_bow')

## Modeling using TF-IDF