In [15]:
import pandas as pd
import numpy as np

#gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.stem import wordnet
import nltk
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [16]:
trails_df = pd.read_pickle('../data/mtb_trails_df_2')
trails_df_with_summary = trails_df[trails_df['no_summary']==0]
X = trails_df_with_summary['summary']
documents = X

In [17]:
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords

nltk_stopwords = set(stopwords.words('english'))
gensim_stopwords = STOPWORDS
my_stopwords = set(['singletrack', 'loop', 'trail', 'trails',  'ride', \
                    'area', 'route', 'way', 'feature', 'section', 'riding'\
                    'north', 'south', 'east', 'west', '-PRON-', 'pron', 'nee', 'regard', 'shall', 'use', 'win'])
all_stopwords = my_stopwords.union(nltk_stopwords.union(gensim_stopwords))

In [18]:
#wordnet = WordNetLemmatizer()
# def get_wordnet_pos(word):
#     tag = nltk.pos_tag([word])[0][1][0].upper()
#     tag_dict = {"J": wordnet.ADJ,
#                 "N": wordnet.NOUN,
#                 "V": wordnet.VERB,
#                 "R": wordnet.ADV}

#     return tag_dict.get(tag, wordnet.NOUN)


def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in all_stopwords and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [19]:
#stemmer = SnowballStemmer('english')
processed_docs = documents.map(preprocess)

In [20]:
processed_docs[:10]

0    [excellent, flow, downhill, exceptional, gravi...
1      [amaze, journey, time, scene, helens, eruption]
2                               [mile, travel, finest]
3     [good, introductory, lower, post, canyon, offer]
4    [road, climb, aggressive, cross, country, hear...
5    [shuttle, assist, iconic, timberline, lodge, d...
6    [iconic, rid, tough, rugged, surveyor, ridge, ...
7                  [great, clear, water, lewis, river]
8                 [hagg, lake, close, portland, miles]
9    [start, buxton, trailhead, bank, vernonia, stu...
Name: summary, dtype: object

In [21]:
#create dictionary
id2word = gensim.corpora.Dictionary(processed_docs)

#create corpus
texts = processed_docs

#Term Document Frequency
bow_corpus = [id2word.doc2bow(text) for text in texts]

In [22]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=3, id2word=id2word, passes=2, workers=2)

In [23]:
from pprint import pprint

# Print the Keyword in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[bow_corpus]

[(0,
  '0.031*"climb" + 0.016*"great" + 0.015*"steep" + 0.013*"short" + '
  '0.012*"challenge" + 0.011*"descent" + 0.011*"flow" + 0.010*"good" + '
  '0.010*"fast" + 0.009*"technical"'),
 (1,
  '0.029*"road" + 0.017*"great" + 0.016*"park" + 0.015*"short" + 0.014*"creek" '
  '+ 0.014*"climb" + 0.013*"view" + 0.012*"downhill" + 0.011*"rock" + '
  '0.010*"fast"'),
 (2,
  '0.016*"technical" + 0.016*"climb" + 0.014*"park" + 0.014*"rid" + '
  '0.012*"descent" + 0.012*"great" + 0.011*"view" + 0.011*"flow" + '
  '0.010*"fast" + 0.010*"creek"')]


In [24]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.168981922702434

Coherence Score:  0.20636234272716847


In [25]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
