In [50]:
import pandas as pd
import numpy as np

#gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.stem import wordnet
import nltk
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [7]:
trails_df = pd.read_pickle('../data/mtb_trails_df_2')
trails_df_with_summary = trails_df[trails_df['no_summary']==0]
X = trails_df_with_summary['summary']
documents = X

In [17]:
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords

nltk_stopwords = set(stopwords.words('english'))
gensim_stopwords = STOPWORDS
my_stopwords = set(['singletrack', 'loop', 'trail', 'trails',  'ride', \
                    'area', 'route', 'way', 'feature', 'section', 'riding'\
                    'north', 'south', 'east', 'west', '-PRON-', 'pron', 'nee', 'regard', 'shall', 'use', 'win'])
all_stopwords = my_stopwords.union(nltk_stopwords.union(gensim_stopwords))

In [53]:
#wordnet = WordNetLemmatizer()
# def get_wordnet_pos(word):
#     tag = nltk.pos_tag([word])[0][1][0].upper()
#     tag_dict = {"J": wordnet.ADJ,
#                 "N": wordnet.NOUN,
#                 "V": wordnet.VERB,
#                 "R": wordnet.ADV}

#     return tag_dict.get(tag, wordnet.NOUN)


def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in all_stopwords and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [54]:
#stemmer = SnowballStemmer('english')
processed_docs = documents.map(preprocess)

In [55]:
processed_docs[:10]

0    [excellent, flow, downhill, exceptional, gravi...
1      [amaze, journey, time, scene, helens, eruption]
2                               [mile, travel, finest]
3     [good, introductory, lower, post, canyon, offer]
4    [road, climb, aggressive, cross, country, hear...
5    [shuttle, assist, iconic, timberline, lodge, d...
6    [iconic, rid, tough, rugged, surveyor, ridge, ...
7                  [great, clear, water, lewis, river]
8                 [hagg, lake, close, portland, miles]
9    [start, buxton, trailhead, bank, vernonia, stu...
Name: summary, dtype: object

In [41]:
#create dictionary
id2word = gensim.corpora.Dictionary(processed_docs)

#create corpus
texts = processed_docs

#Term Document Frequency
bow_corpus = [id2word.doc2bow(text) for text in texts]

In [56]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=3, id2word=id2word, passes=2, workers=2)

In [57]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[bow_corpus]

[(0,
  '0.031*"great" + 0.020*"view" + 0.016*"climb" + 0.015*"fast" + 0.014*"road" '
  '+ 0.012*"technical" + 0.012*"descent" + 0.012*"rock" + 0.010*"doubletrack" '
  '+ 0.009*"park"'),
 (1,
  '0.026*"climb" + 0.020*"short" + 0.016*"steep" + 0.014*"downhill" + '
  '0.013*"descent" + 0.012*"technical" + 0.010*"mile" + 0.009*"nice" + '
  '0.009*"flow" + 0.009*"park"'),
 (2,
  '0.024*"road" + 0.018*"creek" + 0.015*"forest" + 0.014*"park" + 0.013*"fast" '
  '+ 0.012*"mountain" + 0.011*"descent" + 0.011*"access" + 0.008*"climb" + '
  '0.008*"downhill"')]


In [30]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.160874030443285

Coherence Score:  0.18701226697068404


In [31]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
