In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, module='.*/IPython/.*')
warnings.filterwarnings('ignore', category=DeprecationWarning, module='pyLDAvis')

import pyLDAvis
import pyLDAvis.sklearn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Enable automatic inline display of pyLDAvis visualizations in this notebook.
pyLDAvis.enable_notebook()

In [2]:
# NOTE(review): with no category/module arguments this filter silences every
# warning for the rest of the session, not only the DeprecationWarnings
# targeted in the first cell.
import warnings
warnings.filterwarnings("ignore")

In [3]:
import spacy, re
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from string import punctuation, printable
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [4]:
import os

# Database connection URL. Set FIREWISE_DB_URL in the environment to supply
# real credentials without editing the notebook.
# NOTE(review): the fallback below is the original hard-coded URL, kept only so
# the notebook still runs unchanged on the original machine; remove it before
# sharing or committing the notebook.
DB_URL = os.environ.get(
    "FIREWISE_DB_URL",
    'postgresql://jordanhelen:password@localhost:5432/firewise',
)
engine = create_engine(DB_URL)

# Query whose first selected column (event_desc) is the text used downstream.
sql = "SELECT event_desc, event_type FROM under_sampling_data;"

In [5]:
def df_from_sql(sql_code_str, engine):
    """Run a SQL query and return the result together with its first column.

    Parameters
    ----------
    sql_code_str : str
        Query text handed to ``pd.read_sql``.
    engine
        Database connection or engine handed to ``pd.read_sql``.

    Returns
    -------
    tuple
        ``(frame, values)`` where ``frame`` is the full query result and
        ``values`` is ``.values`` of the first column of that frame.
    """
    print("Bringing data in...")
    frame = pd.read_sql(sql_code_str, engine)
    leading_column = frame.columns[0]
    return frame, frame[leading_column].values

In [6]:
# Run the query; keep the full result frame and the values of its first column.
df, contents = df_from_sql(sql, engine)

Bringing data in...


In [67]:
def clean_text(contents, extra_stop_words=('2015','2014','2016','fire','firewise')):
    """Lemmatize each document and strip punctuation, stop words and noise tokens.

    Each document has punctuation and non-printable characters removed, is
    parsed with spaCy, and is rebuilt from the lower-cased lemmas of its
    non-stop-word tokens. Lemmas that end up empty or that appear in
    ``extra_stop_words`` are dropped.

    The input is not modified; a new container is returned. (The original
    implementation assigned into ``contents``, which also rewrote the
    DataFrame column it was taken from and fails on read-only arrays.)

    Parameters
    ----------
    contents : iterable of str
        Raw documents. ``None`` entries are treated as empty documents.
    extra_stop_words : iterable of str, optional
        Additional lower-cased tokens to drop after lemmatization.

    Returns
    -------
    numpy.ndarray or list
        Cleaned documents in input order: an object-dtype array when
        ``contents`` is a NumPy array, otherwise a list.
    """
    print("Lemmatizing, removing stop words, cleaning text...")
    punc_dict = {ord(punc): None for punc in punctuation}
    printable_chars = set(printable)  # set membership instead of scanning the string
    dropped_tokens = set(extra_stop_words)
    try:
        nlp = spacy.load("en")
    except OSError:
        # spaCy 3 removed the "en" shortcut link; load the package by name.
        nlp = spacy.load("en_core_web_sm")

    cleaned = []
    for line in contents:
        if line is None:
            # e.g. a NULL description coming back from the database
            cleaned.append("")
            continue
        line = line.translate(punc_dict)
        clean_doc = "".join(char for char in line if char in printable_chars)
        lemmas = (
            re.sub(r"\W+", "", token.lemma_.lower())
            for token in nlp(clean_doc)
            if not token.is_stop
        )
        # Skip lemmas reduced to "" (whitespace tokens) so no stray spaces are joined in.
        kept = [lemma for lemma in lemmas if lemma and lemma not in dropped_tokens]
        cleaned.append(' '.join(kept))

    if isinstance(contents, np.ndarray):
        return np.array(cleaned, dtype=object)
    return cleaned

In [68]:
# Rebind `contents` to the cleaned documents returned by clean_text.
contents = clean_text(contents)

Lemmatizing, removing stop words, cleaning text...


In [69]:
def tf(contents, max_df=.95, min_df=0.05, max_features=80):
    """Build the term-count matrix used as input to LDA.

    Parameters
    ----------
    contents : iterable of str
        Cleaned documents.
    max_df, min_df, max_features
        Passed straight to ``CountVectorizer``; the defaults are the values
        this notebook originally hard-coded.

    Returns
    -------
    tuple
        ``(X, feature_names, tf_vectorizer)``: the matrix returned by
        ``fit_transform``, the vocabulary as a list of strings, and the fitted
        vectorizer.
    """
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=max_features)
    X = tf_vectorizer.fit_transform(contents)
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        # get_feature_names() was removed in scikit-learn 1.2; the replacement
        # returns an array, so convert it to keep returning a plain list.
        feature_names = [str(name) for name in tf_vectorizer.get_feature_names_out()]
    else:
        feature_names = tf_vectorizer.get_feature_names()
    return X, feature_names, tf_vectorizer

In [70]:
# Vectorize the cleaned documents; keep the count matrix, the vocabulary and the fitted vectorizer.
X, features_name, tf_vectorizer = tf(contents)

Extracting tf features for LDA...


In [83]:
def run_lda(X):
    """Fit a five-topic online LDA model on ``X`` and return the result of ``fit``.

    Parameters
    ----------
    X
        Document-term matrix to fit on.

    Returns
    -------
    Whatever ``LatentDirichletAllocation.fit`` returns for ``X``.
    """
    print("Running LDA model...")
    lda_settings = dict(
        n_components=5,
        max_iter=100,
        learning_method='online',
        learning_offset=50.,
        random_state=0,  # fixed seed so repeated runs give the same topics
    )
    lda = LatentDirichletAllocation(**lda_settings)
    fitted = lda.fit(X)
    return fitted

In [84]:
# Fit the LDA topic model on the term-count matrix.
model = run_lda(X)

Running LDA model...


In [85]:
# Build the interactive topic view for the fitted model; as the cell's last
# expression it is displayed inline. R=20 sets how many terms each bar chart shows.
pyLDAvis.sklearn.prepare(model, X, tf_vectorizer, R=20)

In [18]:
# Save the visualization as a standalone HTML file (lda.html).
# NOTE(review): this repeats the prepare() call from the previous cell with the
# same arguments, and the cell's execution count (18) is out of sequence with
# the cells above it; confirm it still works after Restart & Run All.
p = pyLDAvis.sklearn.prepare(model, X, tf_vectorizer, R=20)
pyLDAvis.save_html(p, 'lda.html')

## LDA topic indices from the graph above

Topic 1 [Distribution Event]: community booth information event annual hold meeting sign attend home

Topic 2 [Education Event]: community presentation property resident fuel wildfire forest service provide discuss

Topic 3 [Home assessment]: home homeowner assessment member information wildland conduct answer question risk

Topic 4 [Community Preparedness]: day community program material county event hold mitigation member chip

Topic 5 [Mitigation Event]: brush chip road volunteer year hour community area property chipper

### Topics as numbered by the LDA Python script, mapped to the document counts below
Topic # 0: day community program material county event hold mitigation member chip

Topic # 1: home homeowner assessment member information wildland conduct answer question risk

Topic # 2: community booth information event annual hold meeting sign attend home

Topic # 3: brush chip road volunteer year hour community area property chipper

Topic # 4: community presentation property resident fuel wildfire forest service provide discuss

| topic # | count |
|--------:|------:|
| 0 | 2364 |
| 1 | 560 |
| 2 | 3131 |
| 3 | 1106 |
| 4 | 2095 |