In [16]:
import os
import pickle
import re
from ast import literal_eval

import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
import spacy
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import Phrases, LdaModel, CoherenceModel, Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.test.utils import datapath

In [17]:
# Read the trigram table and turn each TRIGRAMS cell, stored in the CSV as the text of a Python list,
# back into an actual list of tokens.
data = pd.read_csv("../data/interim/trigrams3.csv").assign(
    TRIGRAMS=lambda frame: frame['TRIGRAMS'].map(literal_eval)
)
data.head()

Unnamed: 0.1,Unnamed: 0,NUM,FACILITY,DATE,FINE,NARRATIVE,TRIGRAMS
0,0,20008964,FREMONT HEALTHCARE CENTER,2012-02-01,750.0,F323 483.25(h) FREE OF ACCIDENT HAZARDS/SUPERV...,"[hazard_supervision_device, device_prevent_acc..."
1,1,20009068,WILLOW TREE NURSING CENTER,2012-03-02,750.0,Title 22 72520 (a) If a patient of a skilled n...,"[skilled_nursing_facility, hospital_define_sec..."
2,2,20009069,KINDRED NURSING AND REHABILITATION - YGNACIO V...,2012-03-02,750.0,483.12(b) (3) Permitting Resident to Return to...,"[bed_hold_period, facility_immediately_availab..."
3,3,20009078,"BAY VIEW REHABILITATION HOSPITAL, LLC",2012-03-05,37500.0,483.25 PROVIDE CARE/SERVICES FOR HIGHEST WELL ...,"[service_high_beingeach, facility_provide_nece..."
4,4,20009082,LONE TREE CONVALESCENT HOSPITAL,2012-03-06,600.0,T22 DIV5 CH3 ART3-72311(a)(1)(A) Nursing Servi...,"[nursing_service_shall, include_limit_followin..."


In [18]:
# Split the rows 80/20: a seeded random sample becomes the training set and every row
# that was not sampled is held out for validation.

train = data.sample(random_state=42, frac=0.8)
test = data.drop(index=train.index)
train.loc[:, 'TRIGRAMS']

471     [punishment_involuntary_seclusion, physical_ab...
1453    [abuse_involuntary_seclusion, physical_abuse_c...
2379    [send_request_toÿchcqdata@cdph.ca.govÿto, obta...
1601    [health_safety_code, long_term_health, facilit...
1094    [unannounced_visit_facility, base_observation_...
                              ...                        
1852    [483.25(j_sufficient_fluid, sufficient_fluid_i...
2220    [report_irregularity_attend, base_observation_...
2523    [f322_483.25(g_2, service_restore_eating, serv...
2105    [facility_provide_necessary, care_service_atta...
133     [abuse_neglect_etc, policy_483.12_b, facility_...
Name: TRIGRAMS, Length: 2306, dtype: object

In [19]:
# Show the token lists of the held-out rows.
test.loc[:, 'TRIGRAMS']

1       [skilled_nursing_facility, hospital_define_sec...
4       [nursing_service_shall, include_limit_followin...
11      [72311_nursing_service, shall_include_limit, i...
16      [483.12(a)(2_transfer_discharge, need_meet_fac...
19      [need_meet_facility, ii_transfer_discharge, 's...
                              ...                        
2858    [property_483.12(b_facility, establish_policy_...
2860    [t22_div_5, shall_include_limit, 3_notify_atte...
2861    [facility_hospitalize_place, therapeutic_leave...
2864    [facility_provide_necessary, care_service_atta...
2875    [comfortable_safe_temperature, p.m._unannounce...
Name: TRIGRAMS, Length: 577, dtype: object

In [20]:
def remove_numbers(corpus):
    """Return a copy of ``corpus`` with digit-only tokens removed from every document.

    Args:
        corpus: An iterable of documents, each an iterable of string tokens.

    Returns:
        A new list of lists holding, in the original order, every token that is
        not made up solely of the digits 0-9. Because the pattern allows zero
        digits, empty-string tokens are removed as well.
    """
    digits_only = re.compile(r'^[0-9]*$')
    cleaned = []
    for document in corpus:
        kept = []
        for token in document:
            if digits_only.search(token) is None:
                kept.append(token)
        cleaned.append(kept)
    return cleaned

In [21]:
# Build the vocabulary from the training documents. Tokens that are pure numbers (e.g. "2016") are removed first
# because they say little about which topic a narrative belongs to. Tokens that appear in fewer than 2 documents
# or in more than 15% of documents are then filtered out, and each document is converted to a bag-of-words vector.

corpus = remove_numbers(train['TRIGRAMS'].tolist())
dct = Dictionary(corpus)
dct.filter_extremes(no_above=0.15, no_below=2)
corpus_nums = list(map(dct.doc2bow, corpus))

In [22]:
# Train an LDA model, somewhat more carefully, using the hyperparameters that were close to optimal in the previous
# notebook. Although a 3-topic model had slightly more coherence, that is less interesting than an 8-topic model,
# and the coherence statistics were very similar.
#
# Training is expensive, so the fitted model and its dictionary are cached on disk. Set retrainModel = True to
# force a fresh fit. Training also runs automatically when either cached file is missing, so the notebook still
# runs top to bottom from a clean checkout instead of failing on the load below.

retrainModel = False

LDA_MODEL_PATH = "../models/ldatrain"
LDA_DCT_PATH = "../models/ldatrain_dct"

if retrainModel or not (os.path.exists(LDA_MODEL_PATH) and os.path.exists(LDA_DCT_PATH)):
    # Make sure the output directory exists before saving into it.
    os.makedirs(os.path.dirname(LDA_MODEL_PATH), exist_ok=True)
    lda_model = LdaModel(corpus_nums, num_topics=8, id2word=dct, passes=60, iterations=1200,
                         alpha=0.9, eta=0.9, random_state=42)
    lda_model.save(LDA_MODEL_PATH)
    dct.save_as_text(LDA_DCT_PATH)

# Always reload from disk so later cells work with the same objects whether or not training just ran.
lda_model = LdaModel.load(LDA_MODEL_PATH)
dct = Dictionary.load_from_text(LDA_DCT_PATH)

In [23]:
# Preview the most-frequently occurring words in each topic, showing the percentage of each topic represented
# by each word, e.g., "oxygen, 0.51" means oxygen is 0.51% of the words in the topic.
# The topic count is read from the model rather than hard-coded, so this cell stays correct if the model is
# retrained with a different num_topics.

for topic_id in range(lda_model.num_topics):
    print([(word, round(weight*100, 2)) for (word, weight) in lda_model.show_topic(topic_id, topn=15)])
    print("\n")

[('oxygen', 0.51), ('fluid', 0.51), ('weight', 0.37), ('food', 0.37), ('catheter', 0.34), ('restraint', 0.34), ('intake', 0.3), ('urinary', 0.29), ('respiratory', 0.27), ('drug', 0.26), ('eat', 0.26), ('ln', 0.25), ('gach', 0.24), ('dehydration', 0.24), ('diet', 0.23)]


[('alarm', 0.39), ('elopement', 0.34), ('ln', 0.29), ('wander', 0.28), ('july', 0.27), ('smoking', 0.24), ('psychiatric', 0.21), ('mood', 0.19), ('hallway', 0.19), ('behavioral', 0.17), ('depression', 0.16), ('exit', 0.16), ('aggressive', 0.16), ('station', 0.16), ('monitoring', 0.15)]


[('wound', 0.76), ('ulcer', 0.61), ('lift', 0.54), ('sore', 0.46), ('pressure_ulcer', 0.4), ('cm', 0.36), ('hip', 0.35), ('alarm', 0.29), ('pressure_sore', 0.29), ('shower', 0.28), ('stage', 0.28), ('september', 0.26), ('knee', 0.22), ('heel', 0.21), ('rating', 0.21)]


[('snf', 1.12), ('transfer_discharge', 0.98), ('ssd', 0.76), ('rp', 0.38), ('adm', 0.37), ('bed_hold', 0.31), ('gach', 0.28), ('bom', 0.25), ('trust_fund', 0.23), ('soc

In [24]:
%%time

# Prepare a graphical summary of the topics based on principal component analysis (very time consuming).
# The prepared data is cached in ../models/visual.p, and a standalone HTML copy is written next to it.
# Set PrepVisual = True to force a rebuild; the build also runs when the cache file does not exist yet.
PrepVisual = False

if PrepVisual or not os.path.exists("../models/visual.p"):
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus_nums, dct)
    # Use a context manager so the pickle is flushed and the file handle is closed promptly.
    with open("../models/visual.p", "wb") as visual_file:
        pickle.dump(LDAvis_prepared, visual_file)
    pyLDAvis.save_html(LDAvis_prepared, '../models/lda.html')




Wall time: 0 ns


In [25]:
# Load the cached visualization data. The context manager closes the file handle once loading is done.
# NOTE: unpickling can execute arbitrary code, so only load a visual.p file that this project generated.
with open("../models/visual.p", "rb") as visual_file:
    LDAvis_prepared = pickle.load(visual_file)


In [26]:
# Launch an interactive visualization of the topics in a new window (using this window distorts the width of all cells).
# pyLDAvis.show() serves the page from a local web server and keeps this cell running; interrupt the kernel to
# stop the server before running anything else.
pyLDAvis.show(LDAvis_prepared)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [01/Mar/2020 19:52:48] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [01/Mar/2020 19:52:49] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [01/Mar/2020 19:52:49] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [01/Mar/2020 19:52:49] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...
