In [1]:
import csv
from ast import literal_eval

import numpy as np
import pandas as pd
import spacy

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import Phrases, LdaModel, CoherenceModel, Word2Vec
from gensim.models.word2vec import LineSentence

In [2]:
# Read the interim trigram table. The TRIGRAMS column is stored on disk as the
# text form of a Python list, so each cell is parsed back into a real list.
data = (
    pd.read_csv("../data/interim/trigrams3.csv")
    .assign(TRIGRAMS=lambda frame: frame['TRIGRAMS'].map(literal_eval))
)
data.head()

Unnamed: 0.1,Unnamed: 0,NUM,FACILITY,DATE,FINE,NARRATIVE,TRIGRAMS
0,0,20008964,FREMONT HEALTHCARE CENTER,2012-02-01,750.0,F323 483.25(h) FREE OF ACCIDENT HAZARDS/SUPERV...,"[hazard_supervision_device, device_prevent_acc..."
1,1,20009068,WILLOW TREE NURSING CENTER,2012-03-02,750.0,Title 22 72520 (a) If a patient of a skilled n...,"[skilled_nursing_facility, hospital_define_sec..."
2,2,20009069,KINDRED NURSING AND REHABILITATION - YGNACIO V...,2012-03-02,750.0,483.12(b) (3) Permitting Resident to Return to...,"[bed_hold_period, facility_immediately_availab..."
3,3,20009078,"BAY VIEW REHABILITATION HOSPITAL, LLC",2012-03-05,37500.0,483.25 PROVIDE CARE/SERVICES FOR HIGHEST WELL ...,"[service_high_beingeach, facility_provide_nece..."
4,4,20009082,LONE TREE CONVALESCENT HOSPITAL,2012-03-06,600.0,T22 DIV5 CH3 ART3-72311(a)(1)(A) Nursing Servi...,"[nursing_service_shall, include_limit_followin..."


In [3]:
# Draw a reproducible random subset of 700 rows to keep model fitting fast.
# (The original note describes this as 25% of the database -- the row count is
# hard-coded, so confirm that still holds if the input file changes.)
sample = data.sample(random_state=42, n=700)
sample.loc[:, 'TRIGRAMS']

471     [punishment_involuntary_seclusion, physical_ab...
1453    [abuse_involuntary_seclusion, physical_abuse_c...
2379    [send_request_toÿchcqdata@cdph.ca.govÿto, obta...
1601    [health_safety_code, long_term_health, facilit...
1094    [unannounced_visit_facility, base_observation_...
                              ...                        
840     [nursing_service_shall, include_limited_follow...
1071    [individual_united_state, state_california_qua...
1480    [shall_report_incident, facility_department_im...
1509    [f309_483.25_provide, facility_provide_necessa...
1940    [72315_nursing_service, shall_treat_individual...
Name: TRIGRAMS, Length: 700, dtype: object

In [4]:
# Materialise the sampled column as a plain list of token lists, then sanity
# check the nesting: outer container, one document, one token.
slist = list(sample['TRIGRAMS'])
print(*(type(obj) for obj in (slist, slist[0], slist[0][0])))

<class 'list'> <class 'list'> <class 'str'>


In [5]:
%%time
# Create a dictionary with all of the words in the sample corpus
corpus = slist
dct = Dictionary(corpus)
dct.filter_extremes(no_below=2, no_above=0.15)

Wall time: 768 ms


In [6]:
%%time
# Convert the dictionary to numeric wordIDs
corpus_nums = [dct.doc2bow(text) for text in corpus]

Wall time: 285 ms


In [7]:
%%time
# Build a model using the numeric wordIDs, letting the function know how to convert these id's back into words
lda_model = LdaModel(corpus_nums, num_topics=10, id2word=dct, passes=30, iterations=600, random_state=42)

Wall time: 1min 3s


In [8]:
# Print the 15 highest-weighted terms for every topic in the fitted model.
# The loop bound is read from the model itself rather than repeating the
# literal 10 used when fitting, so changing `num_topics` in the fit cell can
# neither skip topics nor request a topic id that does not exist.

for topic in range(lda_model.num_topics):
    print(lda_model.show_topic(topic, topn=15))
    print("\n")


[('sexual', 0.0054619634), ('agency', 0.004882252), ('ln', 0.004799997), ('unknown', 0.004142089), ('drug', 0.003913924), ('ombudsman', 0.0037669768), ('2010', 0.003672329), ('injury_unknown', 0.0035132563), ('cdph', 0.0034108385), ('2017', 0.0033757444), ('allege', 0.003329853), ('suspect_abuse', 0.0032456613), ('abuse_neglect', 0.0031641761), ('july', 0.0027832366), ('alleged_abuse', 0.0025729362)]


[('rash', 0.008130928), ('scabie', 0.0058620395), ('rating', 0.0048802714), ('2013', 0.0044930633), ('post', 0.004411541), ('medicare', 0.0026742974), ('star', 0.0026174255), ('assault', 0.0025274742), ('medicaid', 0.0025196967), ('peer', 0.0024492075), ('cream', 0.0023718833), ('overall', 0.0023132127), ('shall_post', 0.002311891), ('bite', 0.0022812288), ('center', 0.0022359977)]


[('lift', 0.008191604), ('hip', 0.0075363694), ('alarm', 0.005965842), ('2017', 0.0057267915), ('ln', 0.0046718884), ('fall_risk', 0.004078161), ('rail', 0.0038923551), ('food', 0.003812933), ('mechanical', 

In [9]:
# Candidate values for the informal grid search: number of topics, and the
# alpha / eta values passed to LdaModel. Alpha and eta share the same
# candidate values but are kept as independent lists.

topicnums = [3, 5, 8, 10, 20]
alphas = [0.01, 0.1, 0.3, 0.9]
etas = list(alphas)

In [10]:
# Evaluate each set of hyperparameters based on its c_v coherence score; this is time-consuming and has been set to 'off'
# by default. When enabled, one LDA model is fitted per (topics, alpha, eta) combination and the scores are written to
# "output.csv" as two columns: the stringified (topics, alpha, eta) tuple and its coherence score.

hyperTest = False

if hyperTest:
    results = {}
    for t in topicnums:
        for a in alphas:
            for e in etas:
                # NOTE: this rebinds `lda_model`, so after a grid run it refers to the last combination fitted here.
                lda_model = LdaModel(corpus_nums, id2word=dct, passes=30, iterations=600, random_state=42,
                                     num_topics=t, alpha=a, eta=e)
                coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=dct, coherence='c_v')
                results[(t, a, e)] = coherence_model_lda.get_coherence()
    # The `with` block guarantees the file is flushed and closed before a later cell reads it back;
    # newline="" is what the csv module requires so rows are not written with doubled line endings on Windows.
    with open("output.csv", "w", newline="") as out_file:
        w = csv.writer(out_file)
        for key, val in results.items():
            w.writerow([key, val])

In [11]:
# Load the saved grid-search scores (the file has no header row, so column
# names are supplied here) and show the 20 best-scoring combinations.
printout = pd.read_csv(
    "output.csv",
    names=['Topics, Alpha, Eta', 'Coherence Score'],
)
printout.sort_values(by='Coherence Score', ascending=False).head(20)

Unnamed: 0,"Topics, Alpha, Eta",Coherence Score
10,"(3, 0.3, 0.3)",0.485832
13,"(3, 0.9, 0.1)",0.481271
14,"(3, 0.9, 0.3)",0.479637
0,"(3, 0.01, 0.01)",0.479008
5,"(3, 0.1, 0.1)",0.479008
1,"(3, 0.01, 0.1)",0.479008
4,"(3, 0.1, 0.01)",0.479008
15,"(3, 0.9, 0.9)",0.478763
7,"(3, 0.1, 0.9)",0.478763
11,"(3, 0.3, 0.9)",0.478763
