In [1]:
import pandas as pd
import numpy as np
import spacy
import re
from scipy import stats

from gensim.models import Phrases, LdaModel, CoherenceModel, Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.test.utils import datapath

import pyLDAvis
import pyLDAvis.gensim

from ast import literal_eval

In [2]:
# Restore the trained LDA model and its dictionary from disk
model_path = "../models/ldatrain"
dictionary_path = "../models/ldatrain_dct"
lda_model = LdaModel.load(model_path)
dct = Dictionary.load_from_text(dictionary_path)

In [3]:
# Sanity-check the load: print the five top-weighted terms of the first two
# topics, with each weight multiplied by 100 and rounded to two decimals
for topic_id in (0, 1):
    top_terms = lda_model.show_topic(topic_id, topn=5)
    print([(term, round(weight * 100, 2)) for term, weight in top_terms])
    print("\n")

[('oxygen', 0.51), ('fluid', 0.51), ('weight', 0.37), ('food', 0.37), ('catheter', 0.34)]


[('alarm', 0.39), ('elopement', 0.34), ('ln', 0.29), ('wander', 0.28), ('july', 0.27)]




In [4]:
# Read the interim trigram table, discard the leftover 'Unnamed: 0' column,
# and turn each TRIGRAMS cell from its string form back into a Python object
data = (
    pd.read_csv("../data/interim/trigrams3.csv")
    .drop(columns=['Unnamed: 0'])
)
data['TRIGRAMS'] = data['TRIGRAMS'].apply(literal_eval)
data.head()

Unnamed: 0,NUM,FACILITY,DATE,FINE,NARRATIVE,TRIGRAMS
0,20008964,FREMONT HEALTHCARE CENTER,2012-02-01,750.0,F323 483.25(h) FREE OF ACCIDENT HAZARDS/SUPERV...,"[hazard_supervision_device, device_prevent_acc..."
1,20009068,WILLOW TREE NURSING CENTER,2012-03-02,750.0,Title 22 72520 (a) If a patient of a skilled n...,"[skilled_nursing_facility, hospital_define_sec..."
2,20009069,KINDRED NURSING AND REHABILITATION - YGNACIO V...,2012-03-02,750.0,483.12(b) (3) Permitting Resident to Return to...,"[bed_hold_period, facility_immediately_availab..."
3,20009078,"BAY VIEW REHABILITATION HOSPITAL, LLC",2012-03-05,37500.0,483.25 PROVIDE CARE/SERVICES FOR HIGHEST WELL ...,"[service_high_beingeach, facility_provide_nece..."
4,20009082,LONE TREE CONVALESCENT HOSPITAL,2012-03-06,600.0,T22 DIV5 CH3 ART3-72311(a)(1)(A) Nursing Servi...,"[nursing_service_shall, include_limit_followin..."


In [5]:
def remove_numbers(corpus):
    """Return a copy of ``corpus`` with digit-only tokens filtered out.

    ``corpus`` is an iterable of token sequences. A token is dropped when the
    pattern ``^[0-9]*$`` matches it, i.e. when it consists solely of the
    digits 0-9 (``$`` also tolerates one trailing newline). The empty string
    matches as well and is therefore dropped too.

    Returns a new list of lists; the input is not modified.
    """
    digits_only = re.compile(r'^[0-9]*$')
    cleaned = []
    for tokens in corpus:
        kept = [token for token in tokens if digits_only.search(token) is None]
        cleaned.append(kept)
    return cleaned
# Build the bag-of-words corpus for every document: take the trigram token
# lists, strip digit-only tokens, then encode each list with the loaded dictionary
full_dct = dct
full_corpus = remove_numbers(data['TRIGRAMS'].tolist())
full_corpus_nums = list(map(full_dct.doc2bow, full_corpus))

In [6]:
# Spot-check: show the topic distribution the model reports for one document
sample_bow = full_corpus_nums[2]
lda_model.get_document_topics(sample_bow)

[(0, 0.23195742), (2, 0.04504645), (3, 0.6843919), (6, 0.019357733)]

In [7]:
# Write a function to identify the topic most strongly associated with each document

def get_best_topic(document, model):
    """Return the topic that ``model`` rates as most probable for ``document``.

    Parameters
    ----------
    document
        Passed unchanged to ``model.get_document_topics``.
    model
        Object whose ``get_document_topics(document)`` yields
        ``(topic, probability)`` pairs.

    Returns
    -------
    The topic of the pair with the largest probability; on ties the pair seen
    first wins. ``None`` when no pair has a probability greater than 0
    (including when no pairs are returned at all).
    """
    leader, leader_weight = None, 0
    for topic_id, weight in model.get_document_topics(document):
        # Strict comparison keeps the earliest topic when weights are equal
        if weight > leader_weight:
            leader, leader_weight = topic_id, weight
    return leader

In [8]:
# Try the helper on a single document
get_best_topic(document=full_corpus_nums[2], model=lda_model)

3

In [9]:
# Try the helper on a second document
get_best_topic(document=full_corpus_nums[1828], model=lda_model)

2

In [10]:
%%time

# Label every row of `data` with the most probable topic of its document
# (full_corpus_nums holds one bag-of-words per row, in row order)
topic_ids = [get_best_topic(bow, lda_model) for bow in full_corpus_nums]
data.loc[:, 'TOPIC'] = topic_ids

Wall time: 2.51 s


In [11]:
# Preview the first rows, now including the TOPIC column
data.head(n=5)

Unnamed: 0,NUM,FACILITY,DATE,FINE,NARRATIVE,TRIGRAMS,TOPIC
0,20008964,FREMONT HEALTHCARE CENTER,2012-02-01,750.0,F323 483.25(h) FREE OF ACCIDENT HAZARDS/SUPERV...,"[hazard_supervision_device, device_prevent_acc...",4
1,20009068,WILLOW TREE NURSING CENTER,2012-03-02,750.0,Title 22 72520 (a) If a patient of a skilled n...,"[skilled_nursing_facility, hospital_define_sec...",3
2,20009069,KINDRED NURSING AND REHABILITATION - YGNACIO V...,2012-03-02,750.0,483.12(b) (3) Permitting Resident to Return to...,"[bed_hold_period, facility_immediately_availab...",3
3,20009078,"BAY VIEW REHABILITATION HOSPITAL, LLC",2012-03-05,37500.0,483.25 PROVIDE CARE/SERVICES FOR HIGHEST WELL ...,"[service_high_beingeach, facility_provide_nece...",0
4,20009082,LONE TREE CONVALESCENT HOSPITAL,2012-03-06,600.0,T22 DIV5 CH3 ART3-72311(a)(1)(A) Nursing Servi...,"[nursing_service_shall, include_limit_followin...",2


In [12]:
# Hold out 20% of the rows: sample 80% for training with a fixed seed and keep
# every row whose index label was not sampled as the test set
train = data.sample(frac=0.8, random_state=42)
held_out = ~data.index.isin(train.index)
test = data.loc[held_out]
train.head(n=5)

Unnamed: 0,NUM,FACILITY,DATE,FINE,NARRATIVE,TRIGRAMS,TOPIC
471,40012798,FRANCISCAN CONVALESCENT HOSPITAL,2016-12-08,2000.0,F 223: 483.13(b) Free From Abuse The resident ...,"[punishment_involuntary_seclusion, physical_ab...",4
1453,230009521,WILLOWS CENTER,2012-10-23,2000.0,"F223 483.13(b), 483.13(c)(1)(i) Free from abus...","[abuse_involuntary_seclusion, physical_abuse_c...",4
2379,940013137,WOODRUFF CONVALESCENT CENTER,2017-05-31,75000.0,The citation narrative for this penalty will n...,"[send_request_toÿchcqdata@cdph.ca.govÿto, obta...",2
1601,240012808,RIALTO POST ACUTE CENTER,2016-12-09,13000.0,REGULTAION VIOLATION: Health and Safety Code 1...,"[health_safety_code, long_term_health, facilit...",2
1094,120010126,"POSITIVE DIRECTIONS, INC. #4",2013-12-18,8000.0,CFR W120 483.410(d)(3) (3) The facility must a...,"[unannounced_visit_facility, base_observation_...",2


In [13]:
# Preview the first rows of the held-out test set
test.head(n=5)

Unnamed: 0,NUM,FACILITY,DATE,FINE,NARRATIVE,TRIGRAMS,TOPIC
1,20009068,WILLOW TREE NURSING CENTER,2012-03-02,750.0,Title 22 72520 (a) If a patient of a skilled n...,"[skilled_nursing_facility, hospital_define_sec...",3
4,20009082,LONE TREE CONVALESCENT HOSPITAL,2012-03-06,600.0,T22 DIV5 CH3 ART3-72311(a)(1)(A) Nursing Servi...,"[nursing_service_shall, include_limit_followin...",2
11,20009298,VALE HEALTHCARE CENTER,2012-05-14,19000.0,"72311 (a) Nursing service shall include, but n...","[72311_nursing_service, shall_include_limit, i...",0
16,20009799,WINDSOR ROSEWOOD CARE CENTER,2013-04-25,2000.0,?483.12(a)(2) Transfer and Discharge Requireme...,"[483.12(a)(2_transfer_discharge, need_meet_fac...",3
19,20009811,WINDSOR ROSEWOOD CARE CENTER,2013-04-25,2000.0,F201 ?483.12(a)(2) Transfer and Discharge Requ...,"[need_meet_facility, ii_transfer_discharge, 's...",3


In [14]:
# Per-topic mean of every numeric column in the training set.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently, so the text columns (FACILITY, DATE, NARRATIVE,
# TRIGRAMS) would otherwise make mean() raise a TypeError.
# NOTE(review): NUM looks like an identifier, so its mean is presumably not
# meaningful; only the FINE mean is used further down.
train_means = train.groupby(['TOPIC']).mean(numeric_only=True)
train_means

Unnamed: 0_level_0,NUM,FINE
TOPIC,Unnamed: 1_level_1,Unnamed: 2_level_1
0,476772700.0,13830.708185
1,410853300.0,9240.396491
2,427432800.0,9852.278826
3,195070200.0,1987.794118
4,407270200.0,2360.606432
5,471800200.0,1709.349593
6,444403900.0,10403.271028
7,369392200.0,2039.695238


In [15]:
# Per-topic mean of every numeric column in the test set.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently, so the text columns (FACILITY, DATE, NARRATIVE,
# TRIGRAMS) would otherwise make mean() raise a TypeError.
# NOTE(review): NUM looks like an identifier, so its mean is presumably not
# meaningful; only the FINE mean is used further down.
test_means = test.groupby(['TOPIC']).mean(numeric_only=True)
test_means

Unnamed: 0_level_0,NUM,FINE
TOPIC,Unnamed: 1_level_1,Unnamed: 2_level_1
0,382820100.0,13194.94382
1,423237200.0,12405.645161
2,370683100.0,8346.08209
3,296934300.0,2414.102564
4,388582700.0,2242.664835
5,549535400.0,1785.714286
6,650537100.0,11097.368421
7,282592100.0,2346.774194


In [16]:
# Put the per-topic mean fine of each split side by side, one column per split
fine_by_split = {"Train": train_means['FINE'], "Test": test_means['FINE']}
compare_means = pd.DataFrame(fine_by_split)
compare_means

Unnamed: 0_level_0,Train,Test
TOPIC,Unnamed: 1_level_1,Unnamed: 2_level_1
0,13830.708185,13194.94382
1,9240.396491,12405.645161
2,9852.278826,8346.08209
3,1987.794118,2414.102564
4,2360.606432,2242.664835
5,1709.349593,1785.714286
6,10403.271028,11097.368421
7,2039.695238,2346.774194


In [17]:
# Display average fines by topic for training set vs. test set.
# Labels are attached by topic id rather than by row position, so each label
# stays with its topic and the cell still runs if a topic has no documents in
# either split (a positional list of 8 labels would raise a length mismatch).
topic_labels = {
    0: 'Life Support',
    1: 'Escape',
    2: 'Sores',
    3: 'Administration',
    4: 'Abuse',
    5: 'Theft',
    6: 'Diabetes',
    7: 'Rashes',
}
# rename_axis(None) clears the 'TOPIC' index name, as a plain index assignment would
compare_means = compare_means.rename(index=topic_labels).rename_axis(None)
# Global pandas display option: render floats as whole-dollar amounts
pd.options.display.float_format = '${:,.0f}'.format
compare_means.sort_values('Train', ascending=False)


Unnamed: 0,Train,Test
Life Support,"$13,831","$13,195"
Diabetes,"$10,403","$11,097"
Sores,"$9,852","$8,346"
Escape,"$9,240","$12,406"
Abuse,"$2,361","$2,243"
Rashes,"$2,040","$2,347"
Administration,"$1,988","$2,414"
Theft,"$1,709","$1,786"


In [18]:
# Per-topic average fines in the training and test sets are very tightly
# correlated across topics (r = 0.96)
train_fines = compare_means['Train']
test_fines = compare_means['Test']
stats.pearsonr(train_fines, test_fines)

(0.9635930788181462, 0.00011737003354033992)