In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import spacy
nlp = spacy.load('en_core_web_md')

working = pd.read_csv("../data/interim/working.csv")
working = working.drop(['Unnamed: 0'], axis=1)
working.head()

## Note: this notebook takes 30 minutes to run, primarily because of cell 5. Please feel free to skip it if you
## have downloaded the repository, as all of the work done by this notebook has been saved in the export file.

Unnamed: 0,NUM,FACILITY,DATE,FINE,NARRATIVE
0,20008964,FREMONT HEALTHCARE CENTER,2012-02-01,750.0,F323 483.25(h) FREE OF ACCIDENT HAZARDS/SUPERV...
1,20009068,WILLOW TREE NURSING CENTER,2012-03-02,750.0,Title 22 72520 (a) If a patient of a skilled n...
2,20009069,KINDRED NURSING AND REHABILITATION - YGNACIO V...,2012-03-02,750.0,483.12(b) (3) Permitting Resident to Return to...
3,20009078,"BAY VIEW REHABILITATION HOSPITAL, LLC",2012-03-05,37500.0,483.25 PROVIDE CARE/SERVICES FOR HIGHEST WELL ...
4,20009082,LONE TREE CONVALESCENT HOSPITAL,2012-03-06,600.0,T22 DIV5 CH3 ART3-72311(a)(1)(A) Nursing Servi...


In [2]:
from gensim.models import Phrases

In [3]:
# Generate a list of strings from the database

corpus = working['NARRATIVE'].tolist()
(corpus[0][0:100], corpus[1][0:100], corpus[2][300:400],), 
(type(corpus), type(corpus[0]))

(list, str)

In [4]:
# This function will convert a string into a list of unigrams and remove whitespace, punctuation, and stopwords.

def unigrammize(input_text):
    for word in nlp.Defaults.stop_words:
        for w in (word, word[0].upper() + word[1:], word.upper()):
            lex = nlp.vocab[w]
            lex.is_stop = True
        
    lower = nlp(input_text.lower())
    unigrams = [token.lemma_ for token in lower 
                   if (not token.is_stop) and (not token.is_punct) and (not token.is_space)]
    return unigrams

In [5]:
%%time

# Convert the corpus into a list of lists of unigrams
unigrams = [unigrammize(document) for document in corpus]

Wall time: 26min 36s


In [6]:
# View a portion of the unigrams
[unigrams[0][0:10], unigrams[1][0:10], unigrams[2][0:10]]

[['f323',
  '483.25(h',
  'free',
  'accident',
  'hazard',
  'supervision',
  'device',
  'facility',
  'ensure',
  'resident'],
 ['title',
  '22',
  '72520',
  'patient',
  'skilled',
  'nursing',
  'facility',
  'transfer',
  'general',
  'acute'],
 ['483.12(b',
  '3',
  'permit',
  'resident',
  'return',
  'facility',
  'nursing',
  'facility',
  'establish',
  'follow']]

In [7]:
# This function adds bigrams and trigrams to a list of list of unigrams without deleting the original unigrams.

def add_trigrams(unigrams):
    result = []
    bigram_model = Phrases(unigrams, min_count=1, delimiter=b'_')
    trigram_model = Phrases(bigram_model[unigrams], min_count=1, delimiter=b'_')
    
    for document in unigrams:
        bigrams_ = [b for b in bigram_model[document] if b.count('_') == 1]
        trigrams_ = [t for t in trigram_model[bigram_model[document]] if t.count('_') == 2]
        merged_ = trigrams_ + bigrams_ + document
        result.append(merged_)

    return result

In [8]:
# View a sample of the list of lists that includes trigrams

trigrams = add_trigrams(unigrams)
[trigrams[0][0:30], trigrams[1][0:30], trigrams[2][0:30], trigrams[3][0:30]]

[['hazard_supervision_device',
  'device_prevent_accident',
  'facility_violate_regulation',
  'receive_adequate_supervision',
  'burn_left_thigh',
  'area_1.5_x',
  'accord_recent_minimum',
  'datum_set_md',
  'need_extensive_assistance',
  'use_wheel_walker',
  'range_motion_joint',
  'write_6:30_p.m.',
  'hot_tea_spill',
  'order_silvadene_cream',
  'study_thermal_injury',
  'thickness_burn_occur',
  '155_degree_fahrenheit',
  'f323_483.25(h',
  'free_accident',
  'hazard_supervision',
  'environment_remain',
  'free_accident',
  'hazard_possible',
  'receive_adequate',
  'supervision_assistance',
  'device_prevent',
  'violate_regulation',
  'fail_ensure',
  'receive_adequate',
  'prevent_accident'],
 ['skilled_nursing_facility',
  'hospital_define_section',
  'skilled_nursing_facility',
  'patient_bedhold_seven',
  'facility_violate_regulation',
  'ready_return_facility',
  'acute_care_hospital',
  'physician_order_date',
  'nausea_vomiting_complaint',
  'write_bed_hold',
  'bed_h

In [9]:
# View another sample of the list of lists to make sure it still includes unigrams and not just trigrams
trigrams[0][40:]

['datum_set',
 'date_5/26/11',
 'independent_activity',
 'daily_live',
 'need_extensive',
 'wheel_walker',
 'range_motion',
 '6:30_p.m.',
 'hot_tea',
 'cup_tea',
 'piece_paper',
 'accidentally_pull',
 'hot_tea',
 'skin_peel',
 'left_thigh',
 'silvadene_cream',
 'twice_day',
 'seven_day',
 'pressure_ulcer',
 'skin_peel',
 'left_thigh',
 '10:00_a.m.',
 'cup_tea',
 'tea_styrofoam',
 'cup_lid',
 'piece_paper',
 'cup_spill',
 'styrofoam_cup',
 'regular_coffee',
 'dietary_service',
 'supervisor_dss',
 '10:20_a.m.',
 '8_ounce',
 'styrofoam_cup',
 'wall_adjacent',
 'coffee_machine',
 'telephone_interview',
 '10:35_a.m.',
 'cup_tea',
 'styrofoam_cup',
 'hot_water',
 'middle_spigot',
 'coffee_machine',
 'styrofoam_cup',
 '9:30_a.m.',
 'plastic_coffee',
 'hot_water',
 'middle_spigot',
 'coffee_machine',
 'water_temperature',
 'degree_fahrenheit',
 'study_thermal',
 '3rd_degree',
 'thickness_burn',
 '155_degree',
 'prevent_accidental',
 'hot_cup',
 'tea_styrofoam',
 'violation_direct',
 'immediate

In [10]:
# Add the trigrams to the database and save it to disk.
working.loc[:, 'TRIGRAMS'] = trigrams
working.to_csv("../data/interim/trigrams3.csv")