In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 15, 10

from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer

data = pd.read_csv("../data/interim/trigrams3.csv")
data = data.drop(['Unnamed: 0'], axis=1)
data['TRIGRAMS'] = data['TRIGRAMS'].apply(literal_eval)
data.head()

Unnamed: 0,NUM,FACILITY,DATE,FINE,NARRATIVE,TRIGRAMS
0,20008964,FREMONT HEALTHCARE CENTER,2012-02-01,750.0,F323 483.25(h) FREE OF ACCIDENT HAZARDS/SUPERV...,"[hazard_supervision_device, device_prevent_acc..."
1,20009068,WILLOW TREE NURSING CENTER,2012-03-02,750.0,Title 22 72520 (a) If a patient of a skilled n...,"[skilled_nursing_facility, hospital_define_sec..."
2,20009069,KINDRED NURSING AND REHABILITATION - YGNACIO V...,2012-03-02,750.0,483.12(b) (3) Permitting Resident to Return to...,"[bed_hold_period, facility_immediately_availab..."
3,20009078,"BAY VIEW REHABILITATION HOSPITAL, LLC",2012-03-05,37500.0,483.25 PROVIDE CARE/SERVICES FOR HIGHEST WELL ...,"[service_high_beingeach, facility_provide_nece..."
4,20009082,LONE TREE CONVALESCENT HOSPITAL,2012-03-06,600.0,T22 DIV5 CH3 ART3-72311(a)(1)(A) Nursing Servi...,"[nursing_service_shall, include_limit_followin..."


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [3]:
# Build a corpus of strings out of the trigram column in the main database

corpus = [' '.join(data.loc[row, 'TRIGRAMS']) for row in range(len(data))] 
corpus[0][0:100], corpus[1][0:100], corpus[2][0:100]

('hazard_supervision_device device_prevent_accident facility_violate_regulation receive_adequate_super',
 'skilled_nursing_facility hospital_define_section skilled_nursing_facility patient_bedhold_seven faci',
 'bed_hold_period facility_immediately_availability resident-_require_service facility_ii_eligible fac')

<h3>Generate the data for the $1,000 categorization problem</h3>

In [4]:
# Define a function that will convert the corpus into a vectorized bag of words, then apply it.
# The function also generates a binary outcome variable that is 1 if the fine was $1,000 or more.

def make_xy(data, vectorizer=None):
    #Your code here    
    if vectorizer is None:
        vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (data.FINE > 1000).values.astype(np.int)
    return X, y
X, y = make_xy(data)

<h3 style="color:blue">Multinomial Naive Bayes predicting if fine was $1,000 or more</h3>

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

foo = MultinomialNB()
foo.fit(X_train, y_train)
pred1 = foo.predict(X_train)
pred2 = foo.predict(X_test)
score1_nb1 = metrics.accuracy_score(y_train, pred1)
score2_nb1 = metrics.accuracy_score(y_test, pred2)
f1score_nb1 = metrics.f1_score(y_test, pred2)
naive_nb1 = sum(data.FINE > 1000) / sum(data.FINE > -1)

print(["training score:", score1_nb1, "test score:", score2_nb1, 
       "F1 Score", f1score_nb1, "naive:", naive_nb1])

['training score:', 0.867195242814668, 'test score:', 0.7734104046242775, 'F1 Score', 0.832764505119454, 'naive:', 0.7270204647936177]


<h3 style="color:red">Random Forest Generator predicting if fine was $1,000 or more</h3>

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

foo = RandomForestClassifier(max_depth = 4, random_state=48)
foo.fit(X_train, y_train)
pred1 = foo.predict(X_train)
pred2 = foo.predict(X_test)
score1_rf1 = foo.score(X_train, y_train)
score2_rf1 = foo.score(X_test, y_test)
f1score_rf1 = metrics.f1_score(y_test, pred2)
naive_rf1 = sum(data.FINE > 1000) / sum(data.FINE > -1)

print(["training score:", score1_rf1, "test score:", score2_rf1, 
       "F1 score:", f1score_rf1, "naive:", naive_rf1])



['training score:', 0.7413280475718533, 'test score:', 0.7086705202312139, 'F1 score:', 0.8290366350067843, 'naive:', 0.7270204647936177]


<h3 style="color:#3cb371">Gradient Boosting Machine predicting if fine was $1,000 or more</h3>

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

foo = GradientBoostingClassifier(random_state=48)
foo.fit(X_train, y_train)
pred1 = foo.predict(X_train)
pred2 = foo.predict(X_test)
score1_gb1 = foo.score(X_train, y_train)
score2_gb1 = foo.score(X_test, y_test)
f1score_gb1 = metrics.f1_score(y_test, pred2)
naive_gb1 = sum(data.FINE > 1000) / sum(data.FINE > -1)

print(["training score:", score1_gb1, "test score:", score2_gb1, "Naive Model:", naive_gb1])

['training score:', 0.9147670961347869, 'test score:', 0.8335260115606936, 'Naive Model:', 0.7270204647936177]


<h3>Generate the data for the $5,000 categorization problem</h3>

In [8]:
# Define a function that will convert the corpus into a vectorized bag of words, then apply it.
# The function also generates a binary outcome variable that is 1 if the fine was $5,000 or more.

def make_xy(data, vectorizer=None):
    #Your code here    
    if vectorizer is None:
        vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (data.FINE > 5000).values.astype(np.int)
    return X, y
X, y = make_xy(data)

<h3 style="color:blue">Multinomial Naive Bayes predicting if fine was $5,000 or more</h3>

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

foo = MultinomialNB()
foo.fit(X_train, y_train)
pred1 = foo.predict(X_train)
pred2 = foo.predict(X_test)
score1_nb2 = metrics.accuracy_score(y_train, pred1)
score2_nb2 = metrics.accuracy_score(y_test, pred2)
f1score_nb2 = metrics.f1_score(y_test, pred2)
naive_nb2 = sum(data.FINE > 5000) / sum(data.FINE > -1)

print(["training score:", score1_nb2, "test score:", score2_nb2, 
       "F1 Score", f1score_nb2, "naive:", naive_nb2])

['training score:', 0.8632309217046581, 'test score:', 0.7780346820809249, 'F1 Score', 0.6631578947368421, 'naive:', 0.269857787027402]


<h3 style="color:red">Random Forest Generator predicting if fine was $5,000 or more</h3>

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

foo = RandomForestClassifier(max_depth = 4, random_state=48)
foo.fit(X_train, y_train)
pred1 = foo.predict(X_train)
pred2 = foo.predict(X_test)
score1_rf2 = foo.score(X_train, y_train)
score2_rf2 = foo.score(X_test, y_test)
f1score_rf2 = metrics.f1_score(y_test, pred2)
naive_rf2 = sum(data.FINE > 5000) / sum(data.FINE > -1)

print(["training score:", score1_rf2, "test score:", score2_rf2,
       "F1 score:", f1score_rf2, "naive:", naive_rf2])

['training score:', 0.8166501486620417, 'test score:', 0.7884393063583816, 'F1 score:', 0.39202657807308966, 'naive:', 0.269857787027402]




<h3 style="color:#3cb371">Gradient Boosting Machine predicting if fine was $5,000 or more</h3>

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

foo = GradientBoostingClassifier(random_state=48)
foo.fit(X_train, y_train)
pred1 = foo.predict(X_train)
pred2 = foo.predict(X_test)
score1_gb2 = foo.score(X_train, y_train)
score2_gb2 = foo.score(X_test, y_test)
f1score_gb2 = metrics.f1_score(y_test, pred2)
naive_gb2 = sum(data.FINE > 5000) / sum(data.FINE > -1)

print(["training score:", score1_gb2, "test score:", score2_gb2, 
       "F1 Score", f1score_nb2, "Naive Model:", naive_gb2])

['training score:', 0.9767096134786918, 'test score:', 0.945664739884393, 'F1 Score', 0.6631578947368421, 'Naive Model:', 0.269857787027402]


<h3>Summary of performance of categorical models</h3>

In [12]:
summary = pd.DataFrame(
    {"MultinomialNB": [score1_nb1, score2_nb1, f1score_nb1, naive_nb1,
                      score1_nb2, score2_nb2, f1score_nb2, naive_nb2],
    "RandomForest": [score1_rf1, score2_rf1, f1score_rf1, naive_rf1,
                    score1_rf2, score2_rf2, f1score_rf2, naive_rf2],
    "GradientBoost": [score1_gb1, score2_gb1, f1score_gb1, naive_gb1,
                     score1_gb2, score2_gb2, f1score_gb2, naive_gb2]})
summary.index = ['Train >$1,000', 'Test >$1,000', 'F1 >$1,000', 'Naive >$1,000',
                'Train >$5,000', 'Test >$5,000', 'F1 >$5,000', 'Naive >$5,000']
summary.round(3)

Unnamed: 0,MultinomialNB,RandomForest,GradientBoost
"Train >$1,000",0.867,0.741,0.915
"Test >$1,000",0.773,0.709,0.834
"F1 >$1,000",0.833,0.829,0.889
"Naive >$1,000",0.727,0.727,0.727
"Train >$5,000",0.863,0.817,0.977
"Test >$5,000",0.778,0.788,0.946
"F1 >$5,000",0.663,0.392,0.899
"Naive >$5,000",0.27,0.27,0.27
