In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# Load the labeled dataset from a CSV path relative to the working directory.
neiss_csv_path = 'HPGH_labeled.csv'
dfNEISS = pd.read_csv(neiss_csv_path)

In [4]:
# Preview the first five rows to see which columns were loaded.
dfNEISS.head(n=5)

Unnamed: 0.1,Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Body_Part,Diagnosis,Disposition,Location,Fire_Involvement,Alcohol,Drug,Product,Narrative
0,0,190103353,2019-01-01,26,Male,Black/African American,Hand,Fracture,Treated and Admitted/Hospitalized,Home,No Fire Involved or Fire Involvement Not Recorded,Yes,No/No information,FIREWORKS,26YOM WAS INTOXICATED ATTEMPTING TO LIGHT A LA...
1,1,190103412,2019-01-01,73,Female,White,Head,Hematoma,Treated and Admitted/Hospitalized,Home,No Fire Involved or Fire Involvement Not Recorded,No/No Information,No/No information,GLASS DOORS OR DOORS WITH GLASS PANELS,73YOF FELL STRUCK HEAD ON A GLASS DOOR AND SUS...
2,2,190103415,2019-01-01,85,Female,White,Lower Trunk,Fracture,Treated and Admitted/Hospitalized,Home,No Fire Involved or Fire Involvement Not Recorded,No/No Information,No/No information,FLOORS OR FLOORING MATERIALS,85YOF WAS TAKING FOOD OUT OF THE MICROWAVE OVE...
3,3,190103459,2019-01-01,4,Male,White,Shoulder,Fracture,Treated/Examined and Released,Home,No Fire Involved or Fire Involvement Not Recorded,No/No Information,No/No information,"BEDS OR BEDFRAMES, OTHER OR NOT SPECIFIED",4 YO M FELL OFF BED AND LANDED ON RT SHOULDER ...
4,4,190104001,2019-01-01,23,Male,White,Wrist,Laceration,Treated/Examined and Released,Home,No Fire Involved or Fire Involvement Not Recorded,No/No Information,No/No information,"KNIVES, NOT ELSEWHERE CLASSIFIED","23YOM TRIPPED AND FELL INTO OPEN DISHWASAHER, ..."


## Prepare Data

In [5]:
# Keep only the two columns used for modelling: the disposition category
# (source of the label) and the free-text narrative (model input).
model_columns = ['Disposition', 'Narrative']
df2 = dfNEISS[model_columns]

In [6]:
# Row count per disposition category, to inspect the class balance.
df2.Disposition.value_counts()

Treated/Examined and Released        237324
Treated and Admitted/Hospitalized     30376
Treated and Transferred                3732
Held for Observation                   3527
Left Without Being Seen                2446
Fatality, Incl. DOA, Died in ER         234
Name: Disposition, dtype: int64

In [7]:
# Exclude fatal cases: locate their row labels, then drop those rows.
fatality_category = 'Fatality, Incl. DOA, Died in ER'
is_fatality = df2['Disposition'] == fatality_category
df2 = df2.drop(index=df2.loc[is_fatality].index)

## Convert Dependent Variable to Binary

In [8]:
# Collapse the disposition categories into a binary target:
#   1 = admitted/hospitalized or transferred
#   0 = released, held for observation, or left without being seen
# The mapping is applied to the Disposition column only. A DataFrame-wide
# replace would also rewrite any Narrative that happened to equal one of
# these category strings.
disposition_to_label = {
    'Treated and Admitted/Hospitalized': 1,
    'Treated and Transferred': 1,
    'Treated/Examined and Released': 0,
    'Held for Observation': 0,
    'Left Without Being Seen': 0,
}
binary_labels = df2['Disposition'].map(disposition_to_label)

# Fail loudly on a category the mapping does not cover, rather than letting it
# silently become a missing label. Rows whose Disposition was already missing
# stay missing.
unmapped = df2.loc[binary_labels.isna() & df2['Disposition'].notna(),
                   'Disposition'].unique()
if len(unmapped) > 0:
    raise ValueError('Unmapped Disposition values: {}'.format(list(unmapped)))

# Build a new frame; df2 itself is left unchanged.
df_binary_NEISS = df2.assign(Disposition=binary_labels)

In [9]:
# Spot-check the binary labels next to their narratives (first five rows).
df_binary_NEISS.head(n=5)

Unnamed: 0,Disposition,Narrative
0,1,26YOM WAS INTOXICATED ATTEMPTING TO LIGHT A LA...
1,1,73YOF FELL STRUCK HEAD ON A GLASS DOOR AND SUS...
2,1,85YOF WAS TAKING FOOD OUT OF THE MICROWAVE OVE...
3,0,4 YO M FELL OFF BED AND LANDED ON RT SHOULDER ...
4,0,"23YOM TRIPPED AND FELL INTO OPEN DISHWASAHER, ..."


In [10]:
# Discard every row that has a missing value in any column.
df_binary_NEISS = df_binary_NEISS.dropna(axis=0, how='any')

## Create Train/Test Split

In [11]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split repeatable.
y = df_binary_NEISS['Disposition'].values
sentences = df_binary_NEISS['Narrative'].values
(sentences_train, sentences_test,
 y_train, y_test) = train_test_split(sentences, y,
                                     test_size=0.25, random_state=1000)

# Learn the bag-of-words vocabulary from the training narratives only, then
# encode both splits with that same vocabulary.
vectorizer = CountVectorizer().fit(sentences_train)
X_train, X_test = (vectorizer.transform(texts)
                   for texts in (sentences_train, sentences_test))
X_train

<208053x34047 sparse matrix of type '<class 'numpy.int64'>'
	with 3732384 stored elements in Compressed Sparse Row format>

## Logistic Regression

In [12]:
# Baseline: logistic regression on the bag-of-words counts, scored with
# classifier.score on the held-out split.
# NOTE(review): per the disposition counts shown earlier, roughly 88% of rows
# fall in class 0, so read this accuracy against that majority-class baseline.
classifier = LogisticRegression(solver='lbfgs', max_iter=10000).fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.9234052370515631


## Convolutional Neural Network (CNN)

In [13]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    """Build and compile a 1D-convolutional binary classifier.

    Layers, in order: Embedding -> Conv1D (ReLU) -> GlobalMaxPooling1D ->
    Dense(10, ReLU) -> Dense(1, sigmoid).

    Args:
        num_filters: Number of filters in the Conv1D layer.
        kernel_size: Kernel size of the Conv1D layer.
        vocab_size: Input dimension of the Embedding layer.
        embedding_dim: Output dimension of the Embedding layer.
        maxlen: Value passed as the Embedding layer's ``input_length``.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    network = Sequential([
        layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
        layers.Conv1D(num_filters, kernel_size, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(10, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    return network

In [14]:
# Candidate hyperparameter values, keyed by create_model argument name.
# NOTE(review): param_grid is reassigned in the search cell before it is used,
# so these values (including vocab_size=5000) never reach the search.
param_grid = {
    'num_filters': [32, 64, 128],
    'kernel_size': [3, 5, 7],
    'vocab_size': [5000],
    'embedding_dim': [50],
    'maxlen': [100],
}

In [16]:
# Main settings
epochs = 20
embedding_dim = 50
maxlen = 100
output_file = 'data/output.txt'
source = 'NEISS'

# Train-test split
sentences = df_binary_NEISS['Narrative'].values
y = df_binary_NEISS['Disposition'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=1000)

# Tokenize words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Parameter grid for grid search
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[embedding_dim],
                  maxlen=[maxlen])
model = KerasClassifier(build_fn=create_model,
                        epochs=epochs, batch_size=10,
                        verbose=False)
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                          cv=4, verbose=1, n_iter=5)
grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)

# Save and evaluate results
s = ('Running {} data set\nBest Accuracy : '
             '{:.4f}\n{}\nTest Accuracy : {:.4f}\n\n')
output_string = s.format(
    source,
    grid_result.best_score_,
    grid_result.best_params_,
    test_accuracy)
print(output_string)

  model = KerasClassifier(build_fn=create_model,


Fitting 4 folds for each of 5 candidates, totalling 20 fits
Running NEISS data set
Best Accuracy : 0.9002
{'vocab_size': 34808, 'num_filters': 64, 'maxlen': 100, 'kernel_size': 3, 'embedding_dim': 50}
Test Accuracy : 0.9038


