In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
dfCH = pd.read_excel('CPSC Clearinghouse.xlsx')

In [3]:
dfCH.head()

Unnamed: 0,STATE,INCIDENT YEAR,INCIDENT MONTH,PRODUCT 1,PRODUCT HAZARD TYPE,PRODUCT HAZARD,VICTIM 1 GENDER,VICTIM 1 AGE YEARS,VICTIM 1 INJURY,VICTIM 1 BODY PART,VICTIM 1 SEVERITY,INCIDENT DESCRIPTION
0,SC,2001.0,8.0,Pressure Cookers or Canners (412),Thermal,Hot Liquid or Steam,Female,69,Burn-Scald,Trunk,Hospital Admission,69 YOF WAS COOKING IN PRESSURE COOKER. POT STA...
1,PA,2001.0,9.0,Oven Cleaners (942),Chemical,Poisoning,Female,17,Poisoning,All Parts of the body,Death,28 YOF DECEDENT WHEN SHE WAS 17 YO INGESTED ...
2,MA,2001.0,11.0,Gas Ranges or Ovens (279),Thermal,Hot surface,Female,40,Burn-Thermal,Hand,No First Aid or Medical Attention Received,THE CONSUMER REPORTED SEVERAL ISSUES WITH HER ...
3,CA,2001.0,11.0,Ladders Other or Not Specified (4078),Mechanical,Fall,Male,60,Internal Organ Injury,Head,Death,73 YOM DECEDENT FELL FROM LADDER WHEN HE WAS 6...
4,FL,2001.0,10.0,House Repair or Construction Materials nec (1876),Mechanical,Struck by,Male,45,Other/Not Stated,Unspecified,Death,THE MALE SUBJECT WAS STRUCK BY ROOFING MATERIA...


## Prepare Data

In [4]:
df2 = dfCH[['VICTIM 1 SEVERITY','INCIDENT DESCRIPTION']]

In [5]:
df2 = df2.rename({'VICTIM 1 SEVERITY':'SEVERITY'}, axis=1)

In [6]:
df2 = df2.drop(df2[df2.SEVERITY == 'Level of care not known'].index)
df2 = df2.drop(df2[df2.SEVERITY == 'Incident  No Injury'].index)
df2 = df2.drop(df2[df2.SEVERITY == 'Death'].index)

## Changing dependent variable to binary

In [7]:
df_binary_CH = df2
df_binary_CH = df_binary_CH.replace(['Emergency Department Treatment Received',
                                     'Seen by Medical Professional',
                                     'Hospital Admission'], 1)
df_binary_CH = df_binary_CH.replace(['First Aid Received by Non-Medical Professional',
                                     'No First Aid or Medical Attention Received'], 0)

In [8]:
df_binary_CH.head()

Unnamed: 0,SEVERITY,INCIDENT DESCRIPTION
0,1,69 YOF WAS COOKING IN PRESSURE COOKER. POT STA...
2,0,THE CONSUMER REPORTED SEVERAL ISSUES WITH HER ...
8,0,THE CONSUMER STATED THAT SHE NOTICED THAT THE ...
10,0,GE GLASS TOP GAS RANGE IS NOT RECALLED HOWEVE...
12,1,8 YOF SUFFERED CRUSHED FEMUR WHEN PORTABLE SOC...


In [9]:
df_binary_CH = df_binary_CH.dropna()

## Create Train Test Split

In [10]:
sentences = df_binary_CH['INCIDENT DESCRIPTION'].values
y = df_binary_CH['SEVERITY'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=1000)
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)
X_train

<11304x24216 sparse matrix of type '<class 'numpy.int64'>'
	with 663535 stored elements in Compressed Sparse Row format>

## Logistic Regression

In [11]:
classifier = LogisticRegression(solver='lbfgs', max_iter=10000)
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.8946390658174098


## Artificial Neural Network (ANN)

In [12]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [13]:
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[5000], 
                  embedding_dim=[50],
                  maxlen=[100])

In [18]:
# Main settings
epochs = 20
embedding_dim = 50
maxlen = 100
output_file = 'data/output.txt'
source = 'Clearinghouse'

# Train-test split
sentences = df_binary_CH['INCIDENT DESCRIPTION'].values
y = df_binary_CH['SEVERITY'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=1000)

# Tokenize words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Parameter grid for grid search
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[embedding_dim],
                  maxlen=[maxlen])
model = KerasClassifier(build_fn=create_model,
                        epochs=epochs, batch_size=10,
                        verbose=False)
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                          cv=4, verbose=1, n_iter=5)
grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)

# Save and evaluate results
s = ('Running {} data set\nBest Accuracy : '
             '{:.4f}\n{}\nTest Accuracy : {:.4f}\n\n')
output_string = s.format(
    source,
    grid_result.best_score_,
    grid_result.best_params_,
    test_accuracy)
print(output_string)

  model = KerasClassifier(build_fn=create_model,


Fitting 4 folds for each of 5 candidates, totalling 20 fits
Running Clearinghouse data set
Best Accuracy : 0.8830
{'vocab_size': 25897, 'num_filters': 32, 'maxlen': 100, 'kernel_size': 5, 'embedding_dim': 50}
Test Accuracy : 0.8899


