In [2]:
import numpy as np
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# NOTE: despite its name, `stemmer` holds a WordNetLemmatizer (a lemmatizer,
# not a stemmer). It is shared by the text-cleaning code further down.
stemmer = WordNetLemmatizer()

In [3]:
# Read the injury-triage workbook into the raw frame; later cells copy from
# it rather than modifying it.
df_raw = pd.read_excel(io="Injury Triage 2022.xlsx")

In [4]:
def change_labels(category):
    """Collapse a detailed ACS category into the binary label used for modelling.

    Args:
        category: Raw category value. Expected to be a string such as
            ``"Ergo - Lobby"``; non-string values (for example ``NaN`` or
            ``None`` coming from blank spreadsheet cells) are accepted.

    Returns:
        ``"Ergo"`` when the value is a string that starts with "ergo"
        (case-insensitive, leading whitespace ignored); ``"Others"`` for
        everything else, including non-string input.
    """
    # Guard first: slicing or calling string methods on a float NaN raises TypeError.
    if isinstance(category, str) and category.lstrip().lower().startswith('ergo'):
        return "Ergo"
    return "Others"

In [5]:
# Work on an independent copy of the two columns of interest and derive the
# binary Ergo/Others label from the detailed ACS category.
df = (
    df_raw.loc[:, ['Description of Event', 'ACS Category']]
    .copy()
    .assign(Category=lambda frame: frame['ACS Category'].apply(change_labels))
)

Unnamed: 0,Description of Event,ACS Category,Category
0,Had a large family with all heavy bags. pick u...,Ergo - Lobby,Ergo
1,while loading bags into cart at inbound flight...,Ergo - Ramp Bags,Ergo
2,Agent was walking up and down stairs in bagroo...,Stepping Up,Others
3,"On July 23, 2021 at approximately 0630, the EE...",Stepping Up,Others
4,Agent was assisting a passenger deplaning from...,Stepping Down,Others


In [6]:
def clean_description(description):
    """Normalise a free-text event description before vectorisation.

    The text is processed in this order:

    1. every non-word character is replaced by a space;
    2. stand-alone single ASCII letters are removed wherever they occur
       (start, middle, end, or several in a row) -- this also covers a stray
       leading ``b`` left over from a bytes repr;
    3. the text is lower-cased and split on whitespace;
    4. each token is lemmatised with the module-level ``stemmer``;
    5. tokens are re-joined with single spaces.

    Args:
        description: Raw description text. Non-string values (for example
            ``NaN`` from an empty spreadsheet cell) are treated as empty text.

    Returns:
        The cleaned, lower-cased, single-space-separated string; ``''`` for
        non-string input.
    """
    # Blank cells arrive as float NaN, which re.sub cannot process.
    if not isinstance(description, str):
        return ''

    # Remove all the special characters
    text = re.sub(r'\W', ' ', description)

    # Remove all single characters. A word-boundary match is used instead of
    # '\s+X\s+' because that pattern consumes the separating whitespace and
    # so skips every second letter in a run, and cannot match a letter at the
    # very start of the text.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Lower-casing then split() also collapses any run of whitespace.
    tokens = text.lower().split()

    # Lemmatization
    # NOTE(review): lemmatize() is called without a part-of-speech argument,
    # which is presumably why "was" shows up as "wa" in the cleaned output --
    # confirm whether that is acceptable for the model.
    return ' '.join(stemmer.lemmatize(token) for token in tokens)

In [7]:
# Add the cleaned, lemmatised version of every event description as a new column.
df["Description"] = df["Description of Event"].map(clean_description)

In [8]:
# Numeric encoding of the binary target.
category_mapping = {'Ergo': 1, 'Others': 2}

# Recompute the label from the untouched 'ACS Category' column instead of
# overwriting 'Category' from its own current values: the cell can then be
# re-run safely (previously a second run raised KeyError because the column
# already held the integer codes).
df['Category'] = df['ACS Category'].apply(change_labels).map(category_mapping)
df.head()

Unnamed: 0,Description of Event,ACS Category,Category,Description
0,Had a large family with all heavy bags. pick u...,Ergo - Lobby,1,had large family with all heavy bag pick up 61...
1,while loading bags into cart at inbound flight...,Ergo - Ramp Bags,1,while loading bag into cart at inbound flight ...
2,Agent was walking up and down stairs in bagroo...,Stepping Up,2,agent wa walking up and down stair in bagroom ...
3,"On July 23, 2021 at approximately 0630, the EE...",Stepping Up,2,on july 23 2021 at approximately 0630 the ee w...
4,Agent was assisting a passenger deplaning from...,Stepping Down,2,agent wa assisting passenger deplaning from re...


In [9]:
# Features are the cleaned descriptions; the target is the encoded category.
X, y = df['Description'], df['Category']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [10]:
# TF-IDF features: English stop words removed, terms must appear in at least
# 5 documents and in at most 70% of them, vocabulary capped at 1500 terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    max_df=0.7,
    min_df=5,
    max_features=1500,
)

# Logistic regression classifier with library defaults.
lr_clf = LogisticRegression()

# Chain vectorizer and classifier so both are fitted on the training data only.
pipeline = Pipeline(steps=[('tfidf', tfidf_vectorizer), ('clf', lr_clf)])

# Fit the whole pipeline; as the last expression, the fitted pipeline is displayed.
pipeline.fit(x_train, y_train)

In [11]:
# Predict the encoded category for every held-out validation description.
predictions = pipeline.predict(X=x_validation)

In [50]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Print the headline validation metrics, one per line, in a fixed order.
for template, metric in (
    ('Accuracy score: {}', accuracy_score),
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
):
    print(template.format(metric(y_validation, predictions)))

Accuracy score: 0.9236111111111112
Precision score: 0.9565217391304348
Recall score: 0.8918918918918919
F1 score: 0.9230769230769231


In [13]:
# Per-class precision / recall / F1 and support on the validation split.
print(classification_report(y_true=y_validation, y_pred=predictions))

              precision    recall  f1-score   support

           1       0.96      0.89      0.92        74
           2       0.89      0.96      0.92        70

    accuracy                           0.92       144
   macro avg       0.92      0.92      0.92       144
weighted avg       0.93      0.92      0.92       144



In [51]:
# Confusion matrix of true labels against predicted labels on the validation split.
print(confusion_matrix(y_true=y_validation, y_pred=predictions))

[[66  8]
 [ 3 67]]


In [14]:
# Bare expression: displays the function's repr so its signature can be read
# in the cell output (which shows pos_label=1 and average='binary' as the
# defaults). It has no effect on the analysis itself.
precision_score

<function sklearn.metrics._classification.precision_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')>

In [64]:
# Decode with the inverse of the mapping that encoded the target, so the
# label names shown here can never drift from the training encoding.
category_names = {code: name for name, code in category_mapping.items()}

# One row per validation sample: original row index, cleaned description,
# true label and predicted label. `predictions` is in the same row order as
# `x_validation`, so it is attached positionally after the index reset.
df_results = (
    pd.concat([x_validation, y_validation], axis=1)
    .reset_index()
    .assign(Pred_Category=predictions)
)
for label_column in ('Category', 'Pred_Category'):
    df_results[label_column] = df_results[label_column].map(category_names)
df_results

Unnamed: 0,index,Description,Category,Pred_Category
0,186,the employee wa working at the ticket counter ...,Ergo,Ergo
1,496,while standing and bending he re injuried lowe...,Others,Others
2,320,mouhamed tall wa off uploading flight dl0438 s...,Ergo,Ergo
3,362,while working at gate c2 today employee wa tra...,Others,Others
4,609,while offloading bag from the belt loader to h...,Ergo,Ergo
...,...,...,...,...
139,390,picking up back from car after not being able ...,Ergo,Others
140,281,agent gene malkut had gone to the ramp with fu...,Ergo,Ergo
141,548,wind closed van door on right foot,Others,Others
142,666,agent wa loading in the belly of cr9 ac on or ...,Ergo,Ergo
