In [1]:
import numpy as np
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
stemmer = WordNetLemmatizer()

In [2]:
df_raw = pd.read_excel("Injury Triage 2022.xlsx")
pd.set_option('display.max_rows',None)
pd.set_option('display.max_Colwidth',None)
#df_raw.head()

In [3]:
def change_labels(category):
    if category[:4].lower() == 'ergo'.lower():
        category = "Ergo"
    else:
        category = "Others"
    return category

In [4]:
def clean_description(description):
    
    # Remove all the special characters
    description = re.sub(r'\W', ' ', description)    
    # remove all single characters
    description = re.sub(r'\s+[a-zA-Z]\s+', ' ', description)    
    # Remove single characters from the start
    description = re.sub(r'\^[a-zA-Z]\s+', ' ', description)     
    # Substituting multiple spaces with single space
    description = re.sub(r'\s+', ' ', description, flags=re.I)    
    # Removing prefixed 'b'
    description = re.sub(r'^b\s+', '', description)    
    # Converting to Lowercase
    description = description.lower()    
    # Lemmatization
    description = description.split()
    description = [stemmer.lemmatize(word) for word in description]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    description = [w for w in description if not w in stop_words]
    description = ' '.join(description)    
    return description

In [5]:
df=df_raw[['Description of Event','ACS Category']].copy()
category_mapping = {'Ergo': 1,'Others': 2}
df['Category'] = df['ACS Category'].apply(change_labels)
df["Description"] = df["Description of Event"].apply(clean_description)
df.Category = [category_mapping[item] for item in df.Category]
#df.head()

In [6]:
x=df['Description']
y=df['Category']
# train test split
x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.2, random_state=7)

# text vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))

# logistic Regression classifier
lr_clf = LogisticRegression()

# create pipeline object
pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', lr_clf)
])
# fit the pipeline on the training data
pipeline.fit(x_train, y_train)

In [7]:
# use the pipeline for predicting using test data
predictions = pipeline.predict(x_validation)

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report,confusion_matrix
print('Accuracy score: {}'.format(accuracy_score(y_validation, predictions)))
print('Precision score: {}'.format(precision_score(y_validation, predictions)))
print('Recall score: {}'.format(recall_score(y_validation, predictions)))
print('F1 score: {}'.format(f1_score(y_validation, predictions)))

Accuracy score: 0.9236111111111112
Precision score: 0.9565217391304348
Recall score: 0.8918918918918919
F1 score: 0.9230769230769231


In [9]:
print(classification_report(y_validation,predictions))

              precision    recall  f1-score   support

           1       0.96      0.89      0.92        74
           2       0.89      0.96      0.92        70

    accuracy                           0.92       144
   macro avg       0.92      0.92      0.92       144
weighted avg       0.93      0.92      0.92       144



In [10]:
print(confusion_matrix(y_validation,predictions))

[[66  8]
 [ 3 67]]


In [11]:
#precision_score

In [12]:
df_results=pd.concat([x_validation,y_validation],axis=1).reset_index()
df_results['Pred_Category']=predictions
df_results['Category'] = df_results['Category'].map({1:'Ergo',2:'Others'})
df_results['Pred_Category'] = df_results['Pred_Category'].map({1:'Ergo',2:'Others'})
#df_results

In [13]:
result = pipeline.predict(['EE lifted heavy bag off belt, felt pain in r. wrist/forearm'])
print(result)

[1]


In [14]:
result = pipeline.predict(['EE was arriving a flight when she needed to retrieve an item left on board by a passenger. When the agent came back up the jet bridge she tripped on a broken metal piece.'])
print(result)


[2]


In [15]:
result = pipeline.predict(['had large family with all heavy bag pick up 61lbs when to twisted to place bag on the belt from scale pain in left lower back the ee ha been continuing treatment in hcm since doi and wa moved to prevention but wa still being seen treated for lingering pain and discomfort on 6 3 22 the ee reaggravated the old injury by same mechanism lifting heavy bag to the point where after being treated in hcm couple more time she decided to see provider'])
print(result)

[1]


### Preditions

In [17]:
from datetime import datetime
df_text = pd.read_excel("Injury Triage 2022_October.xlsx",sheet_name=datetime.now().strftime('%B'))
df_text.shape

(48, 16)

In [19]:
df_text = pd.read_excel("Injury Triage 2022_October.xlsx",sheet_name="October")

x_test = df_text["Description of Event"].apply(clean_description)

test_predictions = pipeline.predict(x_test)

df_text['Pred_Category'] = test_predictions
df_text['Pred_Category'] = df_text['Pred_Category'].map({1:'Ergo',2:'Others'})

df_text.to_excel('Injury Triage 2022_Preditions.xlsx')

In [20]:
test_predictions

array([1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1,
       2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2,
       2, 2, 2, 1], dtype=int64)

In [21]:
currentMonth = datetime.now().strftime('%B')
currentMonth

'October'