In [1]:
import joblib
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

# Import Tf-idf Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the Label Encoder
from sklearn.preprocessing import LabelEncoder

# Import the train test split
from sklearn.model_selection import train_test_split

# To evaluate our model
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the raw work-contact observations; the path is relative to the notebook's working directory
df = pd.read_csv("Resources/work_contacts.csv")

In [3]:
# Preview the first rows to check which columns were loaded
df.head()


Unnamed: 0,Date,Observation Time,Purchase Order Number,RCS Number,Work Order Number,Job Start Time,Job Finish Time,Work group,Risk,OC,Authority To Proceed Description,Does ARP walk the job. In the alloted time frames,Field Verifications Completed (ARP),Field Verifications Completed (CRP),Supervision Present,# of People on Job
0,5/07/2021,10:45:00,7175923,,2285000.0,10:00:00,,Refeak,Low,1,Set up welder and spools for capital projects,No,Yes,Yes,No,3.0
1,5/07/2021,11:49:00,7180602,,,11:00:00,12:00:00,Awayclean,Medium,1,108w drain. vacuum out pit. N/E. of car park,No,Yes,Yes,No,2.0
2,5/07/2021,13:00:00,7180602,,,13:00:00,14:00:00,Awayclean,Medium,1,108w drain. vacuum out pit. N/W. of car park,No,Yes,Yes,No,4.0
3,5/07/2021,08:50:00,7181286,90145.0,,08:00:00,,Awayclean,High,1,HPW clean vent lines,No,Yes,Yes,No,2.0
4,5/07/2021,08:40:00,7181625,87318.0,,08:45:00,,GLU Scaff,High,1,Erect scaffold #51 Blow off tank,No,Yes,Yes,No,2.0


In [4]:
# Keep only the risk label and the free-text description, drop rows where
# either value is missing, and lower-case the label text.
df_clean = (
    df[['Risk', 'Authority To Proceed Description']]
    .dropna()
    .assign(Risk=lambda frame: frame['Risk'].str.lower())
)
df_clean.head()

Unnamed: 0,Risk,Authority To Proceed Description
0,low,Set up welder and spools for capital projects
1,medium,108w drain. vacuum out pit. N/E. of car park
2,medium,108w drain. vacuum out pit. N/W. of car park
3,high,HPW clean vent lines
4,high,Erect scaffold #51 Blow off tank


In [5]:
# Lower-case every description, then split it into tokens with NLTK's word_tokenize
tokenized_messages = df_clean['Authority To Proceed Description'].str.lower().apply(word_tokenize)

# Print the Series of token lists to inspect the result
print(tokenized_messages)

0      [set, up, welder, and, spools, for, capital, p...
1      [108w, drain, ., vacuum, out, pit, ., n/e, ., ...
2      [108w, drain, ., vacuum, out, pit, ., n/w, ., ...
3                              [hpw, clean, vent, lines]
4              [erect, scaffold, #, 51, blow, off, tank]
                             ...                        
378                                   [reclaim, hydrate]
379                                   [reclaim, hydrate]
380                              [check, power, outlets]
381                                    [replace, lights]
382                                  [fbti, precip, i55]
Name: Authority To Proceed Description, Length: 383, dtype: object


In [6]:
# Define a function that keeps only alphabetic tokens (plus two contraction tokens)
def alpha(tokens):
    """Filter a token list down to alphabetic tokens.

    Tokens for which ``str.isalpha`` is true are kept unchanged. The tokens
    ``"n't"`` and ``"won't"`` are kept as ``"not"`` and ``"wont"``
    respectively. Every other token (punctuation, numbers, mixed tokens such
    as ``"108w"``) is dropped. The order of the surviving tokens is preserved.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: The filtered tokens.
    """
    contractions = {"n't": 'not', "won't": 'wont'}
    kept = []
    for token in tokens:
        if str.isalpha(token):
            kept.append(token)
        elif token in contractions:
            # Contraction tokens contain an apostrophe, so they never pass isalpha
            kept.append(contractions[token])
    return kept

# Filter every token list with alpha(); this overwrites tokenized_messages
tokenized_messages = tokenized_messages.apply(alpha)

print(tokenized_messages)

0      [set, up, welder, and, spools, for, capital, p...
1               [drain, vacuum, out, pit, of, car, park]
2               [drain, vacuum, out, pit, of, car, park]
3                              [hpw, clean, vent, lines]
4                     [erect, scaffold, blow, off, tank]
                             ...                        
378                                   [reclaim, hydrate]
379                                   [reclaim, hydrate]
380                              [check, power, outlets]
381                                    [replace, lights]
382                                       [fbti, precip]
Name: Authority To Proceed Description, Length: 383, dtype: object


In [7]:
# Define a function to remove stop words
def remove_stop_words(tokens, stop_words=None):
    """Return the tokens that are not stop words, preserving their order.

    Args:
        tokens: Iterable of string tokens.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list (``stopwords.words('english')``) is
            used.

    Returns:
        list: The tokens that do not appear in the stop-word collection.
    """
    if stop_words is None:
        # Load the list once up front; calling this inside the loop would
        # reload the corpus list for every single token.
        stop_words = stopwords.words('english')
    # A frozenset gives constant-time membership checks instead of a list scan
    stop_set = frozenset(stop_words)
    return [token for token in tokens if token not in stop_set]

# Drop stop words from every token list; this overwrites tokenized_messages
tokenized_messages = tokenized_messages.apply(remove_stop_words)

print(tokenized_messages)

0      [set, welder, spools, capital, projects]
1               [drain, vacuum, pit, car, park]
2               [drain, vacuum, pit, car, park]
3                     [hpw, clean, vent, lines]
4                 [erect, scaffold, blow, tank]
                         ...                   
378                          [reclaim, hydrate]
379                          [reclaim, hydrate]
380                     [check, power, outlets]
381                           [replace, lights]
382                              [fbti, precip]
Name: Authority To Proceed Description, Length: 383, dtype: object


In [8]:
# Define a function that lemmatizes tokens and joins them back into one string
def lemmatize(tokens):
    """Lemmatize each token and return the results as a single string.

    A new ``WordNetLemmatizer`` is created on every call. Each token is passed
    to its ``lemmatize`` method, and the results are joined with single spaces
    in their original order.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        str: The lemmatized tokens separated by spaces.
    """
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(word) for word in tokens)

# Lemmatize every token list. After this cell tokenized_messages holds strings,
# not lists, so re-running the cell would iterate over single characters.
tokenized_messages = tokenized_messages.apply(lemmatize)

print(tokenized_messages)

0      set welder spool capital project
1             drain vacuum pit car park
2             drain vacuum pit car park
3                   hpw clean vent line
4              erect scaffold blow tank
                     ...               
378                     reclaim hydrate
379                     reclaim hydrate
380                  check power outlet
381                       replace light
382                         fbti precip
Name: Authority To Proceed Description, Length: 383, dtype: object


In [9]:
# Replace the description column with the cleaned, space-joined text
df_clean['Authority To Proceed Description'] = tokenized_messages

# Display the first five rows
df_clean.head()

Unnamed: 0,Risk,Authority To Proceed Description
0,low,set welder spool capital project
1,medium,drain vacuum pit car park
2,medium,drain vacuum pit car park
3,high,hpw clean vent line
4,high,erect scaffold blow tank


In [10]:
# Encode the text risk labels as integer codes for the classifier
labelEncoder = LabelEncoder()
# Order the rows by risk label and renumber the index from 0
df_clean = df_clean.sort_values("Risk", ignore_index=True)
df_clean['risk_encoded'] = labelEncoder.fit_transform(df_clean['Risk']) # Fit on the labels and store the integer code of each row
df_clean.head()

Unnamed: 0,Risk,Authority To Proceed Description,risk_encoded
0,high,drill hole valve safety pin,0
1,high,install liner digester vessel,0
2,high,install liner digester vessel,0
3,high,erect scaffold mill,0
4,high,erect scaff mill,0


In [23]:
# Select the features (cleaned description text) and the target (encoded risk label)
X = df_clean['Authority To Proceed Description']
y = df_clean['risk_encoded']

In [24]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=34)

In [25]:
# Create the tf-idf vectorizer
vectorizer = TfidfVectorizer(strip_accents='ascii')

# Fit the vectorizer on the training text only and vectorize that text
tfidf_train = vectorizer.fit_transform(X_train)

# Transform (do not re-fit) the test text with the already-fitted vectorizer
tfidf_test = vectorizer.transform(X_test)

In [26]:
# Initialize the Multinomial Naive Bayes classifier
nb = MultinomialNB()

# Fit the model on the tf-idf training features
nb.fit(tfidf_train, y_train)

# Print the accuracy score on the held-out test features
print("Accuracy:",nb.score(tfidf_test, y_test))

Accuracy: 0.8701298701298701


In [27]:
# Predict the encoded risk label for every held-out test row
y_predicted = nb.predict(tfidf_test)

In [28]:
# Show the predicted label codes.
# NOTE(review): the recorded output contains only codes 0 and 2, so code 1 is
# never predicted in that run -- check the class balance.
y_predicted

array([0, 2, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 2,
       2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0,
       0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0])

In [29]:
# Put the actual and predicted label codes side by side for the test rows
test_results = pd.DataFrame({
    "Actual":y_test,
    "Predicted":y_predicted
})
test_results

Unnamed: 0,Actual,Predicted
40,0,0
339,2,2
161,0,0
371,2,2
27,0,0
...,...,...
251,2,2
83,0,0
94,0,2
204,0,0


In [30]:
# List the distinct (Risk, risk_encoded) pairs to show which integer code each label received
df_clean[["Risk", "risk_encoded"]].groupby(["Risk", "risk_encoded"]).count()

Risk,risk_encoded
high,0
low,1
medium,2


In [31]:
# Persist the fitted classifier to naive_bayes.pkl.
# NOTE(review): labelEncoder is not saved in the cells visible here, so consumers
# of the model need the label/code mapping from elsewhere -- confirm.
joblib.dump(nb,'naive_bayes.pkl')

['naive_bayes.pkl']

In [32]:
# Persist the fitted tf-idf vectorizer to vectorizer.pkl
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']