In [1]:
import joblib
import pandas as pd
import nltk
# Fetch the NLTK resources this notebook requests: 'punkt', 'stopwords' and 'wordnet'
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

# Import Tf-idf Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the Label Encoder
from sklearn.preprocessing import LabelEncoder

# Import the train test split
from sklearn.model_selection import train_test_split

# To evaluate our model
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the source CSV (path is relative to the notebook's working directory)
df = pd.read_csv("Resources/work_contacts.csv")

In [3]:
# Preview the first rows of the raw data
df.head()


Unnamed: 0,Date,Observation Time,Purchase Order Number,RCS Number,Work Order Number,Job Start Time,Job Finish Time,Work group,Risk,OC,Authority To Proceed Description,Does ARP walk the job. In the alloted time frames,Field Verifications Completed (ARP),Field Verifications Completed (CRP),Supervision Present,Total number of people on job
0,5/07/2021,8:35:00,7154232cs,85039.0,,8:30:00,,Dartla,High,1,Repair insulation PFT #31,No,Yes,Yes,No,2
1,5/07/2021,8:45:00,7190673,,,8:00:00,,Awayclean,High,1,Bog out lake duck,No,Yes,Yes,No,2
2,5/07/2021,8:50:00,7181286,90145.0,,8:00:00,,Awayclean,High,1,HPW clean vent lines,No,Yes,Yes,No,2
3,5/07/2021,11:49:00,7180602,,,11:00:00,12:00:00,Awayclean,High,1,108w drain. vacuum out pit. N/E. of car park,No,Yes,Yes,No,2
4,5/07/2021,13:00:00,7180602,,,13:00:00,14:00:00,Awayclean,High,1,108w drain. vacuum out pit. N/W. of car park,No,Yes,Yes,No,4


In [4]:
# Keep only the label ('Risk') and the free-text description, drop rows where
# either is missing, and lower-case the label so differently-cased spellings
# of the same label become one class.
# Built as a single chain: assigning a column on a frame derived from
# `df[[...]].dropna()` can trigger pandas' SettingWithCopyWarning, whereas
# `.assign` always returns a fresh frame (column order is unchanged because
# 'Risk' already exists).
df_clean = (
    df[['Risk', 'Authority To Proceed Description']]
    .dropna()
    .assign(Risk=lambda frame: frame['Risk'].str.lower())
)
df_clean.head()

Unnamed: 0,Risk,Authority To Proceed Description
0,high,Repair insulation PFT #31
1,high,Bog out lake duck
2,high,HPW clean vent lines
3,high,108w drain. vacuum out pit. N/E. of car park
4,high,108w drain. vacuum out pit. N/W. of car park


In [5]:
# Lower-case each description, then split it into tokens with word_tokenize
tokenized_messages = df_clean['Authority To Proceed Description'].str.lower().apply(word_tokenize)

# Print the tokens to see what they look like
print(tokenized_messages)

0                       [repair, insulation, pft, #, 31]
1                                 [bog, out, lake, duck]
2                              [hpw, clean, vent, lines]
3      [108w, drain, ., vacuum, out, pit, ., n/e, ., ...
4      [108w, drain, ., vacuum, out, pit, ., n/w, ., ...
                             ...                        
297                        [rewire, vtat, transportable]
298             [earth, works, on, potable, water, line]
299                   [earthworks, potable, water, line]
300                       [repair, potable, water, line]
301                                  [build, sand, bund]
Name: Authority To Proceed Description, Length: 302, dtype: object


In [6]:
# Define a function that keeps only purely alphabetic tokens
def alpha(tokens):
    """Filter a token list down to alphabetic words.

    Tokens consisting solely of letters are kept unchanged. The two
    contraction tokens "n't" and "won't" are also kept, rewritten as
    "not" and "wont" respectively. Every other token is dropped, which
    includes digits, punctuation and mixed tokens such as "108w" or "n/e".

    Returns a new list; the input is not modified.
    """
    # Non-alphabetic tokens that are kept, mapped to their replacement
    contractions = {"n't": 'not', "won't": 'wont'}
    kept = []
    for token in tokens:
        if str.isalpha(token):
            kept.append(token)
        elif token in contractions:
            kept.append(contractions[token])
    return kept

# Apply the alphabetic filter to every row's token list
tokenized_messages = tokenized_messages.apply(alpha)

print(tokenized_messages)

0                     [repair, insulation, pft]
1                        [bog, out, lake, duck]
2                     [hpw, clean, vent, lines]
3      [drain, vacuum, out, pit, of, car, park]
4      [drain, vacuum, out, pit, of, car, park]
                         ...                   
297               [rewire, vtat, transportable]
298    [earth, works, on, potable, water, line]
299          [earthworks, potable, water, line]
300              [repair, potable, water, line]
301                         [build, sand, bund]
Name: Authority To Proceed Description, Length: 302, dtype: object


In [7]:
# Define a function to remove stop words
def remove_stop_words(tokens):
    """Return the tokens that are not in NLTK's English stop-word list.

    Args:
        tokens: iterable of string tokens (expected to be lower-case already,
            since the comparison is an exact match).

    Returns:
        A new list with the stop words removed, original order preserved.
    """
    # Fetch the stop-word list once per call instead of once per token, and
    # hold it in a set so each membership test is a hash lookup rather than a
    # linear scan of the list.
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

# Drop stop words from every row's token list
tokenized_messages = tokenized_messages.apply(remove_stop_words)

print(tokenized_messages)

0                 [repair, insulation, pft]
1                         [bog, lake, duck]
2                 [hpw, clean, vent, lines]
3           [drain, vacuum, pit, car, park]
4           [drain, vacuum, pit, car, park]
                       ...                 
297           [rewire, vtat, transportable]
298    [earth, works, potable, water, line]
299      [earthworks, potable, water, line]
300          [repair, potable, water, line]
301                     [build, sand, bund]
Name: Authority To Proceed Description, Length: 302, dtype: object


In [8]:
# Define a function to lemmatize tokens
def lemmatize(tokens):
    """Lemmatize each token and return them joined into one string.

    Args:
        tokens: iterable of word tokens. A plain string is also accepted and
            is split on whitespace first.

    Returns:
        str: the lemmatized tokens separated by single spaces.
    """
    if isinstance(tokens, str):
        # Iterating a string yields single characters, so without this guard
        # re-applying the function to its own output (e.g. re-running the
        # cell) would produce "r e p a i r ..." instead of words.
        tokens = tokens.split()
    # Initialize the WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

# Lemmatize every row; note each row becomes a single space-joined string
tokenized_messages = tokenized_messages.apply(lemmatize)

print(tokenized_messages)

0              repair insulation pft
1                      bog lake duck
2                hpw clean vent line
3          drain vacuum pit car park
4          drain vacuum pit car park
                   ...              
297        rewire vtat transportable
298    earth work potable water line
299     earthwork potable water line
300        repair potable water line
301                  build sand bund
Name: Authority To Proceed Description, Length: 302, dtype: object


In [9]:
# Replace the description column with the processed messages
df_clean['Authority To Proceed Description'] = tokenized_messages

# Display the first five rows
df_clean.head()

Unnamed: 0,Risk,Authority To Proceed Description
0,high,repair insulation pft
1,high,bog lake duck
2,high,hpw clean vent line
3,high,drain vacuum pit car park
4,high,drain vacuum pit car park


In [10]:
# Encode the text risk labels as integers for the classifier
labelEncoder = LabelEncoder()
# Order rows by label; ignore_index renumbers the index 0..n-1
df_clean = df_clean.sort_values("Risk", ignore_index=True)
df_clean['risk_encoded'] = labelEncoder.fit_transform(df_clean['Risk']) # Fit on the labels and store their integer codes
df_clean.head()

Unnamed: 0,Risk,Authority To Proceed Description,risk_encoded
0,high,repair insulation pft,0
1,high,splice mill feed conveyor,0
2,high,splice mill feed conveyor,0
3,high,routine testing,0
4,high,replace dsm segment mill,0


In [11]:
# Select the features (processed description text) and the target (encoded risk)
X = df_clean['Authority To Proceed Description']
y = df_clean['risk_encoded']

In [12]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=34)

In [13]:
# Create the tf-idf vectorizer
vectorizer = TfidfVectorizer(strip_accents='ascii')

# Fit the vectorizer on the training set only and transform it
tfidf_train = vectorizer.fit_transform(X_train)

# Transform (do not re-fit) the test data with the already-fitted vectorizer
tfidf_test = vectorizer.transform(X_test)

In [14]:
# Initialize the Multinomial Naive Bayes classifier
nb = MultinomialNB()

# Fit the model on the tf-idf features of the training set
nb.fit(tfidf_train, y_train)

# Print the accuracy score on the held-out test set
print("Accuracy:",nb.score(tfidf_test, y_test))

Accuracy: 0.9180327868852459


In [15]:
# Predict the encoded risk class for each test description
y_predicted = nb.predict(tfidf_test)

In [16]:
# Display the predicted class codes
y_predicted

array([0, 2, 2, 0, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 0])

In [17]:
# Put actual and predicted class codes side by side for the test rows
test_results = pd.DataFrame({
    "Actual":y_test,
    "Predicted":y_predicted
})
test_results

Unnamed: 0,Actual,Predicted
56,0,0
296,2,2
290,2,2
170,0,0
74,0,2
...,...,...
293,2,2
128,0,0
228,2,2
28,0,0


In [18]:
# List each distinct (Risk, risk_encoded) pair, i.e. which integer code each label received
df_clean[["Risk", "risk_encoded"]].groupby(["Risk", "risk_encoded"]).count()

Risk,risk_encoded
high,0
low,1
medium,2


In [19]:
# Persist the trained classifier to disk
joblib.dump(nb,'naive_bayes.pkl')

['naive_bayes.pkl']

In [20]:
# Persist the fitted vectorizer so it can be reused with the saved classifier
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']