In [77]:
import joblib
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

# Import Tf-idf Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the Label Encoder
from sklearn.preprocessing import LabelEncoder

# Import the train test split
from sklearn.model_selection import train_test_split

# To evaluate our model
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
# Load the raw work-contact records; the path is relative to the notebook's working directory.
df = pd.read_csv("Resources/Work_Contacts.csv")

In [43]:
# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0,Date,Observation Time,Purchase Order Number,RCS Number,Work Order Number,Job Start Time,Job Finish Time,Work group,Risk,OC,Authority To Proceed Description,Does ARP walk the job. In the alloted time frames,Field Verifications Completed (ARP),Field Verifications Completed (CRP),Supervision Present,Total number of people on job
0,16/07/2021,12:13:00,718707465,,,12:15:00,,Allrubber,High,1.0,Splice Mill #4 feed conveyor,No,Yes,Yes,Yes,3.0
1,16/07/2021,13:55:00,718707465,,,12:14:00,,Allrubber,High,1.0,Splice Mill #4 feed conveyor,No,Yes,Yes,Yes,3.0
2,6/07/2021,11:50:00,7188654,,,8:00:00,,ALS,Low,1.0,Sub station thermal checks on cabinets,Yes,Yes,Yes,No,1.0
3,8/07/2021,8:00:00,82775,,4302013.0,8:00:00,,ALS,Low,1.0,Routine thickness testing Bld 30,No,Yes,Yes,No,2.0
4,8/07/2021,11:50:00,82775,,4302013.0,8:00:00,,ALS,Low,1.0,Routine thickness testing Bld 30,No,Yes,Yes,No,2.0


In [62]:
# Keep only the label and the free-text description, drop rows missing
# either value, and lower-case the label.
df_clean = (
    df[['Risk', 'Authority To Proceed Description']]
    .dropna()
    .assign(Risk=lambda frame: frame['Risk'].str.lower())
)
df_clean.head()

Unnamed: 0,Risk,Authority To Proceed Description
0,high,Splice Mill #4 feed conveyor
1,high,Splice Mill #4 feed conveyor
2,low,Sub station thermal checks on cabinets
3,low,Routine thickness testing Bld 30
4,low,Routine thickness testing Bld 30


In [63]:
# Lower-case every description, then split it into word tokens
tokenized_messages = (
    df_clean['Authority To Proceed Description']
    .str.lower()
    .apply(word_tokenize)
)

# Print the tokens to see what they look like
print(tokenized_messages)

0                   [splice, mill, #, 4, feed, conveyor]
1                   [splice, mill, #, 4, feed, conveyor]
2          [sub, station, thermal, checks, on, cabinets]
3                 [routine, thickness, testing, bld, 30]
4                 [routine, thickness, testing, bld, 30]
                             ...                        
297       [drill, holes, #, 34, s/trap, tank, stiffener]
298    [replace, potable, water, line, at, vtat, offi...
299                       [repair, potable, water, line]
300                     [d1/d2, tripper, floor, repairs]
301                     [d1/d2, tripper, floor, repairs]
Name: Authority To Proceed Description, Length: 302, dtype: object


In [64]:
# Define a function that keeps only alphabetic tokens (plus negation contractions)
def alpha(tokens):
    """Filter a token list down to purely alphabetic words.

    Tokens consisting only of letters are kept unchanged. The contraction
    tokens ``"n't"`` and ``"won't"`` are also kept, rewritten as ``"not"``
    and ``"wont"`` respectively. Every other token (digits, punctuation,
    mixed tokens such as ``"d1/d2"``) is dropped.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single message.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    replacements = {"n't": "not", "won't": "wont"}
    return [
        replacements.get(tok, tok)
        for tok in tokens
        if str.isalpha(tok) or tok in replacements
    ]

# Apply the alphabetic filter to every token list; the series name is rebound to the filtered result
tokenized_messages = tokenized_messages.apply(alpha)

print(tokenized_messages)

0                         [splice, mill, feed, conveyor]
1                         [splice, mill, feed, conveyor]
2          [sub, station, thermal, checks, on, cabinets]
3                     [routine, thickness, testing, bld]
4                     [routine, thickness, testing, bld]
                             ...                        
297                      [drill, holes, tank, stiffener]
298    [replace, potable, water, line, at, vtat, offi...
299                       [repair, potable, water, line]
300                            [tripper, floor, repairs]
301                            [tripper, floor, repairs]
Name: Authority To Proceed Description, Length: 302, dtype: object


In [65]:
# Define a function to remove stop words
def remove_stop_words(tokens, stop_words=None):
    """Return the tokens that are not stop words, preserving their order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single message.
    stop_words : iterable of str, optional
        Words to filter out. When omitted, NLTK's English stop-word list
        (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The tokens that are not in the stop-word collection.
    """
    if stop_words is None:
        # Read the corpus once per call; the previous version re-read it
        # for every single token.
        stop_words = stopwords.words('english')
    # A frozenset gives constant-time membership checks instead of a list scan.
    excluded = frozenset(stop_words)
    return [token for token in tokens if token not in excluded]

# Drop stop words from every token list; the series name is rebound to the filtered result
tokenized_messages = tokenized_messages.apply(remove_stop_words)

print(tokenized_messages)

0                      [splice, mill, feed, conveyor]
1                      [splice, mill, feed, conveyor]
2           [sub, station, thermal, checks, cabinets]
3                  [routine, thickness, testing, bld]
4                  [routine, thickness, testing, bld]
                            ...                      
297                   [drill, holes, tank, stiffener]
298    [replace, potable, water, line, vtat, offices]
299                    [repair, potable, water, line]
300                         [tripper, floor, repairs]
301                         [tripper, floor, repairs]
Name: Authority To Proceed Description, Length: 302, dtype: object


In [66]:
# Define a function that lemmatizes the tokens and rebuilds the message
def lemmatize(tokens):
    """Lemmatize each token and return them joined by single spaces.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single message.

    Returns
    -------
    str
        The lemmatized tokens joined with ``" "``; an empty string when
        there are no tokens.
    """
    # A new WordNetLemmatizer is built on each call.
    wordnet = WordNetLemmatizer()
    return " ".join(wordnet.lemmatize(word) for word in tokens)

# Apply our function to tokens. After this step each entry is one
# space-joined string rather than a list of tokens.
# NOTE: re-running this cell would feed those strings back into lemmatize,
# whose loop would then iterate over individual characters.
tokenized_messages = tokenized_messages.apply(lemmatize)

print(tokenized_messages)

0                   splice mill feed conveyor
1                   splice mill feed conveyor
2           sub station thermal check cabinet
3               routine thickness testing bld
4               routine thickness testing bld
                        ...                  
297                 drill hole tank stiffener
298    replace potable water line vtat office
299                 repair potable water line
300                      tripper floor repair
301                      tripper floor repair
Name: Authority To Proceed Description, Length: 302, dtype: object


In [67]:
# Overwrite the raw description column with the cleaned, lemmatized strings
df_clean['Authority To Proceed Description'] = tokenized_messages

# Display the first five rows
df_clean.head()

Unnamed: 0,Risk,Authority To Proceed Description
0,high,splice mill feed conveyor
1,high,splice mill feed conveyor
2,low,sub station thermal check cabinet
3,low,routine thickness testing bld
4,low,routine thickness testing bld


In [68]:
labelEncoder = LabelEncoder()
# Order the rows by risk label; ignore_index=True renumbers the index after sorting.
df_clean = df_clean.sort_values("Risk", ignore_index=True)
df_clean['risk_encoded'] = labelEncoder.fit_transform(df_clean['Risk'])  # fit on the labels and store each row's integer code
df_clean.head()

Unnamed: 0,Risk,Authority To Proceed Description,risk_encoded
0,high,splice mill feed conveyor,0
1,high,install tapping point contact heater,0
2,high,install tapping point contact heater,0
3,high,install hot tap contact heater,0
4,high,install hot tap contact heater,0


In [69]:
# Select the features (cleaned description text) and the target (encoded risk label)
X = df_clean['Authority To Proceed Description']
y = df_clean['risk_encoded']

In [70]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the data preview shows identical descriptions on consecutive rows, so copies of
# the same text may land in both train and test and inflate the test score - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=34)

In [71]:
# Create the tf-idf vectorizer
vectorizer = TfidfVectorizer(strip_accents='ascii')

# Fit the vectorizer on the training set only and vectorize it in the same step
tfidf_train = vectorizer.fit_transform(X_train)

# Vectorize the test set with the already-fitted vectorizer (transform only, no refitting)
tfidf_test = vectorizer.transform(X_test)

In [72]:
# Initialize the Multinomial Naive Bayes classifier
nb = MultinomialNB()

# Fit the model on the tf-idf training matrix
nb.fit(tfidf_train, y_train)

# Print the model's score on the held-out test set
print("Accuracy:",nb.score(tfidf_test, y_test))

Accuracy: 0.9016393442622951


In [73]:
# Predict the encoded risk label for every test description
y_predicted = nb.predict(tfidf_test)

In [74]:
# Display the predicted label codes
y_predicted

array([0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0])

In [75]:
# Put the true labels and the predictions side by side for row-by-row comparison
test_results = pd.DataFrame({
    "Actual":y_test,
    "Predicted":y_predicted
})
test_results

Unnamed: 0,Actual,Predicted
56,0,0
296,2,2
290,2,2
170,0,0
74,0,0
...,...,...
293,2,2
128,0,2
228,2,2
28,0,0


In [76]:
# List each risk label next to its integer code so the encoded predictions can be read back
df_clean[["Risk", "risk_encoded"]].groupby(["Risk", "risk_encoded"]).count()

Risk,risk_encoded
high,0
low,1
medium,2


In [78]:
# Persist the fitted preprocessing objects next to the classifier. The model
# alone cannot score new text: `vectorizer` turns text into the tf-idf features
# it was trained on, and `labelEncoder` maps predicted codes back to risk names.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(labelEncoder, 'label_encoder.pkl')

# Kept as the last expression so the cell still displays ['naive_bayes.pkl']
# and anything already loading this file keeps working.
joblib.dump(nb,'naive_bayes.pkl')

['naive_bayes.pkl']