In [None]:
'''LIBRARY IMPORTS'''

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import seaborn as sb
import re
import os, types

from datasets import list_datasets,load_dataset

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, precision_score, recall_score, accuracy_score, balanced_accuracy_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.multiclass import unique_labels

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources the cleaning function relies on: tokenizer models
# for word_tokenize, WordNet for WordNetLemmatizer, and the stop-word lists.
nltk.download("punkt")
# Newer NLTK releases (3.9+) load the tokenizer from "punkt_tab" instead of the
# pickled "punkt" models; without it word_tokenize raises LookupError.
nltk.download("punkt_tab")
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
''' DATA IMPORTS '''

# Template placeholder: this line is a syntax error until a DataFrame expression
# is written after the "=". Later cells read df['Utterance'] and insert a 'PS'
# column, so the frame must expose the raw text under that name (or those cells
# must be edited to match your column names).
df = ### YOUR DATA HERE, IN A PANDAS DATAFRAME

"""

Your dataframe must contain three columns: one with the text, one with the text category, and one with the numeric code for that category.

"""

In [None]:
""" DATA CLEANING """

'''SOURCE CODE: https://github.com/ekavlako/think/blob/main/tutorials/naive-bayes/naive-bayes-tutorial.ipynb?utm_source=ibm_developer&utm_content=in_content_link&utm_id=tutorials_awb-classifying-data-multinomial-naive-bayes-algorithm'''


  ## This function takes a text, a parameter selecting lemmatization ('L') or stemming ('S'), and a flag for whether stopwords should be removed. It uses regex to strip out text irregularities such as dates or special characters and then tokenizes the cleaned text.
  ## The output of this function is a cleaned string of text.
def text_clean(text, method, rm_stop):
    """Normalize a raw text string for bag-of-words feature extraction.

    Steps, in order: line breaks become spaces, the text is lower-cased,
    hyperlinks, dates and digits are removed, non-ASCII characters become
    spaces, and the remaining punctuation (currency symbols included) is
    stripped. Stop-word removal and lemmatization/stemming are optional.

    Args:
        text: The string to clean.
        method: 'L' to lemmatize with WordNetLemmatizer, 'S' to stem with
            PorterStemmer; any other value skips this step.
        rm_stop: Truthy to drop NLTK English stop words.

    Returns:
        The cleaned text as one string. Whenever stop-word removal,
        lemmatization or stemming runs, tokens are re-joined with single
        spaces; otherwise the regex-cleaned text is returned as is.
    """
    # Replace (not delete) line breaks so the words on either side of a
    # break are not fused into one token.
    text = re.sub(r"[\r\n]+", " ", text)
    text = text.lower()

    # Hyperlinks must go before punctuation is stripped: once ':' and '/' are
    # removed the URL pattern can no longer match and the URL's letters would
    # survive as a junk token.
    text = re.sub(r"https?://\S+", "", text)

    # Dates are matched while their digits still exist; whatever digits remain
    # are removed afterwards.
    text = re.sub(r"\d+[./-]\d+[./-]\d+", "", text)
    text = re.sub(r"\d+", "", text)

    text = re.sub(r"[^\x00-\x7f]", " ", text)   # non-ASCII -> space
    text = re.sub(r"[^\w\s]", "", text)         # punctuation, '$', '+', ...

    if rm_stop:
        # Build the lookup set once instead of once per token.
        stop_words = set(stopwords.words('english'))
        kept_tokens = [word for word in word_tokenize(text) if word not in stop_words]
        text = " ".join(kept_tokens)

    # Lemmatization: typically preferred over stemming.
    if method == 'L':
        lemmer = WordNetLemmatizer()
        return " ".join(lemmer.lemmatize(word) for word in word_tokenize(text))

    # Stemming.
    if method == 'S':
        porter = PorterStemmer()
        return " ".join(porter.stem(word) for word in word_tokenize(text))

    return text

In [None]:
''' PREPROCESSING USING STEMMING '''

# Run every utterance through text_clean with PorterStemmer ("S") and stop-word
# removal enabled, producing one cleaned string per row.
# Iterating over the column's values (rather than df['Utterance'][i] with a
# range counter) keeps this working when the index is not 0..n-1, e.g. after
# rows were filtered or the frame was shuffled.
corpus = [text_clean(utterance, "S", True) for utterance in df['Utterance']]

# Store the cleaned text next to the utterances. df.insert raises ValueError
# if the column already exists, so overwrite in place when the cell is re-run.
if 'PS' in df.columns:
    df['PS'] = corpus
else:
    df.insert(1, 'PS', corpus)

In [None]:
''' DATASET SPLIT EDA '''
# Bar chart of the number of rows per value of the classification column.
ax = sb.countplot(x=df['COLUMN CLASSIFICATION NAME'])
# Title is set on the returned Axes; spelling of "Observations" corrected.
ax.set_title("Observations by Classification Type")
plt.show()

In [None]:
''' FEATURE EXTRACTION '''
# text_data is the qualitative data; y is the numerical label for the coded
# utterances.
text_data = df['TEXT DATA']
y = df['NUMERICAL LABEL']

# Split the raw text BEFORE fitting the vectorizer so the vocabulary is learned
# from the training rows only. Fitting on the full corpus first lets words that
# occur only in the test rows become model features (data leakage).
# train_test_split picks rows from the sample count and random_state, so the
# same rows land in each split as when splitting the vectorized matrix.
text_train, text_test, y_train, y_test = train_test_split(
    text_data, y, test_size=0.25, random_state=81
)

count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(text_train)   # learn vocabulary + encode
X_test = count_vectorizer.transform(text_test)         # encode with the training vocabulary

# X is still provided for any cell that expects the whole corpus as a count
# matrix; it is encoded with the training vocabulary.
X = count_vectorizer.transform(text_data)

In [None]:
''' TRAINING THE MNB '''

# Create the multinomial Naive Bayes classifier and fit it on the training
# split in one expression; fit() hands back the estimator itself.
model = MultinomialNB().fit(X_train, y_train)
# Trailing expression so the cell still shows the fitted estimator.
model

In [None]:
''' EVALUATING PREFORMANCE '''
# Label the held-out rows with the fitted model.
y_pred = model.predict(X_test)

# Share of held-out rows whose predicted label equals the true label,
# reported as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy*100:.2f}%')

In [None]:
''' CONFUSION MATRIX '''
# Pin the matrix rows/columns to model.classes_. Without labels=, the matrix
# only covers labels that occur in y_test or y_pred, so a class missing from
# the test split would make it smaller than display_labels and the tick labels
# would no longer line up with the cells.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
plt.show()

In [None]:
''' PREDICTIONS '''

# Unseen example sentences to classify (intended as one per dialogue act).
# NOTE(review): these are vectorized as raw text. If count_vectorizer was fitted
# on text that went through text_clean, the same cleaning presumably needs to be
# applied here first - confirm against the column used for training.
new_utterance = [
    'I need you to finalize the reports by Monday',
    'The documentation can better explain it sometimes',
    'Plants function of phtosynthesis is crucial for sustaining life on Earth',
    'What are the strategies that you use to validate a models accuracy?',
]

# Encode with the already-fitted count_vectorizer (transform only, no re-fit)
# so the columns match the features the model was trained on.
new_X = count_vectorizer.transform(new_utterance)

# Predict one label per utterance and print the resulting array.
predicted_label = model.predict(new_X)
print(predicted_label)