<a href="https://colab.research.google.com/github/Ian-Sinclair/NLP-Multiclass-Topic-Modeling/blob/main/COMP_4705_AdvTpcs_Final_Project_Ian_Sinclair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Support Ticket Topic Modeling
COMP 4705 Data Analysis- Industry

Professor Dalton Crutchfield

Ian Sinclair

# Imports

In [None]:
import pandas as pd
import nltk
import numpy as np
import io
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from nltk.corpus import stopwords
from scipy import sparse
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
import sys
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, pos_tag
from google.colab import drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Collection (Uploading)

In [None]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


In [None]:
dataset_link = []

for file_index in range(7,99) :
  copied_path = 'Classified File Path'
  data = pd.read_csv(copied_path, usecols=['Summary',
                                               #'Issue Type',
                                               #'Status',
                                               #'Project name', 
                                               #'Project type',
                                               #'Labels',
                                               #'Description',
                                               #'Original estimate',
                                               #'Custom field (Category)',
                                               'Custom field (Job Failure Cause)',
                                               'Custom field (Job Failure Error)',
                                               #'Custom field (Job Failure Source)',
                                               'Custom field (Job Failure Sub-Cause)',
                                               #'Custom field (Job Name)',
                                               #'Custom field (Job Title)'
                                               ])
  dataset_link.append(data)

data = pd.concat(dataset_link)
data



# Data Processing

Here we filtered out empty row information from the input document corpus and reduced the dataset to two columns, with a single feature, (the error messages) and a set of classes, (the sub-catagories).

In [None]:

messy_data = data.dropna(subset=['Custom field (Job Failure Sub-Cause)',"Custom field (Job Failure Error)"])

filtered_data = messy_data[["Custom field (Job Failure Sub-Cause)",
                   "Custom field (Job Failure Error)"]]


sub_causes_headers = []

for index, row in data.iterrows() :
  if row["Custom field (Job Failure Sub-Cause)"] not in sub_causes_headers :
    sub_causes_headers.append(row["Custom field (Job Failure Sub-Cause)"])

#print(sub_causes_headers)

data = filtered_data
data


Initialized variables for the bag of words tokenizer, to be used in the pipeline.

In [None]:

BoW = CountVectorizer()

bag = BoW.fit_transform(data['Custom field (Job Failure Error)'])


Initialized tfidf vectorizer transformation and L2 norm preprocessing on document corpus.

In [None]:
tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)

raw_tfidf = tfidf.fit_transform(BoW.fit_transform(data['Custom field (Job Failure Error)'])).toarray()
raw_tfidf 

l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
l2_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Initialized functions to stem words in each document in the corpus, reducing the complexity of the vocabulary matrix.

In [None]:
porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
    

Set up stop words so unnecessary words can be removed from the document corpus.


In [None]:
nltk.download('stopwords')
stop = stopwords.words('english')
for i in data['Custom field (Job Failure Error)'] :  
  [w for w in tokenizer_porter(i) if w not in stop]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Model Optimization and Tokenizing

Initializes pipeline for bag of words, L2 norm vectorizer, and support vector classifier.

In [None]:
param_grid = [{'vect__stop_words': [stop, None],
               'vect__ngram_range': ((1, 1), (1, 2)),
               'clf__C': [0.1, 1, 10, 100, 1000],
               'clf__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
               'clf__kernel': ['rbf']},
              {'vect__ngram_range': ((1, 1), (1, 2)),
               'vect__stop_words': [stop, None],
               'tfidf__use_idf':[True, False],
               'tfidf__norm':[None,'l1', 'l2'],
               'clf__C': [0.1, 1, 10, 100, 1000],
               'clf__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
               'clf__kernel': ['rbf']}]
sgd_tfidf = Pipeline([('vect', BoW),
                      ('tfidf', tfidf),
                      ('clf', SVC())])

gs_sgd_tfidf = GridSearchCV(sgd_tfidf, param_grid,
                           scoring='accuracy',
                           cv=3, #cv = 5
                           verbose=2,
                           n_jobs=-1)

Splits document corpus into training and testing sets, 
NOTE: because of the size limitations of the dataset and the number of
classes, special care was taken to ensure the training set contained at  least one entry for every class.

In [None]:

X_train = []
y_train = []
X_test = []
y_test = []


for index, row in data.iterrows() :
  if row['Custom field (Job Failure Sub-Cause)'] not in y_test :
    X_test.append(row['Custom field (Job Failure Error)'])
    y_test.append(row['Custom field (Job Failure Sub-Cause)'])
  else :
    X_train.append(row['Custom field (Job Failure Error)'])
    y_train.append(row['Custom field (Job Failure Sub-Cause)'])

print(len(X_train))

for index in range(0,78) :
  X_test.append(X_train[index])
  y_test.append(y_train[index])
  X_train.pop(index)
  y_train.pop(index)


478


Fits the training data to the pipline.

In [None]:
gs_sgd_tfidf.fit(X_train, y_train)

Fitting 3 folds for each of 700 candidates, totalling 2100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1981 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 2100 out of 2100 | elapsed:  9.1min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

Records the CV accuracy from the training data after leaving the pipeline.

In [None]:
print('Best parameter set: %s ' % gs_sgd_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_sgd_tfidf.best_score_)

Best parameter set: {'clf__C': 100, 'clf__gamma': 0.01, 'clf__kernel': 'rbf', 'tfidf__norm': 'l2', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2), 'vect__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'th

Final accuracy of the testing data based on the support vector machine classifier from the training data.

In [None]:
clf = gs_sgd_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Test Accuracy: 0.348
