In [204]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")



import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import GridSearchCV

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os

from bs4 import BeautifulSoup


In [205]:
# Load the incident export; the file is read with the 'latin' codec.
df = pd.read_csv(filepath_or_buffer='./incident.csv', encoding='latin')

In [206]:
# Preview the first five rows of the raw incident frame.
df.head(n=5)

Unnamed: 0,u_product,assignment_group,short_description
0,Ariba (DRR) (703440),ITOPS-FIT-TTR-L2,IMPORTED RECEIPTS IN ARIBA FROM SAP!
1,SonicWall(DRR) (422697),ITOPS-DISD-L1,hi
2,Ariba (DRR) (703440),ITOPS-CORP-SUPPORT,Jabra Headset not functioning
3,Ariba (DRR) (703440),ITOPS-CORP-SUPPORT,I have a plantronics headset and it is not wor...
4,SonicWall(DRR) (422697),ITOPS-DISD-L1,Couldnot connect to VPN


In [207]:
# Class balance of the product label (rows per distinct u_product value).
product_counts = df['u_product'].value_counts()
product_counts

Corporate Oracle WebCenter (COWC) (379227)                 2835
Corporate Extended Oracle WebCenter (CEOWC) (417127)       1813
BQMS - BRAZIL QUALITY MANAGEMENT SYSTEM (35998)             995
Ariba (DRR) (703440)                                        343
HRDM - Human Resources Document Management (32300)          306
SonicWall(DRR) (422697)                                     235
EDMS - ELECTRONIC DOCUMENT MANAGEMENT SYSTEM (35994)         48
ES - EMPLOYMENT SERVICES (35995)                             37
MARS (Marketing and Advertising Review System) (250250)      17
Gale(DRR) (422694)                                            7
Performance Link (DRR) (883344)                               2
Softheon (DRR) (883348)                                       2
Force 10(DRR) (422695)                                        2
GOE - GLOBAL OPERATIONS ENGINEERING (35997)                   2
Name: u_product, dtype: int64

In [208]:
# The support level is whatever follows the last '-' in the assignment group
# (e.g. 'ITOPS-DISD-L1' -> 'L1', 'ITOPS-CORP-SUPPORT' -> 'SUPPORT').
df['level'] = df['assignment_group'].map(lambda group: group.rsplit('-', 1)[-1])

In [209]:
# Preview the first five rows again to check the derived 'level' column.
df.head(n=5)

Unnamed: 0,u_product,assignment_group,short_description,level
0,Ariba (DRR) (703440),ITOPS-FIT-TTR-L2,IMPORTED RECEIPTS IN ARIBA FROM SAP!,L2
1,SonicWall(DRR) (422697),ITOPS-DISD-L1,hi,L1
2,Ariba (DRR) (703440),ITOPS-CORP-SUPPORT,Jabra Headset not functioning,SUPPORT
3,Ariba (DRR) (703440),ITOPS-CORP-SUPPORT,I have a plantronics headset and it is not wor...,SUPPORT
4,SonicWall(DRR) (422697),ITOPS-DISD-L1,Couldnot connect to VPN,L1


In [210]:
# https://stackoverflow.com/a/47091490/4084039
import re

# Ordered (compiled pattern, replacement) rules used by decontracted().
# Order matters: the irregular stems ("won't", "can't") must be rewritten before
# the general "n't" rule, and "n't" before the bare "'t" rule.
# Matching is case-insensitive so that "Won't" / "CAN'T" are expanded as well;
# replacements are always emitted in lower case.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, flags=re.IGNORECASE), replacement)
    for pattern, replacement in (
        # specific
        (r"won't", "will not"),
        (r"can't", "can not"),
        # general
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
)


def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions written with a straight
        apostrophe (``'``).

    Returns
    -------
    str
        ``phrase`` with each contraction replaced by its expanded form,
        e.g. ``"can't"`` -> ``"can not"``, ``"they're"`` -> ``"they are"``.
        Matching ignores case, so ``"Won't"`` becomes ``"will not"`` rather
        than the broken ``"Wo not"``.

    Notes
    -----
    ``'s`` is always expanded to ``" is"``, so possessives are rewritten too
    (``"user's"`` -> ``"user is"``).
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase

# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
# <br /><br /> ==> after the above steps, we are getting "br br"
# we are including them into stop words list
# if the text had <br/> instead of <br />, these tags would already have been removed in the 1st step

# Stop-word vocabulary used by the cleaning step. The negations 'no', 'nor' and
# 'not' are deliberately absent so they survive filtering; 'br' is included to
# drop left-over HTML line-break residue.
# Note: this assignment rebinds the name `stopwords`, which the import cell also
# uses for `nltk.corpus.stopwords`; from here on it refers to this set.
stopwords = {
    'br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
    "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
    "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}

In [211]:
# Combining all the above cleaning steps
def clean_short_description(text):
    """Normalise one raw ticket description into a space-separated token string.

    Steps, in order: remove URLs, strip HTML markup, expand contractions via
    ``decontracted``, drop every whitespace-delimited token that contains a
    digit, replace each run of non-letters with a single space, lower-case the
    tokens and drop those present in ``stopwords``.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        Cleaned, lower-cased tokens joined by single spaces ('' if none remain).
    """
    text = re.sub(r"http\S+", "", text)
    text = BeautifulSoup(text, 'lxml').get_text()
    text = decontracted(text)
    # Raw string: "\S" and "\d" are invalid escape sequences in a plain literal.
    text = re.sub(r"\S*\d\S*", "", text).strip()
    text = re.sub('[^A-Za-z]+', ' ', text)
    # https://gist.github.com/sebleier/554280
    return ' '.join(e.lower() for e in text.split() if e.lower() not in stopwords)


# Missing descriptions (NaN) are treated as empty text so the regex steps do not
# fail on a float; the row count is unchanged, so the result still lines up with df.
# tqdm is for printing the status bar
short_desc = [
    clean_short_description(sentance)
    for sentance in tqdm(df['short_description'].fillna('').values)
]

100%|██████████| 6644/6644 [00:02<00:00, 3199.64it/s]


In [212]:
# Attach the cleaned text to the frame, row for row.
df['cleaned'] = pd.Series(short_desc, index=df.index)

In [213]:
# Product model: hold out the last 30% of rows. shuffle=False keeps the file
# order, so the split is a contiguous head/tail cut rather than a random sample.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['cleaned'],
    df['u_product'],
    test_size=0.3,
    shuffle=False,
)

In [214]:
# Learn the unigram + bigram TF-IDF vocabulary from the training split only and
# vectorise that split in the same pass.
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
X_train_tf_idf = tf_idf_vect.fit_transform(X_train)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version has it and fall back to the old name otherwise.
if hasattr(tf_idf_vect, "get_feature_names_out"):
    feature_names = tf_idf_vect.get_feature_names_out().tolist()
else:
    feature_names = tf_idf_vect.get_feature_names()
print("some sample features(unique words in the corpus)",feature_names[0:10])
print('='*50)

print("the type of count vectorizer ",type(X_train_tf_idf))
print("the shape of out text TFIDF vectorizer ",X_train_tf_idf.get_shape())
print("the number of unique words including both unigrams and bigrams ", X_train_tf_idf.get_shape()[1])

some sample features(unique words in the corpus) ['able', 'able access', 'able approve', 'able connect', 'able connecting', 'able create', 'able log', 'able login', 'able push', 'able raise']
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text TFIDF vectorizer  (4650, 3602)
the number of unique words including both unigrams and bigrams  3602


In [215]:
# Persist the fitted product vectoriser so the same vocabulary can be reloaded later.
with open('tf_idf_vect_incidents_product_6k.pickle', 'wb') as fh:
    pickle.dump(tf_idf_vect, fh, protocol=pickle.HIGHEST_PROTOCOL)


In [216]:
# Vectorise the held-out text with the vocabulary fitted on the training split.
X_test_tfidf = tf_idf_vect.transform(raw_documents=X_test)

In [217]:
# Show the sparse test matrix summary (shape, dtype, stored-element count).
X_test_tfidf

<1994x3602 sparse matrix of type '<class 'numpy.float64'>'
	with 22606 stored elements in Compressed Sparse Row format>

In [218]:
from xgboost import XGBClassifier

In [219]:
# Product model: XGBoost classifier with library-default hyper-parameters,
# fitted on the training TF-IDF matrix; the last line displays its score on the
# held-out split.
clf = XGBClassifier()
clf.fit(X=X_train_tf_idf, y=Y_train)
clf.score(X=X_test_tfidf, y=Y_test)

0.6780341023069207

In [221]:
# Persist the fitted product classifier next to its vectoriser.
with open('clf_incidents_product_6k.pickle', 'wb') as fh:
    pickle.dump(clf, fh, protocol=pickle.HIGHEST_PROTOCOL)

In [220]:
# Inspect the training labels of the product model.
# NOTE(review): this cell's recorded execution count (220) is lower than the
# cell before it (221), i.e. the cells were run out of order.
Y_train

0                                    Ariba (DRR) (703440)
1                                 SonicWall(DRR) (422697)
2                                    Ariba (DRR) (703440)
3                                    Ariba (DRR) (703440)
4                                 SonicWall(DRR) (422697)
5                                 SonicWall(DRR) (422697)
6       Corporate Extended Oracle WebCenter (CEOWC) (4...
7                                    Ariba (DRR) (703440)
8                                 SonicWall(DRR) (422697)
9                                    Ariba (DRR) (703440)
10                                SonicWall(DRR) (422697)
11             Corporate Oracle WebCenter (COWC) (379227)
12                                SonicWall(DRR) (422697)
13                       ES - EMPLOYMENT SERVICES (35995)
14                                SonicWall(DRR) (422697)
15                                SonicWall(DRR) (422697)
16                                SonicWall(DRR) (422697)
17            

In [128]:
df = pd.read_csv('./largest.csv', encoding='latin')

In [130]:
df['ClosureProductName'].value_counts()

MICROSOFT SQL SERVER                                      5702
ORACLE DATABASE                                           5577
DELTA (36122)                                             5348
OMEGA (34653)                                             4533
NETWORK DATA DEVICE                                       4253
POWEREDGE                                                 4087
Oracle Enterprise Manager (32850)                         3268
SFDC - Dell Main (31930)                                  3027
DragonFx DAO (32090)                                      2596
Oracle Transportation Management (OTM) (416055)           2152
OFS (34520)                                               2141
Foglight (Dell IT) (720743)                               2128
WINDOWS 2008 SERVER                                       2090
DSA - Dell Sales Application (903974)                     2090
GCM Pipeline Services (537008)                            2021
GII - APJ (36268)                                      

In [222]:
# Level model: same cleaned text, but the target is the support level and the
# last 20% of rows (file order, no shuffling) are held out.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['cleaned'],
    df['level'],
    test_size=0.2,
    shuffle=False,
)

In [223]:
# Learn the unigram + bigram TF-IDF vocabulary from the level-model training
# split only and vectorise that split in the same pass.
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
X_train_tf_idf = tf_idf_vect.fit_transform(X_train)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version has it and fall back to the old name otherwise.
if hasattr(tf_idf_vect, "get_feature_names_out"):
    feature_names = tf_idf_vect.get_feature_names_out().tolist()
else:
    feature_names = tf_idf_vect.get_feature_names()
print("some sample features(unique words in the corpus)",feature_names[0:10])
print('='*50)

print("the type of count vectorizer ",type(X_train_tf_idf))
print("the shape of out text TFIDF vectorizer ",X_train_tf_idf.get_shape())
print("the number of unique words including both unigrams and bigrams ", X_train_tf_idf.get_shape()[1])

# Vectorise the held-out text with the vocabulary fitted above.
X_test_tfidf = tf_idf_vect.transform(X_test)

some sample features(unique words in the corpus) ['able', 'able access', 'able approve', 'able connect', 'able connecting', 'able create', 'able log', 'able login', 'able push', 'able raise']
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text TFIDF vectorizer  (5315, 3783)
the number of unique words including both unigrams and bigrams  3783


In [224]:
# Persist the fitted level-model vectoriser.
with open('tf_idf_vect_level_product_6k.pickle', 'wb') as fh:
    pickle.dump(tf_idf_vect, fh, protocol=pickle.HIGHEST_PROTOCOL)


In [225]:
# Level model: XGBoost classifier with library-default hyper-parameters,
# fitted on the level training matrix; the last line displays its score on the
# held-out split.
clf = XGBClassifier()
clf.fit(X=X_train_tf_idf, y=Y_train)
clf.score(X=X_test_tfidf, y=Y_test)

0.7863054928517682

In [226]:
# Persist the fitted level classifier next to its vectoriser.
with open('clf_level_product_6k.pickle', 'wb') as fh:
    pickle.dump(clf, fh, protocol=pickle.HIGHEST_PROTOCOL)
