In [227]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 

from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

# problem statement 

# data collection

In [228]:
data = pd.read_csv('Helpdesk Sample Emails - Train Set.tsv','\t')
data

Unnamed: 0,Target Category,Email Body
0,Desktop Hardware,Hey IT support team - my monitor is not turnin...
1,Desktop Software,Hey - I just got a new laptop and I need some ...
2,Networking,IT Support team - my wireless internet keeps g...
3,Account Access,Hey - we just got a new team member on the ma...
4,General Question,Hey how do I delete my internet cookies? I saw...
...,...,...
221,Networking,Team! I need a new LAN Cable. Please could you...
222,Networking,Im trying to connect my phone to the wifi netw...
223,Networking,Im going to be traveling to a conference soon ...
224,Networking,The network cable on the back of my machine ke...


In [229]:
data.iloc[100,1]

'Brett has received his new SIM card for EFB data and would like to know what to do with the old one.'

In [230]:
data.iloc[100,0]

'Networking'

In [231]:
data.shape

(226, 2)

# text cleaning and preprocessing

### 1.tokenization 
### 2.removing stopwords 
### 3.removing punchuations 
### 4.sentence to words 
### 5.words to its basic form using stemming and lamitization

In [232]:
import re 
from nltk.corpus import stopwords

## Stemming

In [233]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [234]:
# cleaning using re --- lowering -- removing stopwords ---- stemming 
def preprocess(data):
    corpus = []
    for i in range(len(data)):
        review = re.sub('[^a-zA-Z0-9]',' ',data['Email Body'][i])
        review = review.lower()
        review = review.split()
        review = [ps.stem(word) for word in review if word not in set(stopwords.words('english')) ]
        review = ' '.join(review)
        corpus.append(review)
    return corpus
   

In [235]:
corpus = preprocess(data)

## lematization

In [236]:
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

In [237]:
# cleaning using re --- lowering -- removing stopwords ---- stemming 

def preprocess_lmtiz(data):
    c = []
    for i in range(len(data)):
        r = re.sub('[^a-zA-Z0-9]',' ',data['Email Body'][i])
        r = r.lower()
        r = r.split()
        r = [wl.lemmatize(word) for word in r if word not in set(stopwords.words('english'))]
        r = ' '.join(r)
        c.append(r)
    return c

In [238]:
c= preprocess_lmtiz(data)

In [239]:
data['stemed']=corpus
data['lematized'] = c

## Word Embedding -- word to vector 

In [240]:
data

Unnamed: 0,Target Category,Email Body,stemed,lematized
0,Desktop Hardware,Hey IT support team - my monitor is not turnin...,hey support team monitor turn tri plug still n...,hey support team monitor turning tried plug st...
1,Desktop Software,Hey - I just got a new laptop and I need some ...,hey got new laptop need softwar old laptop som...,hey got new laptop need software old laptop so...
2,Networking,IT Support team - my wireless internet keeps g...,support team wireless internet keep go reset r...,support team wireless internet keep going rese...
3,Account Access,Hey - we just got a new team member on the ma...,hey got new team member market team tri get se...,hey got new team member marketing team trying ...
4,General Question,Hey how do I delete my internet cookies? I saw...,hey delet internet cooki saw movi netflix larg...,hey delete internet cooky saw movie netflix la...
...,...,...,...,...
221,Networking,Team! I need a new LAN Cable. Please could you...,team need new lan cabl pleas could send,team need new lan cable please could send
222,Networking,Im trying to connect my phone to the wifi netw...,im tri connect phone wifi network work keep dr...,im trying connect phone wifi network work keep...
223,Networking,Im going to be traveling to a conference soon ...,im go travel confer soon need mobil hotspot st...,im going traveling conference soon need mobile...
224,Networking,The network cable on the back of my machine ke...,network cabl back machin keep come unplug caus...,network cable back machine keep coming unplugg...


In [241]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True)

In [242]:
# creating BOW model 

#x = cv.fit_transform(data['stemed']).toarray()
x = cv.fit(data['lematized'])

In [279]:
x = cv.transform(data['lematized']).toarray()

In [280]:
y = data['Target Category']

# model building 

### 1. K- nearest Neighbour

In [281]:
from sklearn.model_selection import train_test_split

In [282]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0,stratify=y)

In [283]:
from sklearn.neighbors import KNeighborsClassifier

In [284]:
classifier = KNeighborsClassifier()
classifier.fit(x_train,y_train)

In [285]:
y_pred = classifier.predict(x_test)

### Evaluation

In [286]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report 

In [287]:
matrix = confusion_matrix(y_test,y_pred)

In [288]:
matrix

array([[9, 0, 2, 0, 0],
       [4, 0, 2, 0, 2],
       [4, 0, 9, 0, 0],
       [5, 0, 2, 0, 0],
       [6, 0, 0, 0, 1]], dtype=int64)

In [289]:
accuracy_score(y_test,y_pred)

0.41304347826086957

In [290]:
cr = classification_report(y_test,y_pred)
print(cr)

                  precision    recall  f1-score   support

  Account Access       0.32      0.82      0.46        11
Desktop Hardware       0.00      0.00      0.00         8
Desktop Software       0.60      0.69      0.64        13
General Question       0.00      0.00      0.00         7
      Networking       0.33      0.14      0.20         7

        accuracy                           0.41        46
       macro avg       0.25      0.33      0.26        46
    weighted avg       0.30      0.41      0.32        46



### 2 . Naive Bayes

#### A. Guassian NB  - continous bag of words

In [291]:
from sklearn.naive_bayes  import GaussianNB

In [292]:
nb = GaussianNB()

In [293]:
nb.fit(x_train,y_train)

In [294]:
nb_y = nb.predict(x_test)

In [295]:
acc1 = accuracy_score(nb_y,y_test)

In [296]:
acc1

0.5434782608695652

In [297]:
confusion_matrix(nb_y,y_test)

array([[5, 0, 4, 2, 0],
       [0, 5, 0, 1, 0],
       [3, 1, 5, 1, 0],
       [2, 0, 1, 3, 0],
       [1, 2, 3, 0, 7]], dtype=int64)

#### B.   Multinomial NB   ----- TF-IDF

In [298]:
from sklearn.naive_bayes import MultinomialNB

In [299]:
mnnb = MultinomialNB()

In [300]:
mnnb.fit(x_train,y_train)

In [301]:
mnnb_y = mnnb.predict(x_test)

In [302]:
acc2 = accuracy_score(mnnb_y,y_test)

In [303]:
acc2

0.6086956521739131

#### C.   Bernulis NB - Bag of Words

In [304]:
from sklearn.naive_bayes import BernoulliNB

In [305]:
brnb = BernoulliNB()

In [306]:
brnb.fit(x_train,y_train)

In [307]:
brnb_y = brnb.predict(x_test)

In [308]:
acc3 = accuracy_score(y_test,brnb_y)

In [309]:
acc3

0.41304347826086957

### 3.SVM --- support vector machine 

In [310]:
from sklearn.svm import SVC

In [311]:
svc = SVC()

In [312]:
svc.fit(x_train,y_train)

In [313]:
svc_pred = svc.predict(x_test)

In [314]:
svc_acc = accuracy_score(y_test,svc_pred)

In [315]:
svc_acc

0.5217391304347826

In [316]:
import pickle

In [317]:
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [318]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
# predict1 = loaded_model.predict('hey i am facing software issue can you help it please')
result = loaded_model.score(x_test, y_test)
print(result)

0.6086956521739131


In [319]:
loaded_model.predict([x[0]])

array(['Desktop Hardware'], dtype='<U16')

In [320]:
y[0]

'Desktop Hardware'

In [335]:
def desk_email(input_):
    input1 = re.sub('[^a-zA-Z0-9]',' ',input_)
    input1 = input1.lower()
    input1 = input1.split()
    input1 = [wl.lemmatize(word) for word in input1 if word not in set(stopwords.words('english'))]
    input1 = ' '.join(input1)
    x_test = cv.transform(input1)
    y_pred = loaded_model.predict(x_test)
    return y_pred
    
    

In [336]:
data = 'Brett has received his new SIM card for EFB data and would like to know what to do with the old one.'
desk_email(input_)

ValueError: Iterable over raw text documents expected, string object received.

In [337]:
input_ = ")()(((("
def c(input_):
    count=0
    for i in input_:
        if i==")":
            count+=1
    return count

In [338]:
c(input_)

2