In [136]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [140]:
df = pd.read_csv("/Users/anbschool0015/Downloads/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv")
df.head()

Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...


In [141]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   flags        26872 non-null  object
 1   instruction  26872 non-null  object
 2   category     26872 non-null  object
 3   intent       26872 non-null  object
 4   response     26872 non-null  object
dtypes: object(5)
memory usage: 1.0+ MB


In [142]:
df = df.drop(['flags','category'],axis = 1)
df.head()

Unnamed: 0,instruction,intent,response
0,question about cancelling order {{Order Number}},cancel_order,I've understood you have a question regarding ...
1,i have a question about cancelling oorder {{Or...,cancel_order,I've been informed that you have a question ab...
2,i need help cancelling puchase {{Order Number}},cancel_order,I can sense that you're seeking assistance wit...
3,I need to cancel purchase {{Order Number}},cancel_order,I understood that you need assistance with can...
4,"I cannot afford this order, cancel purchase {{...",cancel_order,I'm sensitive to the fact that you're facing f...


In [77]:
stopwords = stopwords.words('english')
print(stopwords[0:10])
print(len(stopwords))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
179


In [81]:
lm = WordNetLemmatizer()
corpus = []
for i in range (len(df)):
    review = re.sub('^a-zA-Z0-9',' ', df['instruction'][i])
    review = review.lower()
    review = review.split()
    review = [lm.lemmatize(x) for x in review if x not in stopwords]
    review = " ".join(review)
    corpus.append(review)

In [82]:
len(corpus)

26872

In [83]:
tf = TfidfVectorizer()
x = tf.fit_transform(corpus).toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [84]:
y = df['intent']
y.head()

0    cancel_order
1    cancel_order
2    cancel_order
3    cancel_order
4    cancel_order
Name: intent, dtype: object

In [85]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 10, stratify = y )

In [86]:
len(x_train),len(y_train)

(18810, 18810)

In [87]:
len(x_test), len(y_test)

(8062, 8062)

In [88]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

In [89]:
y_pred = rf.predict(x_test)
accuracy_score_ = accuracy_score(y_test,y_pred) 
accuracy_score_

0.9875961299925576

In [90]:
class Evaluation:
    def __init__(self, model, x_train, x_test, y_train, y_test):
        self.model = model
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test
        
    def train_evaluation(self):
        y_pred_train = self.model.predict(self.x_train)
        
        acc_scr_train = accuracy_score(self.y_train, y_pred_train)
        print("Accuracy Score On Training Data Set:", acc_scr_train)
        print()
        
        con_mat_train = confusion_matrix(self.y_train, y_pred_train)
        print("Confusion Matrix On Training Data Set:\n", con_mat_train)
        print()
        
        class_rep_train = classification_report(self.y_train, y_pred_train)
        print("Classification Report On Training Data Set:\n", class_rep_train)
        
        
    def test_evaluation(self):
        y_pred_test = self.model.predict(self.x_test)
        
        acc_scr_test = accuracy_score(self.y_test, y_pred_test)
        print("Accuracy Score On Testing Data Set:", acc_scr_test)
        print()
        
        con_mat_test = confusion_matrix(self.y_test, y_pred_test)
        print("Confusion Matrix On Testing Data Set:\n", con_mat_test)
        print()
        
        class_rep_test = classification_report(self.y_test, y_pred_test)
        print("Classification Report On Testing Data Set:\n", class_rep_test)


In [91]:
#Checking the accuracy on training dataset

Evaluation(rf,x_train, x_test, y_train, y_test).train_evaluation()


Accuracy Score On Training Data Set: 1.0

Confusion Matrix On Training Data Set:
 [[699   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  0 698   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  0   0 681   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  0   0   0 665   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  0   0   0   0 700   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 699   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0 698   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0 700   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  0   0   0 

In [92]:
#Checking the accuracy on testing dataset
Evaluation(rf,x_train, x_test, y_train, y_test).test_evaluation()

Accuracy Score On Testing Data Set: 0.9875961299925576

Confusion Matrix On Testing Data Set:
 [[294   4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   1   0   0   0   0   0   0   0]
 [  5 284   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
    0   9   0   0   0   0   0   0   0]
 [  0   1 284   0   0   0   0   0   1   0   0   1   1   0   1   0   0   0
    0   0   0   0   0   3   0   0   0]
 [  0   0   0 285   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  0   0   0   0 292   0   0   0   0   0   0   0   0   0   0   8   0   0
    0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 300   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0 298   0   0   0   0   0   0   0   0   0   1   0
    0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0 300   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 

In [93]:
class Preprocessing:
    
    def __init__(self,data):
        self.data = data
        
    def text_preprocessing_user(self):
        lm = WordNetLemmatizer()
        pred_data = [self.data]    
        preprocess_data = []
        for data in pred_data:
            review = re.sub('^a-zA-Z0-9',' ', data)
            review = review.lower()
            review = review.split()
            review = [lm.lemmatize(x) for x in review if x not in stopwords]
            review = " ".join(review)
            preprocess_data.append(review)
        return preprocess_data    

In [95]:
class Prediction:
    
    def __init__(self,pred_data, model):
        self.pred_data = pred_data
        self.model = model
        
    def prediction_model(self):
        preprocess_data = Preprocessing(self.pred_data).text_preprocessing_user()
        data = tf.transform(preprocess_data)
        prediction = self.model.predict(data)
        
        return prediction
        

In [134]:
while True:
    data = input("You: ")
    print('Ah Chak:',Prediction(data,rf).prediction_model())
    if data == 'quit':
        break

You: hi
Ah Chak: ['contact_human_agent']
You: i want to purchase something
Ah Chak: ['place_order']
You: how can i get my money back
Ah Chak: ['get_refund']
You: pls tell me about create fking acc
Ah Chak: ['create_account']
You: how many delivery do you have
Ah Chak: ['delivery_options']
You: how long does my shipping arrive
Ah Chak: ['delivery_period']
You: tell me about product
Ah Chak: ['place_order']
You: quit
Ah Chak: ['contact_human_agent']
