- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

## Text clasification using Ensemble Techniques

1. Author - Krishnav Dave 

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

In [1]:
# Natural Langauge Tool Kit
import nltk  # spacy
from nltk.stem.lancaster import LancasterStemmer  # Lancaster stemmer
stemmer = LancasterStemmer() # Lancaster stemmer

# Numeric computation
import numpy
import random



In [2]:
# Importing corpus
import json

# Import corpus file
with open('corpus.json') as file:
    Corpus = json.load(file)

# Display corpus file
print(Corpus)

{'intents': [{'tag': 'Intro', 'patterns': ['hi', 'how are you', 'is anyone there', 'hello', 'whats up', 'hey', 'yo', 'listen', 'please help me', 'i am learner from', 'i belong to', 'aiml batch', 'aifl batch', 'i am from', 'my pm is', 'blended', 'online', 'i am from', 'hey ya', 'talking to you for first time'], 'responses': ['Hello! how can i help you ?'], 'context_set': ''}, {'tag': 'Exit', 'patterns': ['thank you', 'thanks', 'cya', 'see you', 'later', 'see you later', 'goodbye', 'i am leaving', 'have a Good day', 'you helped me', 'thanks a lot', 'thanks a ton', 'you are the best', 'great help', 'too good', 'you are a good learning buddy'], 'responses': ['I hope I was able to assist you, Good Bye'], 'context_set': ''}, {'tag': 'Olympus', 'patterns': ['olympus', 'explain me how olympus works', 'I am not able to understand olympus', 'olympus window not working', 'no access to olympus', 'unable to see link in olympus', 'no link visible on olympus', 'whom to contact for olympus', 'lot of p

In [3]:
# Download "punkt" if missing
# nltk.download('punkt')

# Extract data
W = [] # Tokens 
L = [] # Identified Tags or Labels
doc_x = [] # Tokenised words
doc_y = [] # Tags or Labels

for intent in Corpus['intents']:
    for pattern in intent['patterns']:
        w_temp = nltk.word_tokenize(pattern)
        W.extend(w_temp)
        doc_x.append(w_temp)
        doc_y.append(intent["tag"])
    
    # Add the mising tag if any    
    if intent['tag'] not in L:
        L.append(intent['tag'])

In [4]:
# Stemming
W = [stemmer.stem(w.lower()) for w in W if w != "?"] # Stemming or learning the root word
W = sorted(list(set(W))) # Sorted words
L = sorted(L) # Sorted list of tags or labels

In [5]:
# Words
W

['a',
 'abl',
 'access',
 'act',
 'ad',
 'adam',
 'aifl',
 'aiml',
 'am',
 'an',
 'anyon',
 'ar',
 'art',
 'backward',
 'bad',
 'bag',
 'batch',
 'bay',
 'belong',
 'best',
 'blend',
 'bloody',
 'boost',
 'bot',
 'buddy',
 'class',
 'contact',
 'cre',
 'cross',
 'cya',
 'day',
 'deep',
 'did',
 'diffult',
 'do',
 'ensembl',
 'epoch',
 'explain',
 'first',
 'for',
 'forest',
 'forward',
 'from',
 'funct',
 'good',
 'goodby',
 'grady',
 'gre',
 'hat',
 'hav',
 'hel',
 'hello',
 'help',
 'hey',
 'hi',
 'hid',
 'hour',
 'how',
 'hyp',
 'i',
 'imput',
 'in',
 'intellig',
 'is',
 'jerk',
 'jok',
 'knn',
 'lat',
 'lay',
 'learn',
 'leav',
 'link',
 'list',
 'log',
 'lot',
 'machin',
 'me',
 'ml',
 'my',
 'naiv',
 'nam',
 'nb',
 'net',
 'network',
 'neur',
 'no',
 'not',
 'of',
 'olymp',
 'olyp',
 'on',
 'onlin',
 'op',
 'opert',
 'otim',
 'paramet',
 'piec',
 'pleas',
 'pm',
 'problem',
 'prop',
 'random',
 'regress',
 'relu',
 'screw',
 'see',
 'sgd',
 'shit',
 'sigmoid',
 'sl',
 'smart',
 '

In [6]:
# Tags
L

['Bot', 'Exit', 'Intro', 'NN', 'Olympus', 'Profane', 'SL', 'Ticket']

In [7]:
Train = [] # Training data for NN
Target = [] # Target data for NN

out_empty = [0 for _ in range(len(L))]

# Loop to create bag of words and put the frequency count on each word
for x, doc in enumerate(doc_x):
    bag = []

    w_temp = [stemmer.stem(w.lower()) for w in doc]

    for w in W:
        if w in w_temp:
            bag.append(1)
        else:
            bag.append(0)

    output_row = out_empty[:]
    output_row[L.index(doc_y[x])] = 1

    Train.append(bag) # List
    Target.append(output_row) # List

In [8]:
# convert training data and output to numpy arrays

Train = numpy.array(Train) # List to numpy arrray
Target = numpy.array(Target) # List to numpy arrray

In [11]:
Train[:1]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [10]:
# Test Train Split

from sklearn.model_selection import train_test_split
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(Train, Target, test_size=0.25, random_state=10)

In [12]:
# Decision Tree

# Library
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score, confusion_matrix

# Model
model_DT=DecisionTreeClassifier(criterion='entropy',
                                splitter='best',
                                max_depth=5,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                max_features=None,
                                random_state=None,
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                class_weight=None,
                                ccp_alpha=0.0,)

model_DT.fit(X_train_scaled, y_train)

# Accuracy
DT_Train=model_DT.score(X_train_scaled, y_train)
DT_Test=model_DT.score(X_test_scaled, y_test)

# Output
print("Train Accuracy:",DT_Train)
print("Test Accuracy:",DT_Test)

Train Accuracy: 0.2604166666666667
Test Accuracy: 0.3125


In [13]:
# RANDOM FOREST

# Library
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Model
model_RF = RandomForestClassifier(n_estimators=100,
                                    criterion='gini',
                                    max_depth=None,
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    max_features='auto',
                                    max_leaf_nodes=None,
                                    min_impurity_decrease=0.0,
                                    bootstrap=True,
                                    oob_score=False,
                                    n_jobs=None,
                                    random_state=None,
                                    verbose=0,
                                    warm_start=False,
                                    class_weight=None,
                                    ccp_alpha=0.0,
                                    max_samples=None,)
model_RF.fit(X_train_scaled, y_train)

# Accuracy
pred_RF = model_RF.predict(X_test_scaled)
RF_Train = model_RF.score(X_train_scaled, y_train)
RF_Test = accuracy_score(y_test, pred_RF)

# Output
print("Train Accuracy:",RF_Train)
print("Test Accuracy:",RF_Test)

Train Accuracy: 1.0
Test Accuracy: 0.53125


In [14]:
def bag_of_words(s, W):
    bag = [0 for _ in range(len(W))]

    s_words = nltk.word_tokenize(s)
    s_words = [stemmer.stem(word.lower()) for word in s_words]

    for se in s_words:
        for i, w in enumerate(W):
            if w == se:
                bag[i] = 1
            
    return numpy.array(bag)


def chat():
    print("Chat with Ramos (type: stop to quit)")
    print("If answer is not right (type: *)")
    while True:
        inp = input("\n\nYou: ")
        if inp.lower()=="*":
            print("BOT: Please rephrase your question and try again")
        if inp.lower() == "quit":
            break

        results = model_RF.predict([bag_of_words(inp, W)])
        results_index = numpy.argmax(results)
        tag = L[results_index]

        for tg in Corpus["intents"]:
            if tg['tag'] == tag:
                responses = tg['responses']

        print(random.choice(responses))

In [15]:
chat()

Chat with Ramos (type: stop to quit)
If answer is not right (type: *)


You: Hello
Hello! how can i help you ?


You: hey yo
Hello! how can i help you ?


You: my name is Krishnav
I am your virtual learning assistant


You: not able to understand neural networks
Link: Neural Nets wiki


You: you are stupid
I am your virtual learning assistant


You: jerk
Please use respectful words


You: *
BOT: Please rephrase your question and try again
I am your virtual learning assistant


You: quit
