# <center><b>NLP_Project_1_Part_1</b></center>

In [1]:
# Mounting the personal drive to import the data for the project

from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Importing libraries

# Basic
import os
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Related to NLP and ML
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# nltk.download('wordnet')
# from nltk.stem import WordNetLemmatizer
# from nltk.tokenize import WhitespaceTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
file_path = r'/content/drive/MyDrive/Colab Data/NLP_Project/Part_1'
os.chdir(file_path)
os.getcwd()

'/content/drive/MyDrive/Colab Data/NLP_Project/Part_1'

## 1. Import and analyse the data set

In [4]:
blog_data = pd.read_csv("blogtext.csv")

In [5]:
print('Shape of the blog data : ', blog_data.shape)

Shape of the blog data :  (681284, 7)


In [6]:
# Since the dataset is too big at computational level. Hence we will take it partially for faster execution.

blog_data = blog_data.head(11111)

In [7]:
# Let us look at some random samples
# Re-run to check different samples

blog_data.sample(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
5164,1103575,female,17,indUnk,Scorpio,"23,January,2004",Damien Rice - Cannonball Still a littl...
9895,3291253,male,27,Education,Virgo,"26,May,2004",urlLink The New York Times > Opini...
1307,3647746,male,39,Education,Virgo,"11,August,2004","Seriously, has it come to this? La..."
8014,2635745,female,15,Student,Pisces,"01,August,2004",I felt like several of these answers su...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...


In [8]:
# Checking the info on the dataset

blog_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11111 entries, 0 to 11110
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      11111 non-null  int64 
 1   gender  11111 non-null  object
 2   age     11111 non-null  int64 
 3   topic   11111 non-null  object
 4   sign    11111 non-null  object
 5   date    11111 non-null  object
 6   text    11111 non-null  object
dtypes: int64(2), object(5)
memory usage: 607.8+ KB


Let's analyse the distribution of data based on various attributes which can be categorised swiftly

In [9]:
# Based on gender

blog_data['gender'].value_counts()

male      6484
female    4627
Name: gender, dtype: int64

Here we can infer that blogging can't be discriminated based on gender as we can see that numbers are almost equal

In [10]:
# Based on gender

blog_data['age'].value_counts()

35    2333
36    1708
17    1360
27    1186
24     655
15     602
34     553
16     512
25     489
23     453
26     360
33     351
14     229
39     102
13      60
38      46
37      37
41      20
45      16
42      14
47       8
46       7
43       6
44       3
40       1
Name: age, dtype: int64

We can see that most bloggers are of young age

In [11]:
# Based on topic

blog_data['topic'].value_counts()

indUnk                     3666
Technology                 2773
Fashion                    1622
Student                    1340
Education                   299
Arts                        183
Marketing                   156
Internet                    137
Engineering                 127
Communications-Media         99
BusinessServices             91
Sports-Recreation            80
Non-Profit                   79
InvestmentBanking            70
Science                      67
Government                   65
Transportation               46
LawEnforcement-Security      46
Architecture                 39
Banking                      21
Consulting                   21
Museums-Libraries            17
Military                     16
Automotive                   14
Law                          11
Religion                      9
Advertising                   5
Publishing                    4
Accounting                    4
HumanResources                2
Telecommunications            2
Name: to

So, the blogs are about different topics with few more in numbers than rest

In [12]:
# Checking for null values

blog_data.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

We can see that there are no null values

## 2. Perform data pre-processing on the data:
- Data cleansing by removing unwanted characters, spaces, stop words etc 
- Convert text to lowercase.
- Target/label merger and transformation
- Train and test split
- Vectorisation, etc.

In [13]:
# Removing unwanted characters other than alphanumeric ones and displaying the changes

pattern = '[^\w ]'
blog_data['text'] = blog_data['text'].apply(lambda s : re.sub(pattern,'',s))
blog_data['text'].head()

0               Info has been found  100 pages and ...
1               These are the team members   Drewes...
2               In het kader van kernfusie op aarde...
3                           testing  testing          
4                 Thanks to Yahoos Toolbar I can no...
Name: text, dtype: object

In [14]:
# Removing unwanted spaces and displaying the changes

blog_data['text'] = blog_data['text'].apply(lambda s: s.strip())
blog_data['text'].head()

0    Info has been found  100 pages and 45 MB of pd...
1    These are the team members   Drewes van der La...
2    In het kader van kernfusie op aarde  MAAK JE E...
3                                     testing  testing
4    Thanks to Yahoos Toolbar I can now capture the...
Name: text, dtype: object

In [15]:
# Converting text to lower and displaying the changes

blog_data.text = blog_data.text.apply(lambda s: s.lower())
blog_data.text.head()

0    info has been found  100 pages and 45 mb of pd...
1    these are the team members   drewes van der la...
2    in het kader van kernfusie op aarde  maak je e...
3                                     testing  testing
4    thanks to yahoos toolbar i can now capture the...
Name: text, dtype: object

In [16]:
# Removing stopwords and displaying the changes

stopwords = set(stopwords.words('english'))

blog_data['text'] = blog_data['text'].apply(lambda t: ' '.join([words for words in t.split() if words not in stopwords]))
blog_data['text'].head()

0    info found 100 pages 45 mb pdf files wait unti...
1    team members drewes van der laag urllink mail ...
2    het kader van kernfusie op aarde maak je eigen...
3                                      testing testing
4    thanks yahoos toolbar capture urls popupswhich...
Name: text, dtype: object

In [17]:
# # Performing lemmatization for getting the roots
# # Step has been skipped owing to time constraints

# w_s_tokenizer = WhitespaceTokenizer()
# w_n_lemmatizer = WordNetLemmatizer()

# def lemmatize_text(text):
#     lemma = [w_n_lemmatizer.lemmatize(w) for w in w_s_tokenizer.tokenize(text)]
#     return(' '.join(lemma)) 

# blog_data['text'] = blog_data['text'].apply(lemmatize_text)
# blog_data['text'].head()

In [18]:
# Merging relevant columns as we are interested in Multi Label Classification

blog_data['multi_label'] = blog_data.apply(lambda col : [col['gender'], col['age'], col['topic'], col['sign']], axis=1)
blog_data.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,multi_label
0,2059027,male,15,Student,Leo,"14,May,2004",info found 100 pages 45 mb pdf files wait unti...,"[male, 15, Student, Leo]"
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing,"[male, 15, Student, Leo]"
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...,"[male, 33, InvestmentBanking, Aquarius]"


In [19]:
# Displaying data for analysing the columns that needs to be filtered out

blog_data.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,multi_label
0,2059027,male,15,Student,Leo,"14,May,2004",info found 100 pages 45 mb pdf files wait unti...,"[male, 15, Student, Leo]"
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing,"[male, 15, Student, Leo]"
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...,"[male, 33, InvestmentBanking, Aquarius]"


In [20]:
# Dropping irrelevant columns

blog_data.drop(labels=['id', 'gender', 'age', 'topic', 'sign', 'date'], axis=1, inplace=True)
blog_data.head(5)

Unnamed: 0,text,multi_label
0,info found 100 pages 45 mb pdf files wait unti...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoos toolbar capture urls popupswhich...,"[male, 33, InvestmentBanking, Aquarius]"


In [21]:
# Splitting the data as features and labels

X = blog_data['text']
y = blog_data['multi_label']

In [22]:
# Splitting the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101, test_size = 0.25)

In [23]:
# Checking the shapes

print('Training instances are : {}'.format(X_train.shape[0]))
print('Validation instances are : {}'.format(X_test.shape[0]))

Training instances are : 8333
Validation instances are : 2778


In [24]:
# Vectorising the data
# Creating Bag of Words

vectorizer = CountVectorizer(min_df = 2,ngram_range = (1,2),stop_words = "english")
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [25]:
# Checking the vocablury size

len(vectorizer.vocabulary_)

72033

In [26]:
label_counts=dict()

for labels in blog_data['multi_label'].values:
    for label in labels:
        if label in label_counts:
            label_counts[str(label)]+=1
        else:
            label_counts[str(label)]=1

label_counts

{'13': 1,
 '14': 1,
 '15': 1,
 '16': 1,
 '17': 1,
 '23': 1,
 '24': 1,
 '25': 1,
 '26': 1,
 '27': 1,
 '33': 1,
 '34': 1,
 '35': 1,
 '36': 1,
 '37': 1,
 '38': 1,
 '39': 1,
 '40': 1,
 '41': 1,
 '42': 1,
 '43': 1,
 '44': 1,
 '45': 1,
 '46': 1,
 '47': 1,
 'Accounting': 4,
 'Advertising': 5,
 'Aquarius': 595,
 'Architecture': 39,
 'Aries': 4198,
 'Arts': 183,
 'Automotive': 14,
 'Banking': 21,
 'BusinessServices': 91,
 'Cancer': 877,
 'Capricorn': 382,
 'Communications-Media': 99,
 'Consulting': 21,
 'Education': 299,
 'Engineering': 127,
 'Fashion': 1622,
 'Gemini': 194,
 'Government': 65,
 'HumanResources': 2,
 'Internet': 137,
 'InvestmentBanking': 70,
 'Law': 11,
 'LawEnforcement-Security': 46,
 'Leo': 303,
 'Libra': 491,
 'Marketing': 156,
 'Military': 16,
 'Museums-Libraries': 17,
 'Non-Profit': 79,
 'Pisces': 469,
 'Publishing': 4,
 'Religion': 9,
 'Sagittarius': 1219,
 'Science': 67,
 'Scorpio': 1066,
 'Sports-Recreation': 80,
 'Student': 1340,
 'Taurus': 882,
 'Technology': 2773,
 '

In [27]:
# Using multi label binarizer as we are solving multi label classification

ml_binarizer=MultiLabelBinarizer(classes=sorted(label_counts.keys()))
ml_binarizer.fit(y_train)

MultiLabelBinarizer(classes=['13', '14', '15', '16', '17', '23', '24', '25',
                             '26', '27', '33', '34', '35', '36', '37', '38',
                             '39', '40', '41', '42', '43', '44', '45', '46',
                             '47', 'Accounting', 'Advertising', 'Aquarius',
                             'Architecture', 'Aries', ...])

In [28]:
y_train = ml_binarizer.transform(y_train)
y_train

  "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))


array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [29]:
y_test = ml_binarizer.transform(y_test)
y_test

  "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))


array([[0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 0, 0]])

## 3. Design, train, tune and test the best text classifier

#### Random Forest with OneVsRestClassifier

In [30]:
# Training and tuning the model
# Please note that the tuning process was done manually in order to save execution time 

model1 =  RandomForestClassifier(n_estimators=100, random_state = 101) 
model1 = OneVsRestClassifier(model1)
model1.fit(X_train,y_train)
y_pred = model1.predict(X_test)

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

In [31]:
print("Train Accuracy:",model1.score(X_train,y_train))

Train Accuracy: 0.9871594863794552


In [32]:
print("Test Accuracy:",model1.score(X_test,y_test))

Test Accuracy: 0.17062634989200864


Here we can see that the model is overfitting

In [33]:
# # Please note that the tuning process was done manually in order to save execution time
# # Code put here is to showcase the capability

# # Tuning the hyperparameters

# parameters = {
#                 'max_depth': [1,5,10,15,20,45],
#                 'max_features': ['auto', 'sqrt'],
#                 'min_samples_split': [5, 10],
#                 'n_estimators': [20, 65, 110, 155, 200]
#              }

# random_grid = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=101),param_distributions=parameters, 
#                           scoring='accuracy', verbose=1, n_jobs=-1,cv=5,random_state=101) 

# random_grid_result = random_grid.fit(X_train, y_train)

# print('Best Score: ', random_grid_result.best_score_) 
# print('Best Params: ', random_grid_result.best_params_) 

In [34]:
# Classification Report

print("Classification Report : ")
print(classification_report(y_test, y_pred))

Classification Report : 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Logistic Regression with OneVsRestClassifier

In [35]:
# Training and tuning the model
# Please note that the tuning process was done manually in order to save execution time 

model1 = LogisticRegression(C=4.0, penalty='l2', dual=False, solver='liblinear', random_state=101, max_iter=1000) 
model1 = OneVsRestClassifier(model1)
model1.fit(X_train,y_train)
y_pred = model1.predict(X_test)

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

In [36]:
print("Train Accuracy:",model1.score(X_train,y_train))

Train Accuracy: 0.9566782671306853


In [37]:
print("Test Accuracy:",model1.score(X_test,y_test))

Test Accuracy: 0.2609791216702664


Here we can see that the model is overfitting

In [38]:
# # Please note that the tuning process was done manually in order to save execution time
# # Code put here is to showcase the capability
# # Create the parameter grid based on the results of random search 
# param_grid = {
#     #'bootstrap': [True],
#     #'max_depth': [80, 90, 100, 110],
#     #'max_features': [2, 3],
#     #'min_samples_leaf': [3, 4, 5],
#     #'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 500]
# }

# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, 
#                           cv = 3, n_jobs = -1, verbose = 2)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)
# grid_search.best_params_

## 4. Display and explain detail the classification report

In [39]:
# Classification Report

print("Classification Report : ")
print(classification_report(y_test, y_pred))

Classification Report : 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Explanation of classification report :

- We can see that the class level accuracies are too low.
- The low values can be owing to imbalance in our data.
- Gap can be seen in the values of precision and recall and is totally dependent on business requirements and hence currently nothing can be commented on the same.
- So, in the current scenario, micro - average score will be appropriate as it takes into account each instances equally rather than its counter part macro - avgerage score which considers all classes equally.
- As we have used data partially owing to faster execution and to keep computations simpler we see missing values in the report as well.
- We can see that Logistic Regression with OneVsRestClassifier performs better than Random Forest with OneVsRestClassifier in terms of overall and class level accuracy.


## 5.Print the true vs predicted labels for any 5 entries from the dataset

In [40]:
# Pulling out 5 entries
rec_no = []
for i in range(5):
  rec_no.append(random.randint(10, len(y_pred)))

# Counter for 5 entries  
count = 0

for j in rec_no:
  print('Entry No :',count+1)
  print('True Labels : ', ml_binarizer.inverse_transform(y_test)[j])
  print('Predicted Labels : ', ml_binarizer.inverse_transform(y_pred)[j])
  print('\n')
  count+=1

Entry No : 1
True Labels :  ('Aquarius', 'Education', 'female')
Predicted Labels :  ('male',)


Entry No : 2
True Labels :  ('Sagittarius', 'Technology', 'female')
Predicted Labels :  ()


Entry No : 3
True Labels :  ('Capricorn', 'Technology', 'male')
Predicted Labels :  ('Aries', 'Technology', 'male')


Entry No : 4
True Labels :  ('Pisces', 'female', 'indUnk')
Predicted Labels :  ('Aries', 'male')


Entry No : 5
True Labels :  ('Aries', 'Technology', 'male')
Predicted Labels :  ('indUnk',)




As can be seen here as well that predictions are not perfect and there are missing values owing to the reason as stated before that only a part of the data has been utilised and to the fact that data is imbalanced.

# <center><b>NLP_Project_1_Part_2</b></center>

In [41]:
# Mounting the personal drive to import the data for the project

from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [42]:
# Importing the libraries

# For removal of the warnings
import warnings
warnings.filterwarnings("ignore")

# Basic
import os
import json
import string
import random
import numpy as np

# NLP related
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download("wordnet")
%tensorflow_version 1.x
import tensorflow as tf

# DL related
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Dense, Dropout
from tensorflow.python.keras.optimizers import Adam

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
TensorFlow 1.x selected.


In [43]:
# Importing the corpus
# Please note that the corpus has been modified as per the requirements mentioned in the project

file_path = r'/content/drive/MyDrive/Colab Data/NLP_Project/Part_2'
os.chdir(file_path)
os.getcwd()

'/content/drive/MyDrive/Colab Data/NLP_Project/Part_2'

In [44]:
# Accessing the modified corpus

with open('GL_Bot_Corpus_Enhanced.json') as bot_corpus:
    corpus_data = json.load(bot_corpus)

In [45]:
# Displaying the data in the corpus

print(corpus_data)

{'intents': [{'tag': 'Intro', 'patterns': ['hi', 'how are you', 'is anyone there', 'hello', 'whats up', 'hey', 'yo', 'listen', 'please help me', 'i am learner from', 'i belong to', 'aiml batch', 'aifl batch', 'i am from', 'my pm is', 'blended', 'online', 'i am from', 'hey ya', 'talking to you for first time', "hey what's up", 'how are you doing', 'can you help me', 'hey ya', 'hola'], 'responses': ['Hello! how can i help you ?'], 'context_set': ''}, {'tag': 'Exit', 'patterns': ['thank you', 'thanks', 'cya', 'see you', 'later', 'see you later', 'goodbye', 'i am leaving', 'have a Good day', 'you helped me', 'thanks a lot', 'thanks a ton', 'you are the best', 'great help', 'too good', 'you are a good learning buddy', 'ciao', 'adios', 'thanks a lot buddy', 'helped a lot', 'thanks for assistance', 'great aid', 'thanks for aid'], 'responses': ['I hope I was able to assist you, Good Bye'], 'context_set': ''}, {'tag': 'Olympus', 'patterns': ['olympus', 'explain me how olympus works', 'I am not 

In [46]:
# Displaying in the original format for clarity

print(json.dumps(corpus_data, indent = 3))

{
   "intents": [
      {
         "tag": "Intro",
         "patterns": [
            "hi",
            "how are you",
            "is anyone there",
            "hello",
            "whats up",
            "hey",
            "yo",
            "listen",
            "please help me",
            "i am learner from",
            "i belong to",
            "aiml batch",
            "aifl batch",
            "i am from",
            "my pm is",
            "blended",
            "online",
            "i am from",
            "hey ya",
            "talking to you for first time",
            "hey what's up",
            "how are you doing",
            "can you help me",
            "hey ya",
            "hola"
         ],
         "responses": [
            "Hello! how can i help you ?"
         ],
         "context_set": ""
      },
      {
         "tag": "Exit",
         "patterns": [
            "thank you",
            "thanks",
            "cya",
            "see you",
            "l

In [47]:
# Using lemmatizer to get the base of words
 
lemmatizer = WordNetLemmatizer()

In [48]:
# Placeholders for data extraction

words = []
classes = []
doc_X = []
doc_y = []

# Performing the tokenisation

for intent in corpus_data["intents"]:
    for pattern in intent["patterns"]:
        tokens = nltk.word_tokenize(pattern)
        words.extend(tokens)
        doc_X.append(pattern)
        doc_y.append(intent["tag"])
    
    # Adding the missing tag in any 
    if intent["tag"] not in classes:
        classes.append(intent["tag"])

In [49]:
# Lemmatizing all the words in the vocablury
# Converting to lowercase
# For words not appearing in punctuation


words = [lemmatizer.lemmatize(word.lower()) for word in words if word not in string.punctuation]

In [50]:
# Sorting the words and classes and removing any scope for duplication

words = sorted(set(words))
classes = sorted(set(classes))

In [51]:
import pandas as pd

print('The tokens are:\n', pd.Series(words), '\n') 
print('The identified tags are:\n', pd.Series(classes), '\n')
print('The tokenised words are:\n', pd.Series(doc_X), '\n')
print('The labels are:\n', pd.Series(doc_y), '\n')

The tokens are:
 0              's
1               a
2            able
3          access
4      activation
          ...    
192         wrong
193            ya
194            yo
195           you
196          your
Length: 197, dtype: object 

The identified tags are:
 0        Bot
1      Email
2       Exit
3      Intro
4         NN
5     Office
6    Olympus
7    Profane
8         SL
9     Ticket
dtype: object 

The tokenised words are:
 0                      hi
1             how are you
2         is anyone there
3                   hello
4                whats up
              ...        
163    login unsuccessful
164           error login
165      wrong login cred
166                 email
167       unable to login
Length: 168, dtype: object 

The labels are:
 0      Intro
1      Intro
2      Intro
3      Intro
4      Intro
       ...  
163    Email
164    Email
165    Email
166    Email
167    Email
Length: 168, dtype: object 



In [52]:
# Placeholder for training data 

training = []

out_empty = [0 for i in range(len(classes))]

# Creating the bag of words and adding the frequency

for idx, doc in enumerate(doc_X):
    
    bow = []
    text = lemmatizer.lemmatize(doc.lower())
    
    for word in words:
        bow.append(1) if word in text else bow.append(0)
    
    output_row = list(out_empty)
    output_row[classes.index(doc_y[idx])] = 1
    
    # For training adding the OHE BOW and relative classes  
    training.append([bow, output_row])

training = np.array(training, dtype=object)

In [53]:
# Splitting the features and target

train_X = np.array(list(training[:, 0]))
train_y = np.array(list(training[:, 1]))

In [54]:
# Required data and params for modelling the classifier

input_shape = (len(train_X[0]),)
output_shape = len(train_y[0])
epochs = 300

In [55]:
# Building the model

# Initializing model

model = Sequential()

# Input layer 

model.add(Dense(128, input_shape=input_shape, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.2))

# Output layer
model.add(Dense(output_shape, activation = "softmax"))

# Compiling the model
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=["accuracy"])

# Model summary
print(model.summary())

# Fitting the model
model.fit(x=train_X, y=train_y, epochs=epochs, verbose=1)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               25344     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                650       
Total params: 34,250
Trainable params: 34,250
Non-trainable params: 0
_________________________________________________________________
None
Train on 168 samples
Epoch 1/300

<tensorflow.python.keras.callbacks.History at 0x7f1ea65d4d50>

In [56]:
# Impelementations for interactive seesion with Chatbot 

def lemma_s(text): 
  tokens = nltk.word_tokenize(text)
  tokens = [lemmatizer.lemmatize(word) for word in tokens]
  return tokens

def bow_s(text, vocab): 
  tokens = lemma_s(text)
  bow = [0] * len(vocab)
  for w in tokens: 
    for idx, word in enumerate(vocab):
      if word == w: 
        bow[idx] = 1
  return np.array(bow)

def pred_label_s(text, vocab, labels): 
  bow = bow_s(text, vocab)
  result = model.predict(np.array([bow]))[0]
  threshold = 0.2
  y_pred = [[idx, res] for idx, res in enumerate(result) if res > threshold]

  y_pred.sort(key=lambda x: x[1], reverse=True)
  return_list = []
  for r in y_pred:
    return_list.append(labels[r[0]])
  return return_list

def response(intents_list, intents_json): 
  tag = intents_list[0]
  list_of_intents = intents_json["intents"]
  for i in list_of_intents: 
    if i["tag"] == tag:
      result = random.choice(i["responses"])
      break
  return result

In [57]:
# Running the chatbot

print("GL_BOT : Chat with the GL_BOT [Type 'quit' to stop the session] ")
print("\nGL_BOT : If answer given is not correct [Type '*'] !\n\n")

while True:
  
  # Reading Input

  message = input('You: ')

  # Handling inadequate reply
  
  if message.lower() == '*':
    print('\nGL_BOT:Please rephrase your question and try again')

  # Stopping the session
  
  if message.lower() == 'quit':
    print('\nGL_BOT : ****See you again soon****')
    break

  # Predicting and printing the response of the bot  
  
  intents = pred_label_s(message, words, classes)
  result = response(intents, corpus_data)
  print("\nGL_BOT : ", result)

GL_BOT : Chat with the GL_BOT [Type 'quit' to stop the session] 

GL_BOT : If answer given is not correct [Type '*'] !


You: hi

GL_BOT :  Hello! how can i help you ?
You: my office is at bangalore

GL_BOT :  I understand you are at Bangalore office,India
You: workplace

GL_BOT :  Hello! how can i help you ?
You: work place

GL_BOT :  I understand you are at Bangalore office,India
You: email not functional

GL_BOT :  Please use this link [emailhelp.com] to get step by step solution
You: where to find ml tutorials

GL_BOT :  Link: Machine Learning wiki 
You: ciao

GL_BOT :  I hope I was able to assist you, Good Bye
You: quit

GL_BOT : ****See you again soon****
