In [261]:
import numpy as np
import nltk
import json
from langdetect import detect
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn import svm
import re
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

<span style="font-family: Arial; font-weight:bold;font-size:2.5em;color:#00b3e5;"> Part-1

<span style="font-family: Arial; font-weight:bold;font-size:2em;color:#00b3e5;"> 1. Read and Analyse Dataset

In [27]:
data_blog = pd.read_csv("./blogs/blogtext.csv")

<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> A. Clearly write outcome of data analysis

- understand the shape and size of the data.
- understand the data contained in the columns.
- print the number of unique data points each columns.
- print the number of data points in each class of the 'topic' column.
- perform stastical analysis of the numeric data.

In [28]:
data_blog.shape

(681284, 7)

In [29]:
data_blog.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [30]:
data_blog.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [31]:
data_blog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  int64 
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [32]:
data_blog.nunique()

id         19320
gender         2
age           26
topic         40
sign          12
date        2616
text      611652
dtype: int64

In [33]:
data_blog['topic'].value_counts()

indUnk                     251015
Student                    153903
Technology                  42055
Arts                        32449
Education                   29633
Communications-Media        20140
Internet                    16006
Non-Profit                  14700
Engineering                 11653
Law                          9040
Publishing                   7753
Science                      7269
Government                   6907
Consulting                   5862
Religion                     5235
Fashion                      4851
Marketing                    4769
Advertising                  4676
BusinessServices             4500
Banking                      4049
Chemicals                    3928
Telecommunications           3891
Accounting                   3832
Military                     3128
Museums-Libraries            3096
Sports-Recreation            3038
HumanResources               3010
RealEstate                   2870
Transportation               2326
Manufacturing 

<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> B. Clean the Structured Data

<span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"> i. Missing value analysis and imputation

In [34]:
data_blog.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [35]:
data_blog.isna().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [36]:
data_blog.drop_duplicates('id',keep='first', inplace=True, ignore_index=False)

In [37]:
data_blog.shape

(19320, 7)

In [38]:
data_blog.nunique()

id        19320
gender        2
age          26
topic        40
sign         12
date        818
text      18835
dtype: int64

In [39]:
data_blog['topic'].value_counts()

indUnk                     6827
Student                    5120
Education                   980
Technology                  943
Arts                        721
Communications-Media        479
Internet                    397
Non-Profit                  372
Engineering                 312
Government                  236
Law                         197
Consulting                  191
Science                     184
Marketing                   180
BusinessServices            163
Publishing                  150
Advertising                 145
Religion                    139
Telecommunications          119
Military                    116
Banking                     112
Accounting                  105
Fashion                      98
Tourism                      94
HumanResources               94
Transportation               91
Sports-Recreation            90
Manufacturing                87
Architecture                 69
Chemicals                    62
Biotech                      57
LawEnfor

<span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"> ii. Eliminate Non-English textual data

In [40]:
from langdetect import DetectorFactory
DetectorFactory.seed = 0

In [41]:
def detect_my(text):
   try:
       return detect(text)
   except:
       return 'unknown'

data_blog['language'] = data_blog['text'].apply(detect_my)

In [42]:
data_blog.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,language
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",en
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,en
74,3539003,female,14,indUnk,Aries,"07,June,2004",O= optimist P= pessimist My...,en
95,4172416,female,25,indUnk,Capricorn,"08,August,2004","urlLink im new to this, ...",en
97,3668238,female,17,Student,Gemini,"30,June,2004",http://www.uploadimages.net/i...,en


In [43]:
data_blog['language'].value_counts()

en         18611
unknown      119
nl            75
lt            66
tl            56
de            55
id            50
sv            36
fr            34
af            33
so            24
pt            19
no            18
da            15
et            14
es            12
ko            11
it            10
zh-cn          9
sw             8
cy             6
vi             5
sq             4
fi             4
tr             4
pl             4
ro             3
sl             3
fa             2
ta             1
ja             1
bg             1
lv             1
zh-tw          1
ca             1
el             1
uk             1
th             1
ru             1
Name: language, dtype: int64

In [44]:
len(data_blog[data_blog['language']=='en'])

18611

In [45]:
data_blog = data_blog[data_blog['language']=='en']

In [46]:
data_blog['language'].value_counts()

en    18611
Name: language, dtype: int64

In [47]:
data_blog.describe()

Unnamed: 0,id,age
count,18611.0,18611.0
mean,3436322.0,22.837623
std,862204.3,7.995859
min,5114.0,13.0
25%,3350364.0,16.0
50%,3667099.0,23.0
75%,3980688.0,26.0
max,4337650.0,48.0


In [48]:
data_blog.corr().style.background_gradient(cmap='BuGn')

Unnamed: 0,id,age
id,1.0,-0.090361
age,-0.090361,1.0


<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> 2. Preprocess unstructured data to make it consumable for model training

<span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"> A. Eliminate All special Characters and Numbers

In [53]:
data_blog['text'] = data_blog['text'].apply(lambda text : re.sub("[^a-z ]","",text))

In [54]:
data_blog.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,language
0,2059027,male,15,Student,Leo,"14,May,2004",info has been found pages and mb...,en
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks to yahoos toolbar i can no...,en
74,3539003,female,14,indUnk,Aries,"07,June,2004",o optimist p pessimist my a...,en
95,4172416,female,25,indUnk,Capricorn,"08,August,2004",urllink im new to this c...,en
97,3668238,female,17,Student,Gemini,"30,June,2004",httpwwwuploadimagesnetimagesp...,en


<span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"> B. Lowercase all textual data

In [51]:
data_blog['text'] = data_blog['text'].apply(lambda s: s.lower())

In [52]:
data_blog.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,language
0,2059027,male,15,Student,Leo,"14,May,2004",info has been found 100 pages and ...,en
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks to yahoos toolbar i can no...,en
74,3539003,female,14,indUnk,Aries,"07,June,2004",o optimist p pessimist my a...,en
95,4172416,female,25,indUnk,Capricorn,"08,August,2004",urllink im new to this c...,en
97,3668238,female,17,Student,Gemini,"30,June,2004",httpwwwuploadimagesnetimages6...,en


<span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"> C. Remove all Stopwords

In [55]:
from string import punctuation
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english') + list(punctuation)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kakar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [56]:
data_blog['text'] = data_blog['text'].apply(lambda text: ' '.join([words for words in text.split() if words not in stop_words]) )

In [57]:
data_blog.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,language
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...,en
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...,en
74,3539003,female,14,indUnk,Aries,"07,June,2004",optimist p pessimist argument p nooooo stop th...,en
95,4172416,female,25,indUnk,Capricorn,"08,August,2004",urllink im new tell god create links column,en
97,3668238,female,17,Student,Gemini,"30,June,2004",httpwwwuploadimagesnetimagespictjpg httpwwwupl...,en


<span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"> D. Remove all extra white spaces

In [201]:
data_blog['text'] = data_blog['text'].apply(lambda s: s.strip())

In [69]:
data_blog.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,language
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...,en
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...,en
74,3539003,female,14,indUnk,Aries,"07,June,2004",optimist p pessimist argument p nooooo stop th...,en
95,4172416,female,25,indUnk,Capricorn,"08,August,2004",urllink im new tell god create links column,en
97,3668238,female,17,Student,Gemini,"30,June,2004",httpwwwuploadimagesnetimagespictjpg httpwwwupl...,en


In [70]:
data_blog.drop(['id','date'], axis=1,inplace=True)

In [71]:
data_blog.columns

Index(['gender', 'age', 'topic', 'sign', 'text', 'language'], dtype='object')

In [72]:
data_blog.drop(['gender','age','sign','language'], axis=1,inplace=True)

In [74]:
data_blog.head()

Unnamed: 0,topic,text
0,Student,info found pages mb pdf files wait untill team...
4,InvestmentBanking,thanks yahoos toolbar capture urls popupswhich...
74,indUnk,optimist p pessimist argument p nooooo stop th...
95,indUnk,urllink im new tell god create links column
97,Student,httpwwwuploadimagesnetimagespictjpg httpwwwupl...


In [75]:
data_blog = data_blog[data_blog['topic']!='indUnk']

<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> 3. Build a base Classification model

<span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"> A. Create dependent and independent variables \
    B. Split data into train and test

In [78]:
X= data_blog['text']
y = data_blog['topic']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [79]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(8417,)
(8417,)
(3608,)
(3608,)


<span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"> C. Vectorize data using any one vectorizer

In [80]:
tfidf = TfidfVectorizer(max_features=500)
X_train_tfidf = tfidf.fit_transform(X_train)

In [81]:
X_train_tfidf.shape

(8417, 500)

In [82]:
len(tfidf.vocabulary_)

500

In [83]:
X_test_tfidf = tfidf.transform(X_test)

In [84]:
print(X_train_tfidf[:10])
print(X_test_tfidf[:10])

  (0, 110)	0.08936343907592319
  (0, 298)	0.09633153862566615
  (0, 442)	0.0862727697355254
  (0, 210)	0.0973781208832822
  (0, 415)	0.06583346928407154
  (0, 313)	0.06959886219315504
  (0, 334)	0.11476173402402783
  (0, 15)	0.20400331979961175
  (0, 39)	0.19260075272060723
  (0, 467)	0.11562824412325345
  (0, 161)	0.08865182489644101
  (0, 156)	0.06816300520690938
  (0, 324)	0.10218988056030545
  (0, 93)	0.0627400404255186
  (0, 460)	0.10528680304028025
  (0, 9)	0.12257067130341513
  (0, 160)	0.06852511887257619
  (0, 200)	0.056397030660011435
  (0, 375)	0.1130451594539479
  (0, 83)	0.09104748425095865
  (0, 310)	0.10813098725330506
  (0, 141)	0.09428167638423876
  (0, 19)	0.08210457593668011
  (0, 213)	0.1024553507025777
  (0, 240)	0.1023412955270012
  :	:
  (9, 109)	0.0367282451939749
  (9, 298)	0.09765536928381209
  (9, 442)	0.08745836833769105
  (9, 210)	0.049358167069112655
  (9, 415)	0.10010727295376544
  (9, 15)	0.051701705937578495
  (9, 39)	0.04881188938640035
  (9, 156)	0.06

In [85]:
y_train.values

array(['Student', 'Technology', 'Religion', ..., 'Student', 'Non-Profit',
       'Education'], dtype=object)

In [86]:
val_list,counts = np.unique(y_train,return_counts=True)
print(val_list)
print(counts)

['Accounting' 'Advertising' 'Agriculture' 'Architecture' 'Arts'
 'Automotive' 'Banking' 'Biotech' 'BusinessServices' 'Chemicals'
 'Communications-Media' 'Construction' 'Consulting' 'Education'
 'Engineering' 'Environment' 'Fashion' 'Government' 'HumanResources'
 'Internet' 'InvestmentBanking' 'Law' 'LawEnforcement-Security'
 'Manufacturing' 'Maritime' 'Marketing' 'Military' 'Museums-Libraries'
 'Non-Profit' 'Publishing' 'RealEstate' 'Religion' 'Science'
 'Sports-Recreation' 'Student' 'Technology' 'Telecommunications' 'Tourism'
 'Transportation']
[  80   99   21   42  488   32   81   40   96   39  323   38  136  664
  212   20   66  160   67  264   23  119   38   52   11  113   76   36
  252  108   38   95  126   57 3472  631   84   56   62]


In [87]:
val_list,counts = np.unique(y_test,return_counts=True)
print(val_list)
print(counts)

['Accounting' 'Advertising' 'Agriculture' 'Architecture' 'Arts'
 'Automotive' 'Banking' 'Biotech' 'BusinessServices' 'Chemicals'
 'Communications-Media' 'Construction' 'Consulting' 'Education'
 'Engineering' 'Environment' 'Fashion' 'Government' 'HumanResources'
 'Internet' 'InvestmentBanking' 'Law' 'LawEnforcement-Security'
 'Manufacturing' 'Maritime' 'Marketing' 'Military' 'Museums-Libraries'
 'Non-Profit' 'Publishing' 'RealEstate' 'Religion' 'Science'
 'Sports-Recreation' 'Student' 'Technology' 'Telecommunications' 'Tourism'
 'Transportation']
[  21   39   14   18  207   21   30   15   57   21  139   14   47  277
   79    8   23   68   25  116    8   69   15   32    6   57   35   17
  110   40   17   39   49   32 1474  275   29   37   28]


In [88]:
labels_unique_list = []

for label in y.values:
    if label not in labels_unique_list:
        labels_unique_list.append(label)

In [89]:
labels_unique_list

['Student',
 'InvestmentBanking',
 'Non-Profit',
 'Banking',
 'Education',
 'Engineering',
 'Science',
 'Communications-Media',
 'BusinessServices',
 'Sports-Recreation',
 'Arts',
 'Internet',
 'Museums-Libraries',
 'Accounting',
 'Technology',
 'Law',
 'Automotive',
 'Consulting',
 'Religion',
 'Fashion',
 'Publishing',
 'Marketing',
 'LawEnforcement-Security',
 'HumanResources',
 'Telecommunications',
 'Military',
 'Transportation',
 'Architecture',
 'Government',
 'Advertising',
 'Agriculture',
 'Biotech',
 'RealEstate',
 'Manufacturing',
 'Construction',
 'Chemicals',
 'Maritime',
 'Tourism',
 'Environment']

In [90]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
print(y_train.shape)
y_test = le.transform(y_test)
print(y_test.shape)

(8417,)
(3608,)


In [91]:
val_list,counts = np.unique(y_train,return_counts=True)
print(val_list)
print(counts)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38]
[  80   99   21   42  488   32   81   40   96   39  323   38  136  664
  212   20   66  160   67  264   23  119   38   52   11  113   76   36
  252  108   38   95  126   57 3472  631   84   56   62]


In [92]:
val_list,counts = np.unique(y_test,return_counts=True)
print(val_list)
print(counts)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38]
[  21   39   14   18  207   21   30   15   57   21  139   14   47  277
   79    8   23   68   25  116    8   69   15   32    6   57   35   17
  110   40   17   39   49   32 1474  275   29   37   28]


<span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"> B. Build a base model for Supervised Learning - Classification

In [93]:
model=LogisticRegression(solver='liblinear')
model=OneVsRestClassifier(model)
model.fit(X_train_tfidf,y_train)

OneVsRestClassifier(estimator=LogisticRegression(solver='liblinear'))

In [94]:
y_predict = model.predict(X_test_tfidf)

In [95]:
y_predict

array([34, 34, 34, ..., 34, 34, 34])

<span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"> E. Clearly print Performance Metrics

In [96]:
print('Accuracy obtained: ', accuracy_score(y_test, y_predict))

Accuracy obtained:  0.41019955654101997


In [97]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        21
           1       0.00      0.00      0.00        39
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        18
           4       0.05      0.00      0.01       207
           5       0.00      0.00      0.00        21
           6       0.00      0.00      0.00        30
           7       0.00      0.00      0.00        15
           8       0.00      0.00      0.00        57
           9       0.00      0.00      0.00        21
          10       0.50      0.01      0.01       139
          11       0.00      0.00      0.00        14
          12       0.00      0.00      0.00        47
          13       0.15      0.03      0.05       277
          14       0.00      0.00      0.00        79
          15       0.00      0.00      0.00         8
          16       0.00      0.00      0.00        23
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [98]:
y_predict_probs = model.predict_proba(X_test_tfidf)

#print('ROC-AUC score (micro averaging): ', roc_auc_score(y_test, y_predict_probs, average="micro", multi_class="ovr"))
print('ROC-AUC score (weighted averaging): ', roc_auc_score(y_test, y_predict_probs, average="weighted", multi_class="ovr"))
print('ROC-AUC score (macro averaging): ', roc_auc_score(y_test, y_predict_probs, average="macro", multi_class="ovr"))

ROC-AUC score (weighted averaging):  0.6398078331094302
ROC-AUC score (macro averaging):  0.5761380175651392


<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> 4. Improve Performance of model

<span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"> A. Experiment with other vectorisers \
B. Build classifier Models using other algorithms than base model \
C. Tune Parameters/Hyperparameters of the model/s\
D. Clearly print Performance Metrics

In [101]:
cntvec = CountVectorizer(max_features=10)
X_train_cntvec = cntvec.fit_transform(X_train)

In [102]:
X_train_cntvec.shape

(8417, 10)

In [103]:
len(cntvec.vocabulary_)

10

In [104]:
X_test_cntvec = cntvec.transform(X_test)

In [105]:
print(X_train_cntvec[:10])
print(X_test_cntvec[:10])

  (0, 6)	1
  (0, 7)	2
  (0, 3)	1
  (0, 0)	1
  (1, 6)	3
  (1, 7)	2
  (1, 3)	6
  (1, 0)	3
  (1, 2)	3
  (1, 1)	5
  (1, 5)	3
  (1, 8)	2
  (1, 9)	1
  (1, 4)	3
  (2, 3)	3
  (4, 6)	5
  (4, 2)	1
  (4, 1)	3
  (4, 5)	2
  (4, 8)	3
  (4, 9)	1
  (5, 3)	1
  (5, 2)	1
  (5, 5)	3
  (5, 9)	1
  (6, 6)	1
  (6, 7)	2
  (6, 3)	2
  (6, 0)	1
  (6, 2)	1
  (6, 1)	1
  (6, 9)	1
  (6, 4)	1
  (7, 2)	2
  (7, 9)	1
  (8, 3)	1
  (8, 2)	1
  (9, 6)	3
  (9, 7)	2
  (9, 3)	8
  (9, 0)	4
  (9, 2)	2
  (9, 5)	1
  (9, 8)	4
  (9, 9)	4
  (9, 4)	8
  (0, 1)	1
  (0, 3)	2
  (0, 4)	1
  (0, 6)	1
  (2, 1)	1
  (2, 2)	1
  (2, 4)	1
  (2, 5)	1
  (2, 6)	1
  (3, 1)	1
  (3, 3)	2
  (3, 4)	1
  (3, 6)	1
  (3, 7)	1
  (4, 1)	1
  (4, 8)	1
  (5, 0)	1
  (5, 1)	1
  (5, 6)	1
  (7, 2)	2
  (7, 3)	4
  (7, 5)	1
  (7, 6)	4
  (7, 8)	4
  (7, 9)	2
  (9, 3)	3


In [106]:
scaler = MaxAbsScaler()
X_train_cntvec = scaler.fit_transform(X_train_cntvec)
X_test_cntvec = scaler.fit_transform(X_test_cntvec)

In [107]:
print(X_train_cntvec[:10])
print(X_test_cntvec[:10])

  (0, 0)	0.009523809523809525
  (0, 3)	0.016666666666666666
  (0, 6)	0.006711409395973154
  (0, 7)	0.01652892561983471
  (1, 0)	0.028571428571428574
  (1, 1)	0.03067484662576687
  (1, 2)	0.03529411764705882
  (1, 3)	0.1
  (1, 4)	0.03296703296703297
  (1, 5)	0.01507537688442211
  (1, 6)	0.020134228187919462
  (1, 7)	0.01652892561983471
  (1, 8)	0.024390243902439025
  (1, 9)	0.007462686567164179
  (2, 3)	0.05
  (4, 1)	0.018404907975460124
  (4, 2)	0.011764705882352941
  (4, 5)	0.010050251256281407
  (4, 6)	0.03355704697986577
  (4, 8)	0.03658536585365854
  (4, 9)	0.007462686567164179
  (5, 2)	0.011764705882352941
  (5, 3)	0.016666666666666666
  (5, 5)	0.01507537688442211
  (5, 9)	0.007462686567164179
  (6, 0)	0.009523809523809525
  (6, 1)	0.006134969325153374
  (6, 2)	0.011764705882352941
  (6, 3)	0.03333333333333333
  (6, 4)	0.01098901098901099
  (6, 6)	0.006711409395973154
  (6, 7)	0.01652892561983471
  (6, 9)	0.007462686567164179
  (7, 2)	0.023529411764705882
  (7, 9)	0.00746268656716

In [108]:
model=LogisticRegression(solver='liblinear')
model=OneVsRestClassifier(model)
model.fit(X_train_cntvec,y_train)

OneVsRestClassifier(estimator=LogisticRegression(solver='liblinear'))

In [109]:
y_predict = model.predict(X_test_cntvec)

In [110]:
print('Accuracy obtained: ', accuracy_score(y_test, y_predict))

Accuracy obtained:  0.40853658536585363


In [111]:
print(classification_report(y_test, y_predict, target_names=labels_unique_list))

                         precision    recall  f1-score   support

                Student       0.00      0.00      0.00        21
      InvestmentBanking       0.00      0.00      0.00        39
             Non-Profit       0.00      0.00      0.00        14
                Banking       0.00      0.00      0.00        18
              Education       0.00      0.00      0.00       207
            Engineering       0.00      0.00      0.00        21
                Science       0.00      0.00      0.00        30
   Communications-Media       0.00      0.00      0.00        15
       BusinessServices       0.00      0.00      0.00        57
      Sports-Recreation       0.00      0.00      0.00        21
                   Arts       0.00      0.00      0.00       139
               Internet       0.00      0.00      0.00        14
      Museums-Libraries       0.00      0.00      0.00        47
             Accounting       0.00      0.00      0.00       277
             Technology 

  _warn_prf(average, modifier, msg_start, len(result))


In [112]:
y_predict_probs = model.predict_proba(X_test_cntvec)

#print('ROC-AUC score (micro averaging): ', roc_auc_score(y_test, y_predict_probs, average="micro", multi_class="ovr"))
print('ROC-AUC score (weighted averaging): ', roc_auc_score(y_test, y_predict_probs, average="weighted", multi_class="ovr"))
print('ROC-AUC score (macro averaging): ', roc_auc_score(y_test, y_predict_probs, average="macro", multi_class="ovr"))

ROC-AUC score (weighted averaging):  0.5608855684516297
ROC-AUC score (macro averaging):  0.5369031542728652


In [113]:
model = RandomForestClassifier()  
#model=OneVsRestClassifier(model)
model.fit(X_train_tfidf,y_train)

RandomForestClassifier()

In [114]:
y_predict = model.predict(X_test_tfidf)

In [115]:
y_predict

array([34, 34, 34, ..., 34, 34, 34])

In [116]:
print('Accuracy obtained: ', accuracy_score(y_test, y_predict))

Accuracy obtained:  0.4077050997782705


In [117]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        21
           1       0.00      0.00      0.00        39
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        18
           4       0.11      0.00      0.01       207
           5       0.00      0.00      0.00        21
           6       0.00      0.00      0.00        30
           7       0.00      0.00      0.00        15
           8       0.00      0.00      0.00        57
           9       0.00      0.00      0.00        21
          10       0.00      0.00      0.00       139
          11       0.00      0.00      0.00        14
          12       0.00      0.00      0.00        47
          13       0.16      0.01      0.02       277
          14       0.17      0.01      0.02        79
          15       0.00      0.00      0.00         8
          16       0.00      0.00      0.00        23
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [118]:
y_predict_probs = model.predict_proba(X_test_tfidf)

#print('ROC-AUC score (micro averaging): ', roc_auc_score(y_test, y_predict_probs, average="micro", multi_class="ovr"))
print('ROC-AUC score (weighted averaging): ', roc_auc_score(y_test, y_predict_probs, average="weighted", multi_class="ovr"))
print('ROC-AUC score (macro averaging): ', roc_auc_score(y_test, y_predict_probs, average="macro", multi_class="ovr"))

ROC-AUC score (weighted averaging):  0.6099929864637126
ROC-AUC score (macro averaging):  0.5303237269706429


In [128]:
model = RandomForestClassifier()  
model=OneVsRestClassifier(model)
model.fit(X_train_cntvec,y_train)

OneVsRestClassifier(estimator=RandomForestClassifier())

In [120]:
y_predict = model.predict(X_test_cntvec)

In [121]:
print('Accuracy obtained: ', accuracy_score(y_test, y_predict))

Accuracy obtained:  0.3514412416851441


In [122]:
print(classification_report(y_test, y_predict, target_names=labels_unique_list))

                         precision    recall  f1-score   support

                Student       0.00      0.00      0.00        21
      InvestmentBanking       0.00      0.00      0.00        39
             Non-Profit       0.00      0.00      0.00        14
                Banking       0.00      0.00      0.00        18
              Education       0.05      0.01      0.02       207
            Engineering       0.00      0.00      0.00        21
                Science       0.00      0.00      0.00        30
   Communications-Media       0.00      0.00      0.00        15
       BusinessServices       0.00      0.00      0.00        57
      Sports-Recreation       0.00      0.00      0.00        21
                   Arts       0.07      0.02      0.03       139
               Internet       0.00      0.00      0.00        14
      Museums-Libraries       0.00      0.00      0.00        47
             Accounting       0.07      0.03      0.04       277
             Technology 

  _warn_prf(average, modifier, msg_start, len(result))


In [123]:
y_predict_probs = model.predict_proba(X_test_cntvec)

#print('ROC-AUC score (micro averaging): ', roc_auc_score(y_test, y_predict_probs, average="micro", multi_class="ovr"))
print('ROC-AUC score (weighted averaging): ', roc_auc_score(y_test, y_predict_probs, average="weighted", multi_class="ovr"))
print('ROC-AUC score (macro averaging): ', roc_auc_score(y_test, y_predict_probs, average="macro", multi_class="ovr"))

ROC-AUC score (weighted averaging):  0.5240385321654577
ROC-AUC score (macro averaging):  0.5164691237465877


In [127]:
hyperparameters = {'penalty':['l1','l2'], 'C':np.logspace(-3,3,7),'solver':['newton-cg', 'liblinear','lbfgs']}
grid_search = GridSearchCV(LogisticRegression(),hyperparameters,n_jobs=4,scoring="accuracy",cv=3)
model = grid_search.fit(X_train_tfidf,y_train)
best_model_lr = model.best_estimator_
best_model_lr.fit(X_train_tfidf, y_train)
print(best_model_lr)

LogisticRegression(C=0.001, penalty='l1', solver='liblinear')


In [133]:
hyperparameters = {'n_estimators': [200, 500],'max_depth' : [5,10],'criterion' :['gini', 'entropy']}
grid_search = GridSearchCV(RandomForestClassifier(),hyperparameters,n_jobs=4,scoring="accuracy",cv=3)
model = grid_search.fit(X_train_tfidf,y_train)
best_model_rf = model.best_estimator_
best_model_rf.fit(X_train_tfidf, y_train)
print(best_model_rf)

RandomForestClassifier(max_depth=5, n_estimators=200)


In [132]:
y_predict = best_model_lr.predict(X_test_tfidf)

print('Accuracy obtained: ', accuracy_score(y_test, y_predict))

print(classification_report(y_test, y_predict, target_names=labels_unique_list))

y_predict_probs = best_model_lr.predict_proba(X_test_tfidf)

#print('ROC-AUC score (micro averaging): ', roc_auc_score(y_test, y_predict_probs, average="micro", multi_class="ovr"))
print('ROC-AUC score (weighted averaging): ', roc_auc_score(y_test, y_predict_probs, average="weighted", multi_class="ovr"))
print('ROC-AUC score (macro averaging): ', roc_auc_score(y_test, y_predict_probs, average="macro", multi_class="ovr"))

Accuracy obtained:  0.40853658536585363
                         precision    recall  f1-score   support

                Student       0.00      0.00      0.00        21
      InvestmentBanking       0.00      0.00      0.00        39
             Non-Profit       0.00      0.00      0.00        14
                Banking       0.00      0.00      0.00        18
              Education       0.00      0.00      0.00       207
            Engineering       0.00      0.00      0.00        21
                Science       0.00      0.00      0.00        30
   Communications-Media       0.00      0.00      0.00        15
       BusinessServices       0.00      0.00      0.00        57
      Sports-Recreation       0.00      0.00      0.00        21
                   Arts       0.00      0.00      0.00       139
               Internet       0.00      0.00      0.00        14
      Museums-Libraries       0.00      0.00      0.00        47
             Accounting       0.00      0.00     

  _warn_prf(average, modifier, msg_start, len(result))


In [134]:
y_predict = best_model_rf.predict(X_test_tfidf)

print('Accuracy obtained: ', accuracy_score(y_test, y_predict))

print(classification_report(y_test, y_predict, target_names=labels_unique_list))

y_predict_probs = best_model_rf.predict_proba(X_test_tfidf)

#print('ROC-AUC score (micro averaging): ', roc_auc_score(y_test, y_predict_probs, average="micro", multi_class="ovr"))
print('ROC-AUC score (weighted averaging): ', roc_auc_score(y_test, y_predict_probs, average="weighted", multi_class="ovr"))
print('ROC-AUC score (macro averaging): ', roc_auc_score(y_test, y_predict_probs, average="macro", multi_class="ovr"))

Accuracy obtained:  0.40853658536585363
                         precision    recall  f1-score   support

                Student       0.00      0.00      0.00        21
      InvestmentBanking       0.00      0.00      0.00        39
             Non-Profit       0.00      0.00      0.00        14
                Banking       0.00      0.00      0.00        18
              Education       0.00      0.00      0.00       207
            Engineering       0.00      0.00      0.00        21
                Science       0.00      0.00      0.00        30
   Communications-Media       0.00      0.00      0.00        15
       BusinessServices       0.00      0.00      0.00        57
      Sports-Recreation       0.00      0.00      0.00        21
                   Arts       0.00      0.00      0.00       139
               Internet       0.00      0.00      0.00        14
      Museums-Libraries       0.00      0.00      0.00        47
             Accounting       0.00      0.00     

  _warn_prf(average, modifier, msg_start, len(result))


ROC-AUC score (weighted averaging):  0.632202354934222
ROC-AUC score (macro averaging):  0.5955280787677655


<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> 5. Share insights on relative performance comparison 

<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> A. Which vectorizer performed better? Probable reason?\
- Tfidf vectorizer performed better than the count vectorizer with both the logistic regression and random forest models. The reason behind this is that the tfidf vectorizer normalizes the term count and also penalizes the terms that occur often.

<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> B. Which model outperformed? Probable reason? \
- Here, the random forest classifier with Tfidf vectorizer has better ROC-AUC score over the others. One probable reason could be this model captures the variance in the data better than the other models.

<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> C. Which parameter/hyperparameter significantly helped
to improve performance?Probable reason? \
- Here, both 'max_depth' and 'n_estimators' of the random forest classifier model significantly improved the ROC-AUC score. 

<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> D. According to you, which performance metric should be given most importance, why?\
- ROC-AUC score has to be given more importance as this represents the ability of the model to classify the given classes correctly.

<span style="font-family: Arial; font-weight:bold;font-size:2.5em;color:#00b3e5;"> Part-2

In [353]:
with open('GL Bot.json') as file:
    corpus = json.load(file)
    
print(corpus)

{'intents': [{'tag': 'Intro', 'patterns': ['hi', 'how are you', 'is anyone there', 'hello', 'whats up', 'hey', 'yo', 'listen', 'please help me', 'i am learner from', 'i belong to', 'aiml batch', 'aifl batch', 'i am from', 'my pm is', 'blended', 'online', 'i am from', 'hey ya', 'talking to you for first time'], 'responses': ['Hello! how can i help you ?'], 'context_set': ''}, {'tag': 'Exit', 'patterns': ['thank you', 'thanks', 'cya', 'see you', 'later', 'see you later', 'goodbye', 'i am leaving', 'have a Good day', 'you helped me', 'thanks a lot', 'thanks a ton', 'you are the best', 'great help', 'too good', 'you are a good learning buddy'], 'responses': ['I hope I was able to assist you, Good Bye'], 'context_set': ''}, {'tag': 'Olympus', 'patterns': ['olympus', 'explain me how olympus works', 'I am not able to understand olympus', 'olympus window not working', 'no access to olympus', 'unable to see link in olympus', 'no link visible on olympus', 'whom to contact for olympus', 'lot of p

In [354]:
type(corpus)

dict

In [355]:
corpus['intents']

[{'tag': 'Intro',
  'patterns': ['hi',
   'how are you',
   'is anyone there',
   'hello',
   'whats up',
   'hey',
   'yo',
   'listen',
   'please help me',
   'i am learner from',
   'i belong to',
   'aiml batch',
   'aifl batch',
   'i am from',
   'my pm is',
   'blended',
   'online',
   'i am from',
   'hey ya',
   'talking to you for first time'],
  'responses': ['Hello! how can i help you ?'],
  'context_set': ''},
 {'tag': 'Exit',
  'patterns': ['thank you',
   'thanks',
   'cya',
   'see you',
   'later',
   'see you later',
   'goodbye',
   'i am leaving',
   'have a Good day',
   'you helped me',
   'thanks a lot',
   'thanks a ton',
   'you are the best',
   'great help',
   'too good',
   'you are a good learning buddy'],
  'responses': ['I hope I was able to assist you, Good Bye'],
  'context_set': ''},
 {'tag': 'Olympus',
  'patterns': ['olympus',
   'explain me how olympus works',
   'I am not able to understand olympus',
   'olympus window not working',
   'no acces

In [356]:
corpus['intents'][0]

{'tag': 'Intro',
 'patterns': ['hi',
  'how are you',
  'is anyone there',
  'hello',
  'whats up',
  'hey',
  'yo',
  'listen',
  'please help me',
  'i am learner from',
  'i belong to',
  'aiml batch',
  'aifl batch',
  'i am from',
  'my pm is',
  'blended',
  'online',
  'i am from',
  'hey ya',
  'talking to you for first time'],
 'responses': ['Hello! how can i help you ?'],
 'context_set': ''}

In [357]:
type(corpus['intents'][0])

dict

In [358]:
corpus['intents'][0]['tag']

'Intro'

In [359]:
len(corpus['intents'])

8

In [360]:
tags_list = []

for i in range(len(corpus['intents'])):
    tags_list.append(corpus['intents'][i]['tag'])

In [361]:
print('\nAll the patterns in the corpus:\n',tags_list)


All the patterns in the corpus:
 ['Intro', 'Exit', 'Olympus', 'SL', 'NN', 'Bot', 'Profane', 'Ticket']


In [362]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kakar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [363]:
doc_patterns = []
X = []
y = []

for i in range(len(corpus['intents'])):
    for j in range(len(corpus['intents'][i]['patterns'])):
        pattern_list_temp = corpus['intents'][i]['patterns'][j]
        pattern_list_temp = re.sub("[^a-z ]","",pattern_list_temp)
        pattern_list_temp = pattern_list_temp.lower()
        pattern_list_temp = ' '.join([words for words in pattern_list_temp.split() if words not in stop_words])
        pattern_list_temp = pattern_list_temp.strip()
        doc_patterns.append(pattern_list_temp)
        y.append(corpus['intents'][i]['tag'])

doc_patterns = np.array(doc_patterns)
tfid = TfidfVectorizer(use_idf=True)
X = tfid.fit_transform(doc_patterns)
y = np.array(y)

In [364]:
tfid.get_feature_names()

['able',
 'access',
 'activation',
 'ada',
 'adam',
 'aifl',
 'aiml',
 'ann',
 'anyone',
 'artificial',
 'backward',
 'bad',
 'bagging',
 'batch',
 'bayes',
 'belong',
 'best',
 'blended',
 'bloody',
 'boosting',
 'bot',
 'buddy',
 'classification',
 'contact',
 'create',
 'cross',
 'cya',
 'day',
 'deep',
 'diffult',
 'ensemble',
 'epoch',
 'epochs',
 'explain',
 'first',
 'forest',
 'forward',
 'function',
 'good',
 'goodbye',
 'gradient',
 'great',
 'hate',
 'hell',
 'hello',
 'help',
 'helped',
 'hey',
 'hi',
 'hidden',
 'hours',
 'hyper',
 'imputer',
 'intelligence',
 'jerk',
 'joke',
 'knn',
 'later',
 'layers',
 'learner',
 'learning',
 'leaving',
 'link',
 'listen',
 'logistic',
 'lot',
 'machine',
 'naive',
 'name',
 'nb',
 'nets',
 'networks',
 'neural',
 'olympus',
 'olypus',
 'online',
 'ood',
 'operation',
 'opertions',
 'otimizer',
 'parameters',
 'piece',
 'please',
 'pm',
 'problem',
 'problems',
 'propagation',
 'random',
 'regression',
 'relu',
 'screw',
 'see',
 'sgd

In [365]:
print(X.shape)
print(y.shape)
print(len(np.unique(y)))

(128, 127)
(128,)
8


In [366]:
X

<128x127 sparse matrix of type '<class 'numpy.float64'>'
	with 240 stored elements in Compressed Sparse Row format>

In [367]:
X = X.toarray()

In [368]:
y

array(['Intro', 'Intro', 'Intro', 'Intro', 'Intro', 'Intro', 'Intro',
       'Intro', 'Intro', 'Intro', 'Intro', 'Intro', 'Intro', 'Intro',
       'Intro', 'Intro', 'Intro', 'Intro', 'Intro', 'Intro', 'Exit',
       'Exit', 'Exit', 'Exit', 'Exit', 'Exit', 'Exit', 'Exit', 'Exit',
       'Exit', 'Exit', 'Exit', 'Exit', 'Exit', 'Exit', 'Exit', 'Olympus',
       'Olympus', 'Olympus', 'Olympus', 'Olympus', 'Olympus', 'Olympus',
       'Olympus', 'Olympus', 'Olympus', 'Olympus', 'Olympus', 'Olympus',
       'SL', 'SL', 'SL', 'SL', 'SL', 'SL', 'SL', 'SL', 'SL', 'SL', 'SL',
       'SL', 'SL', 'SL', 'SL', 'SL', 'SL', 'SL', 'SL', 'SL', 'SL', 'SL',
       'SL', 'SL', 'SL', 'SL', 'SL', 'SL', 'SL', 'NN', 'NN', 'NN', 'NN',
       'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN',
       'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'Bot', 'Bot',
       'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Bot', 'Profane', 'Profane',
       'Profane', 'Profane', 'Profane', 'Profane', 'Profane', 

In [369]:
y_encoded = pd.get_dummies(y, prefix='Class')
print(y_encoded.shape)

(128, 8)


In [370]:
y_encoded.columns

Index(['Class_Bot', 'Class_Exit', 'Class_Intro', 'Class_NN', 'Class_Olympus',
       'Class_Profane', 'Class_SL', 'Class_Ticket'],
      dtype='object')

In [371]:
y_encoded.loc[0]

Class_Bot        0
Class_Exit       0
Class_Intro      1
Class_NN         0
Class_Olympus    0
Class_Profane    0
Class_SL         0
Class_Ticket     0
Name: 0, dtype: uint8

In [372]:
model = tf.keras.models.Sequential() 

model.add(tf.keras.layers.Dense(128,input_dim=X.shape[1], activation="relu"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(8, activation="softmax"))

In [373]:
from tensorflow.keras.optimizers import Adam
optimizer = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

In [375]:
model.compile(loss="categorical_crossentropy", metrics=["accuracy"], optimizer=optimizer)
model.fit(X,y_encoded,epochs=500, batch_size=64, verbose=1)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x1fc8e5aa1c8>

<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> 1. Start chat session with greetings and ask what the user is looking for \
2. Accept dynamic text based questions from the user. Reply back with relevant answer from the designed corpus \
3. End the chat session only if the user requests to end else ask what the user is looking for. Loop continues till the user asks to end it \

In [376]:
def chat():
    print('Chat with Priya (type: \'quit\' to stop)')
    while True:
        inp = input('\n\nYou:')
        if inp.lower()=='quit':
            break
        
        inp_temp = re.sub("[^a-z ]","",inp)
        inp_temp = inp.lower()
        inp_temp = ' '.join([words for words in inp.split() if words not in stop_words])
        inp_temp = inp_temp.strip()
        inp_vec = tfid.transform([inp_temp])
        inp_vec = inp_vec.toarray()
        
        predicted_tag_index = np.argmax(model.predict(inp_vec))
        predicted_tag = tags_list[predicted_tag_index]
        print(predicted_tag)
                                        
        for i in range(len(corpus['intents'])):
            tag = corpus['intents'][i]['tag']
            if tag == predicted_tag:
                response = corpus['intents'][i]['responses']
        print(response)

In [377]:
chat()

Chat with Priya (type: 'quit' to stop)




You: hi


Exit
['I hope I was able to assist you, Good Bye']




You: hello


SL
['Link: Machine Learning wiki ']




You: bye


SL
['Link: Machine Learning wiki ']




You: great learning


SL
['Link: Machine Learning wiki ']




You: neural networks


SL
['Link: Machine Learning wiki ']




You: quit


<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> Repeating the above steps with the SVM classifier

In [306]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

svm_clf = svm.SVC()
svm_clf.fit(X , y_encoded)

SVC()

In [344]:
def chat():
    print('Chat with Priya (type: \'quit\' to stop)')
    while True:
        inp = input('\n\nYou:')
        if inp.lower()=='quit':
            break
        
        inp_temp = re.sub("[^a-z ]","",inp)
        inp_temp = inp.lower()
        inp_temp = ' '.join([words for words in inp.split() if words not in stop_words])
        inp_temp = inp_temp.strip()
        inp_vec = tfid.transform([inp_temp])
        inp_vec = inp_vec.toarray()
        
        predicted_tag_index = np.argmax(svm_clf.predict(inp_vec))
        predicted_tag = tags_list[predicted_tag_index]
        print(predicted_tag)
                                        
        for i in range(len(corpus['intents'])):
            tag = corpus['intents'][i]['tag']
            if tag == predicted_tag:
                response = corpus['intents'][i]['responses']
        print(response)

In [345]:
chat()

Chat with Priya (type: 'quit' to stop)




You: hi


Intro
['Hello! how can i help you ?']




You: Hello


Intro
['Hello! how can i help you ?']




You: Great Learning


Intro
['Hello! how can i help you ?']




You: Supervised Learning


Intro
['Hello! how can i help you ?']




You: neural nets


Intro
['Hello! how can i help you ?']




You: quit


<span style="font-family: Arial; font-weight:bold;font-size:1.5em;color:#00b3e5;"> Conclusions: \
- The given data seems to be not enough for training a chatbot.
- Both the neural networks and SVM model give only one response to the user.