**PART ONE - MULTI LABEL TEXT CLASSIFIER**



**1. Import and analyse the data set.**

In [1]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Setting the current working directory
import os; os.chdir('/content/drive/MyDrive/AIML/Projects/Statistical NLP')

In [17]:
import pandas as pd
import numpy as np
import scipy as sp
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import metrics
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
%matplotlib inline
import nltk
from nltk.corpus import stopwords 
from collections import Counter

import warnings
warnings.filterwarnings('ignore')


In [4]:
# Loading  the dataset
blog_df=pd.read_csv('blogtext.csv')

In [5]:
blog_df.shape

(681284, 7)

In [6]:
blog_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [7]:
blog_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  int64 
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [8]:
blog_df['age']=blog_df['age'].astype('str')

In [9]:
blog_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  object
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(1), object(6)
memory usage: 36.4+ MB


In [10]:
blog_df.isna().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [11]:
blog_df.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

**2. Perform data pre-processing on the data:**

**• Data cleansing by removing unwanted characters, spaces, stop words etc. Convert text to lowercase.**

In [12]:
# function for Text Cleansing

def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower() 
    
    return text

In [13]:
blog_df['clean_text']=blog_df['text'].apply(lambda x: clean_text(x))

In [14]:
blog_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,clean_text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",info has been found pages and mb of pdf files ...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,these are the team members drewes van der laag...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,in het kader van kernfusie op aarde maak je ei...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,thanks to yahoo s toolbar i can now capture th...


After Data cleansing procedure, File is now free from special characters, removed spaces and also all text been converted to Lower case 

In [16]:
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [18]:
# function to remove stopwords

def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

In [19]:
blog_df['clean_text'] = blog_df['clean_text'].apply(lambda x: remove_stopwords(x))

In [20]:
blog_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,clean_text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo toolbar capture urls popups means...


**Target/label merger and transformation**

In [38]:
blog_df=blog_df.iloc[0:681284]

In [39]:
blog_df['labels']=' '

In [40]:
# Merging label columns gender, age, topic and sign for multi label classification problem 
# Merging helps to put up all labels together for a specific sentence
for w in range(0,blog_df.shape[0]):
    blog_label=[]
    blog_label.append(blog_df['gender'][w])
    blog_label.append(blog_df['age'][w])
    blog_label.append(blog_df['topic'][w])
    blog_label.append(blog_df['sign'][w])
    blog_df['labels'][w]=blog_label
    

In [41]:
blog_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,clean_text,labels
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing,"[male, 15, Student, Leo]"
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [42]:
blog_df['id'].nunique()

808

**• Train and test split**

In [43]:
blog_df.sample(20)

Unnamed: 0,id,gender,age,topic,sign,date,text,clean_text,labels
9603,2844168,female,27,Marketing,Aquarius,"11,April,2004",Happy Easter! Easter bless...,happy easter easter blessings hope wonderful h...,"[female, 27, Marketing, Aquarius]"
23665,299143,female,23,Engineering,Cancer,"18,November,2003",Maybe I don't really belong h...,maybe dont really belong maybe english major m...,"[female, 23, Engineering, Cancer]"
14747,727002,male,23,Internet,Leo,"22,March,2003",i like to escape reality... my im...,like escape reality imagination always hero th...,"[male, 23, Internet, Leo]"
19083,3747578,female,14,indUnk,Leo,"10,August,2004",whheeeee.wahahhas.have been pms-ing lat...,whheeeee wahahhas pms ing lately opps hahas an...,"[female, 14, indUnk, Leo]"
8784,4192173,male,27,Technology,Pisces,"10,August,2004","In response to my previous post, ...",response previous post person called h bar boa...,"[male, 27, Technology, Pisces]"
27436,3612927,female,23,Science,Libra,"11,July,2004",i worked on saturday 7:30am-4:30pm on t...,worked saturday pm cat side clinic every cat d...,"[female, 23, Science, Libra]"
24659,546850,male,24,indUnk,Aries,"11,December,2002","Ok, so I don't have an important call t...",ok dont important call make spent hours sittin...,"[male, 24, indUnk, Aries]"
26635,2864451,male,26,Communications-Media,Taurus,"21,May,2004",Read John Stewart's Commencement Speech...,read john stewarts commencement speech linked ...,"[male, 26, Communications-Media, Taurus]"
20861,1234631,female,17,indUnk,Aquarius,"24,June,2003",Sometimes I wish that some th...,sometimes wish things arent things arent today...,"[female, 17, indUnk, Aquarius]"
3661,589736,male,35,Technology,Aries,"05,August,2004",...to convince me that he's a cluel...,convince hes clueless dunce urllink response a...,"[male, 35, Technology, Aries]"


In [44]:
blog_df=blog_df.drop(['id','gender','age','topic','sign','date','text'],axis=1)

In [45]:
blog_df.head()

Unnamed: 0,clean_text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [46]:
blog_df['labels'].values

array([list(['male', '15', 'Student', 'Leo']),
       list(['male', '15', 'Student', 'Leo']),
       list(['male', '15', 'Student', 'Leo']), ...,
       list(['male', '16', 'Biotech', 'Gemini']),
       list(['male', '16', 'Biotech', 'Gemini']),
       list(['male', '16', 'Biotech', 'Gemini'])], dtype=object)

In [50]:
X = blog_df.clean_text
y = blog_df.labels

In [51]:
X_train, X_test, y_train, y_test = train_test_split(blog_df['clean_text'], y, test_size=0.3, random_state=7)

In [52]:
multilabel_binarizer = MultiLabelBinarizer()
y_train_mlb=multilabel_binarizer.fit_transform(y_train)
y_test_mlb=multilabel_binarizer.transform(y_test)

In [53]:
print(X_train.shape)
print(X_test.shape)
print(y_train_mlb.shape)
print(y_test_mlb.shape)

(21000,)
(9000,)
(21000, 78)
(9000, 78)


In [54]:
multilabel_binarizer.classes_

array(['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', 'Accounting', 'Advertising', 'Agriculture',
       'Aquarius', 'Architecture', 'Aries', 'Arts', 'Automotive',
       'Banking', 'Biotech', 'BusinessServices', 'Cancer', 'Capricorn',
       'Chemicals', 'Communications-Media', 'Construction', 'Consulting',
       'Education', 'Engineering', 'Fashion', 'Gemini', 'Government',
       'HumanResources', 'Internet', 'InvestmentBanking', 'Law',
       'LawEnforcement-Security', 'Leo', 'Libra', 'Manufacturing',
       'Maritime', 'Marketing', 'Military', 'Museums-Libraries',
       'Non-Profit', 'Pisces', 'Publishing', 'RealEstate', 'Religion',
       'Sagittarius', 'Science', 'Scorpio', 'Sports-Recreation',
       'Student', 'Taurus', 'Technology', 'Telecommunications',
       'Transportation', 'Virgo', 'female', 'indUnk', 'male'],
      dtype=object)

**Vectorisation**

In [55]:
vect = CountVectorizer(ngram_range=(1, 2))

In [56]:
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [57]:
# examine & print the document-term matrix
X_train_dtm

<21000x1409533 sparse matrix of type '<class 'numpy.int64'>'
	with 3650132 stored elements in Compressed Sparse Row format>

In [58]:
print(X_train_dtm)

  (0, 12054)	1
  (0, 12055)	1
  (0, 50025)	1
  (0, 50288)	1
  (0, 121713)	1
  (0, 121733)	1
  (0, 187317)	1
  (0, 187422)	1
  (0, 216890)	1
  (0, 216916)	1
  (0, 221120)	1
  (0, 221439)	1
  (0, 222640)	1
  (0, 222874)	1
  (0, 236434)	2
  (0, 236451)	1
  (0, 236485)	1
  (0, 241948)	1
  (0, 242176)	1
  (0, 272700)	1
  (0, 272702)	1
  (0, 289945)	1
  (0, 289946)	1
  (0, 329861)	1
  (0, 329862)	1
  :	:
  (20999, 1224679)	1
  (20999, 1238692)	1
  (20999, 1238720)	1
  (20999, 1255620)	1
  (20999, 1255627)	1
  (20999, 1256736)	2
  (20999, 1256962)	1
  (20999, 1257153)	1
  (20999, 1278872)	1
  (20999, 1278899)	1
  (20999, 1319145)	1
  (20999, 1319156)	1
  (20999, 1329834)	1
  (20999, 1331190)	1
  (20999, 1333920)	1
  (20999, 1333921)	1
  (20999, 1347268)	2
  (20999, 1347742)	1
  (20999, 1347774)	1
  (20999, 1360865)	1
  (20999, 1361540)	1
  (20999, 1381329)	1
  (20999, 1381734)	1
  (20999, 1382919)	1
  (20999, 1382920)	1


In [59]:
X_test_dtm = vect.transform(X_test)

In [60]:
# examine & print the document-term matrix
X_test_dtm

<9000x1409533 sparse matrix of type '<class 'numpy.int64'>'
	with 1075449 stored elements in Compressed Sparse Row format>

In [61]:
print(X_test_dtm)

  (0, 1362)	1
  (0, 1647)	1
  (0, 50025)	1
  (0, 50288)	1
  (0, 124952)	1
  (0, 169685)	1
  (0, 170446)	1
  (0, 221120)	2
  (0, 221636)	1
  (0, 221686)	1
  (0, 230323)	1
  (0, 230590)	1
  (0, 250864)	2
  (0, 251532)	1
  (0, 251734)	1
  (0, 255298)	1
  (0, 255694)	1
  (0, 278460)	2
  (0, 287247)	1
  (0, 324503)	1
  (0, 325129)	1
  (0, 353837)	1
  (0, 354226)	1
  (0, 380754)	1
  (0, 381143)	1
  :	:
  (8999, 997063)	1
  (8999, 1021484)	1
  (8999, 1022637)	1
  (8999, 1029676)	1
  (8999, 1030070)	1
  (8999, 1034793)	1
  (8999, 1045378)	1
  (8999, 1045612)	1
  (8999, 1050005)	1
  (8999, 1050239)	1
  (8999, 1156071)	1
  (8999, 1156181)	1
  (8999, 1157223)	1
  (8999, 1298473)	1
  (8999, 1298498)	1
  (8999, 1304235)	1
  (8999, 1305024)	1
  (8999, 1327062)	2
  (8999, 1327380)	2
  (8999, 1373825)	1
  (8999, 1373875)	1
  (8999, 1377256)	1
  (8999, 1378242)	1
  (8999, 1385213)	1
  (8999, 1385922)	1


In [62]:
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(21000, 1409533)
(9000, 1409533)


In [63]:
blog_df['labels'].to_dict()

{0: ['male', '15', 'Student', 'Leo'],
 1: ['male', '15', 'Student', 'Leo'],
 2: ['male', '15', 'Student', 'Leo'],
 3: ['male', '15', 'Student', 'Leo'],
 4: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 5: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 6: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 7: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 8: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 9: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 10: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 11: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 12: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 13: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 14: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 15: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 16: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 17: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 18: ['male', '33', 'InvestmentBanking', 'Aquarius'],
 19: ['male', '33', 'InvestmentBanking', 'Aq

In [66]:
gender=[]
age=[]
topic=[]
sign=[]
for pattern in blog_df['labels']:
    w=0
    for value in pattern:
        if w==0:
            gender.append(value)
        elif w==1:
            age.append(value)
        elif w==2:
            topic.append(value)
        elif w==3:
            sign.append(value)
        w+=1

In [67]:
age_freq=Counter(age)
gender_freq=Counter(gender)
topic_freq=Counter(topic)
sign_freq=Counter(sign)

In [68]:
def dict_print (age,gender,topic,sign ):
    w = age.copy()   
    w.update(gender)    
    w.update(topic)
    w.update(sign)
    return w


In [69]:
dict_print(age_freq,gender_freq,topic_freq,sign_freq)

Counter({'13': 548,
         '14': 1190,
         '15': 1535,
         '16': 2782,
         '17': 2868,
         '23': 3554,
         '24': 3184,
         '25': 1371,
         '26': 1667,
         '27': 3188,
         '33': 1386,
         '34': 925,
         '35': 2553,
         '36': 1760,
         '37': 186,
         '38': 142,
         '39': 132,
         '40': 183,
         '41': 95,
         '42': 48,
         '43': 55,
         '44': 10,
         '45': 78,
         '46': 204,
         '47': 105,
         '48': 251,
         'Accounting': 63,
         'Advertising': 56,
         'Agriculture': 47,
         'Aquarius': 2866,
         'Architecture': 53,
         'Aries': 6856,
         'Arts': 1119,
         'Automotive': 116,
         'Banking': 109,
         'Biotech': 80,
         'BusinessServices': 218,
         'Cancer': 2370,
         'Capricorn': 1378,
         'Chemicals': 75,
         'Communications-Media': 865,
         'Construction': 28,
         'Consulting': 174,
  

**3. Design, train, tune and test the best text classifier.**

In [70]:
clf=LogisticRegression(solver='lbfgs')
clf=OneVsRestClassifier(clf)

In [71]:
clf.fit(X_train_dtm,y_train_mlb)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [72]:
#Determine predictions on test set

y_pred_test=clf.predict(X_test_dtm)

In [73]:
y_pred_test

array([[0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

**4. Display and explain detail the classification report**

In [74]:
#Evaluating model performance via accuracy score
accuracy_score(y_test_mlb, y_pred_test)

0.13822222222222222

In [75]:
# Evaluating performance of the model using f1 score 

f1_score(y_test_mlb, y_pred_test,average='micro')

0.5122963446238604

In [76]:
# Evaluating performance of the model using recall_score

recall_score(y_test_mlb,y_pred_test,average='micro')

0.39261111111111113

In [77]:
# Evaluating Performance of the model using precision_score

precision_score(y_test_mlb,y_pred_test,average='micro')

0.7369518744460086

**5. Print the true vs predicted labels for any 5 entries from the dataset.**

In [95]:
multilabel_binarizer.inverse_transform(y_pred_test)[0:5]

[('female', 'indUnk'),
 ('male',),
 ('male',),
 ('36', 'Aries', 'Fashion', 'male'),
 ('female',)]

In [96]:
y_test[0:5]

1252           [female, 15, Student, Aquarius]
10444    [female, 26, Transportation, Scorpio]
8994                [male, 15, Student, Virgo]
7463                [male, 36, Fashion, Aries]
1910             [male, 35, Technology, Aries]
Name: labels, dtype: object

In [123]:

y_val_pred_inversed = multilabel_binarizer.inverse_transform(y_pred_test)
y_val_inversed = multilabel_binarizer.inverse_transform(y_test_mlb)
for i in range(10):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
         X_train_dtm[i],
        ','.join(y_val_inversed[i]),
        ','.join(y_val_pred_inversed[i])
    ))

Title:	  (0, 12054)	1
  (0, 12055)	1
  (0, 50025)	1
  (0, 50288)	1
  (0, 121713)	1
  (0, 121733)	1
  (0, 187317)	1
  (0, 187422)	1
  (0, 216890)	1
  (0, 216916)	1
  (0, 221120)	1
  (0, 221439)	1
  (0, 222640)	1
  (0, 222874)	1
  (0, 236434)	2
  (0, 236451)	1
  (0, 236485)	1
  (0, 241948)	1
  (0, 242176)	1
  (0, 272700)	1
  (0, 272702)	1
  (0, 289945)	1
  (0, 289946)	1
  (0, 329861)	1
  (0, 329862)	1
  :	:
  (0, 1159918)	1
  (0, 1160320)	1
  (0, 1165478)	1
  (0, 1165770)	1
  (0, 1197262)	1
  (0, 1197859)	1
  (0, 1218380)	1
  (0, 1218402)	1
  (0, 1226547)	1
  (0, 1227484)	1
  (0, 1242676)	1
  (0, 1245013)	1
  (0, 1257715)	1
  (0, 1258483)	1
  (0, 1307739)	1
  (0, 1308366)	1
  (0, 1350235)	1
  (0, 1350283)	1
  (0, 1359419)	1
  (0, 1359421)	1
  (0, 1385213)	2
  (0, 1386384)	1
  (0, 1387194)	1
  (0, 1402483)	1
  (0, 1402629)	1
True labels:	15,Aquarius,Student,female
Predicted labels:	female,indUnk


Title:	  (0, 495278)	1
  (0, 497079)	1
  (0, 511226)	1
  (0, 511886)	1
  (0, 555992)	1
  (0,

Using the default Threshold valid of 0.5,the probabilities greater than or equal to 0.5 were converted to 1’s and the rest to 0’.

In [98]:
y_pred_prob_test = clf.predict_proba(X_test_dtm)

In [99]:
y_pred_prob_test

array([[6.29763778e-06, 2.13998885e-04, 4.00008984e-03, ...,
        8.87321640e-01, 6.15010365e-01, 1.12678360e-01],
       [1.40123905e-02, 1.40284659e-02, 2.17049091e-02, ...,
        2.90401796e-01, 2.44988664e-01, 7.09598204e-01],
       [3.24566905e-02, 2.52882330e-02, 9.47402856e-03, ...,
        1.33881190e-01, 3.97244030e-01, 8.66118810e-01],
       ...,
       [4.78600287e-04, 1.27010093e-03, 1.26532158e-02, ...,
        8.94558634e-01, 2.10060141e-02, 1.05441366e-01],
       [1.41635338e-03, 1.07504741e-02, 2.15113671e-03, ...,
        9.85709500e-01, 1.29184278e-01, 1.42904997e-02],
       [1.05930660e-04, 2.79313315e-03, 9.42916540e-04, ...,
        2.05433273e-01, 2.05466494e-02, 7.94566727e-01]])

In [100]:
# Set threshold value set to .30 
t = 0.3

In [101]:
y_pred_new_test = (y_pred_prob_test >= t).astype(int)

In [102]:
y_pred_new_test

array([[0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 1],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [103]:
# Evaluating model performance via accuracy score with  threshold limit of .3

accuracy_score(y_test_mlb, y_pred_new_test)

0.15733333333333333

In [104]:
# Evaluate model performance via F1 score with threhold limit of .3 

f1_score(y_test_mlb, y_pred_new_test, average="micro")

0.5593341942889941

In [105]:
# Evaluate Model performance using recall_score with Threshold limit of .3

recall_score(y_test_mlb,y_pred_new_test,average='micro')

0.48725

In [106]:
# Evaluate Model performance using Precision_score with Threshold limit of .3

precision_score(y_test_mlb,y_pred_new_test,average='micro')

0.6564499831593129

In [107]:
#Print the true vs predicted labels for any 5 entries from the dataset in accordance to threshold limit =0.3
multilabel_binarizer.inverse_transform(y_pred_new_test)[0:5]

[('33', 'female', 'indUnk'),
 ('Aries', 'male'),
 ('indUnk', 'male'),
 ('36', 'Aries', 'Fashion', 'male'),
 ('female', 'male')]

In [108]:
y_test[0:5]

1252           [female, 15, Student, Aquarius]
10444    [female, 26, Transportation, Scorpio]
8994                [male, 15, Student, Virgo]
7463                [male, 36, Fashion, Aries]
1910             [male, 35, Technology, Aries]
Name: labels, dtype: object

In [122]:
y_val_pred_inversed = multilabel_binarizer.inverse_transform(y_pred_new_test)
y_val_inversed = multilabel_binarizer.inverse_transform(y_test_mlb)
for i in range(10):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
         X_train_dtm[i],
        ','.join(y_val_inversed[i]),
        ','.join(y_val_pred_inversed[i])
    ))

Title:	  (0, 12054)	1
  (0, 12055)	1
  (0, 50025)	1
  (0, 50288)	1
  (0, 121713)	1
  (0, 121733)	1
  (0, 187317)	1
  (0, 187422)	1
  (0, 216890)	1
  (0, 216916)	1
  (0, 221120)	1
  (0, 221439)	1
  (0, 222640)	1
  (0, 222874)	1
  (0, 236434)	2
  (0, 236451)	1
  (0, 236485)	1
  (0, 241948)	1
  (0, 242176)	1
  (0, 272700)	1
  (0, 272702)	1
  (0, 289945)	1
  (0, 289946)	1
  (0, 329861)	1
  (0, 329862)	1
  :	:
  (0, 1159918)	1
  (0, 1160320)	1
  (0, 1165478)	1
  (0, 1165770)	1
  (0, 1197262)	1
  (0, 1197859)	1
  (0, 1218380)	1
  (0, 1218402)	1
  (0, 1226547)	1
  (0, 1227484)	1
  (0, 1242676)	1
  (0, 1245013)	1
  (0, 1257715)	1
  (0, 1258483)	1
  (0, 1307739)	1
  (0, 1308366)	1
  (0, 1350235)	1
  (0, 1350283)	1
  (0, 1359419)	1
  (0, 1359421)	1
  (0, 1385213)	2
  (0, 1386384)	1
  (0, 1387194)	1
  (0, 1402483)	1
  (0, 1402629)	1
True labels:	15,Aquarius,Student,female
Predicted labels:	33,female,indUnk


Title:	  (0, 495278)	1
  (0, 497079)	1
  (0, 511226)	1
  (0, 511886)	1
  (0, 555992)	1
  

In [119]:
#Printing Test Data
test_predictions=clf.predict(X_test_dtm)
test_pred_inversed = multilabel_binarizer.inverse_transform(test_predictions)

test_predictions = '\n'.join('%i\t%s' % (i, ','.join(row)) for i, row in enumerate(test_pred_inversed))

print('MultilabelClassification: {}'.format(test_predictions))

MultilabelClassification: 0	female,indUnk
1	male
2	male
3	36,Aries,Fashion,male
4	female
5	female,indUnk
6	female,indUnk
7	male
8	male
9	female
10	male
11	female
12	23,Engineering,female
13	14,Aries,indUnk,male
14	Student,male
15	Student,male
16	36,Aries,Fashion,male
17	17,female,indUnk
18	female
19	Aries,male
20	female,indUnk
21	female,indUnk
22	17,Aquarius,female,indUnk
23	male
24	Student,male
25	male
26	male
27	26,35,Taurus,male
28	16,Sagittarius,indUnk,male
29	male
30	Aries,male
31	14,female,indUnk
32	17,female,indUnk
33	female,indUnk
34	male
35	male
36	15,Student,female
37	indUnk,male
38	male
39	male
40	Aries,female
41	Aries,female,indUnk
42	24,Aries,female,indUnk
43	13,15,17,Cancer,Libra,Student,female,indUnk
44	female
45	34,Sagittarius,female,indUnk
46	Aries,Technology,male
47	male
48	23,female
49	male
50	27,Government,Leo,female
51	36,Aries,Fashion,male
52	Cancer,female
53	male
54	35,Aries,male
55	23,female,indUnk
56	Cancer,Technology,male
57	16,Sagittarius,female,indUnk
58	mal

**PART TWO - GL CHATBOT**

In [159]:
import nltk
import numpy as np
import random
import string

import bs4 as bs
import urllib.request
import re
from nltk.chat.util import Chat, reflections

In [125]:
#Importing Corpus
import json

#Import corpus file
with open('GL Bot.json') as file:
  Corpus = json.load(file)

#Display corpus file
print (Corpus)


{'intents': [{'tag': 'Intro', 'patterns': ['hi', 'how are you', 'is anyone there', 'hello', 'whats up', 'hey', 'yo', 'listen', 'please help me', 'i am learner from', 'i belong to', 'aiml batch', 'aifl batch', 'i am from', 'my pm is', 'blended', 'online', 'i am from', 'hey ya', 'talking to you for first time'], 'responses': ['Hello! how can i help you ?'], 'context_set': ''}, {'tag': 'Exit', 'patterns': ['thank you', 'thanks', 'cya', 'see you', 'later', 'see you later', 'goodbye', 'i am leaving', 'have a Good day', 'you helped me', 'thanks a lot', 'thanks a ton', 'you are the best', 'great help', 'too good', 'you are a good learning buddy'], 'responses': ['I hope I was able to assist you, Good Bye'], 'context_set': ''}, {'tag': 'Olympus', 'patterns': ['olympus', 'explain me how olympus works', 'I am not able to understand olympus', 'olympus window not working', 'no access to olympus', 'unable to see link in olympus', 'no link visible on olympus', 'whom to contact for olympus', 'lot of p

In [128]:
# Enhance/add more data to the corpus
Data = {'tag': 'Computer Vision', 'patterns': ['i am not able to understand CNN ', 'Clarify me how CNN outperforms other models', 'i am not able to train CNN model', 'i am not able to evalaute and test CNN models', 'i am not able to understand transfer learning', 'i am not able to understand VGG model', 'i am not able to understand Siamese Network', 'i am not able to understand ResNet', 'i am not able to understand Alexnet', 'i am not able to understand object detection', 'i am not able to understand image classification', 'i am not able to understand RCNN ', 'computer vision', 'object detection', 'image classification', 'cnn', 'rcnn', 'transfer learning', 'resnet', 'vgg', 'alexnet', 'iou', 'region proposal', 'fast rcnn', 'faster rcnn', 'yolo', 'ssd', 'image classifier'], 'responses': ['Link: Computer vision wiki '], 'context_set':''}

In [130]:
Corpus.update(Data)

In [131]:
print(json.dumps(Corpus)) 

{"intents": [{"tag": "Intro", "patterns": ["hi", "how are you", "is anyone there", "hello", "whats up", "hey", "yo", "listen", "please help me", "i am learner from", "i belong to", "aiml batch", "aifl batch", "i am from", "my pm is", "blended", "online", "i am from", "hey ya", "talking to you for first time"], "responses": ["Hello! how can i help you ?"], "context_set": ""}, {"tag": "Exit", "patterns": ["thank you", "thanks", "cya", "see you", "later", "see you later", "goodbye", "i am leaving", "have a Good day", "you helped me", "thanks a lot", "thanks a ton", "you are the best", "great help", "too good", "you are a good learning buddy"], "responses": ["I hope I was able to assist you, Good Bye"], "context_set": ""}, {"tag": "Olympus", "patterns": ["olympus", "explain me how olympus works", "I am not able to understand olympus", "olympus window not working", "no access to olympus", "unable to see link in olympus", "no link visible on olympus", "whom to contact for olympus", "lot of p

In [133]:
# Enhance/add more data to the corpus
Data1 = {'tag': 'AIML projects', 'patterns': ['i am not able to understand applied statistics project', 'i am not able to understand supervised learning project', 'i am not able to understand unsupervised learning project', 'i am not able to understand ensemble techiques project', 'i am not able to understand product recommendations project', 'i am not able to understand neural network project', 'i am not able to understand computer vision project', 'i am not able to understand advanced computer vision project', 'i am not able to create gui ', 'i am not able to import dataset', 'i am not able to load data', 'i am not able to get correct output ', 'i am not able to get best accuracy', 'aiml project', 'sl project', 'ensemble project', 'neural network project', 'nlp project', 'computer vision project', 'gui'], 'responses': ['Transferring the request to your Mentor'], 'context_set':''}

In [134]:
Corpus.update(Data1)

In [135]:
print(json.dumps(Corpus)) 

{"intents": [{"tag": "Intro", "patterns": ["hi", "how are you", "is anyone there", "hello", "whats up", "hey", "yo", "listen", "please help me", "i am learner from", "i belong to", "aiml batch", "aifl batch", "i am from", "my pm is", "blended", "online", "i am from", "hey ya", "talking to you for first time"], "responses": ["Hello! how can i help you ?"], "context_set": ""}, {"tag": "Exit", "patterns": ["thank you", "thanks", "cya", "see you", "later", "see you later", "goodbye", "i am leaving", "have a Good day", "you helped me", "thanks a lot", "thanks a ton", "you are the best", "great help", "too good", "you are a good learning buddy"], "responses": ["I hope I was able to assist you, Good Bye"], "context_set": ""}, {"tag": "Olympus", "patterns": ["olympus", "explain me how olympus works", "I am not able to understand olympus", "olympus window not working", "no access to olympus", "unable to see link in olympus", "no link visible on olympus", "whom to contact for olympus", "lot of p

In [202]:
# Extract data and perform Tokenization
W = [] # tokens
L = [] # Identified Tags
doc_x = [] #tokenised words
doc_y =[] # Tags or labels

for intent in Corpus['intents']:
  for pattern in intent['patterns']:
      w_temp = nltk.word_tokenize(pattern)
      W.extend(w_temp)
      doc_x.append(w_temp)
      doc_y.append(intent["tag"])

#Add the missing tag
if intent['tag'] not in L:
  L.append(intent['tag'])

In [203]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Stemming
Train = []
Target = []

out_empty = [0 for _ in range(len(L))]

#Loop to create Bag of words
for x,doc in enumerate(doc_x):
  bag = []
  w_temp = [w.lower() for w in doc]
  for w in W:
    if w in w_temp:
      bag.append(1)
    else:
      bag.append(0)
  output_row = out_empty[:]
  output_row[L.index(doc_y[x])]=1

  Train.append(bag)
  Target.append(output_row)

In [211]:
def chat():
    print("chat with AIML Agent(type: stop to quit)")
    print("if answer is not right (type : *)")
    while True:
      inp = input ("\n You:")
      if inp.lower()=="*":
        print("BOT : Please rephrase you question and try again")
      if inp.lower() =="quit":
          break
      results = clf.predict([bag_of_words(inp,W)])
      results_index =numpy.argmax(results)
      tag = L[results_index]

      for tg in Corpus["intents"]:
          if tg['tag']==tag:
             responses = tg['responses']
             print (random.choice(responses))

In [212]:
chat()

chat with AIML Agent(type: stop to quit)
if answer is not right (type : *)

 You:hi


NameError: ignored