In [171]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np

# 1. Load the dataset (5 points)
a. Tip: As the dataset is large, use fewer rows. Check what is working well on your machine and decide accordingly.

In [0]:
# read file into pandas using a relative path. Please change the path as needed
df_main = pd.read_csv('/gdrive/My Drive/AIML/statistical-nlp/project/blog-authorship-corpus/blogtext.csv')

In [174]:
df_main.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [175]:
df_main.shape

(681284, 7)

In [176]:
#Checkout missing values
df_main.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [177]:
df_main.isin(['UNKNOWN']).any()

id        False
gender    False
age       False
topic     False
sign      False
date      False
text      False
dtype: bool

# Getting fewer rows as the dataset is very large

In [0]:
df = df_main.iloc[0:5000,:]

In [179]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [180]:
df.shape

(5000, 7)

# 2. Preprocess rows of the “text” column (7.5 points)
## a. Remove unwanted characters
## b. Convert text to lowercase
## c. Remove unwanted spaces
## d. Remove stopwords

***NOTE : Stop words are removed while doing count vectorizer***

In [181]:
!pip install gensim



### Remove HTML tags

In [0]:
# def strip_html_tags(text):
#     soup = BeautifulSoup(text, "html.parser")
#     stripped_text = soup.get_text()
#     return stripped_text

# strip_html_tags('<html><h2>Some important text</h2></html>')

### Remove accented characters

In [183]:
import unicodedata
import re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text



In [0]:
#remove_accented_chars('Sómě Áccěntěd těxt')

### Remove special characters

In [0]:
def remove_special_characters(text, remove_digits=False):
    #Using regex
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text



In [0]:
#remove_special_characters("Well this was fun! What do you think? 123#@!", remove_digits=True)

### Text lemmatization

In [0]:
def lemmatize_text(text):

    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()]) 

In [0]:
#lemmatize_text("My system keeps crashing, his crashed yesterday, ours crashes daily")

### Text stemming

In [0]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

#simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

In [0]:
from gensim.parsing.preprocessing import remove_stopwords


In [0]:
def remove_stop_words(text):
    text = remove_stopwords(text)
    return text

In [0]:
#remove_stopwords("My system keeps crashing his crashed yesterday, ours crashes daily")

### Building a text normalizer

In [0]:
def normalize_corpus(corpus, html_stripping=True, accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True,remove_stopwords=True ):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits) 
        if  remove_stopwords:
            doc = remove_stop_words(doc)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [195]:
df_cleantext = df.copy(deep=True)
df_cleantext['clean_text'] = normalize_corpus(df['text'], html_stripping=False)
df_cleantext.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,clean_text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",info ha pages mb pdf files wait untill team le...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo s toolbar capture url popups mean...


In [196]:
df_cleantext['clean_text'].head()

0    info ha pages mb pdf files wait untill team le...
1    team members drewes van der laag urllink mail ...
2    het kader van kernfusie op aarde maak je eigen...
3                                      testing testing
4    thanks yahoo s toolbar capture url popups mean...
Name: clean_text, dtype: object

In [197]:
df_cleantext.shape

(5000, 8)

# 3. As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence (7.5 points)
## a. Label columns to merge: “gender”, “age”, “topic”, “sign”
## b. After completing the previous step, there should be only two columns in your data frame i.e. “text” and “labels” as shown in the below image

In [0]:
# source_col_loc = df_cleantext.columns.get_loc('gender') # column position starts from 0

# df_cleantext['labels'] = df_cleantext.loc[:,source_col_loc+1:source_col_loc+4].apply(
#     lambda x: ",".join(x.astype(str)), axis=1)

In [0]:
df_merge=df_cleantext.copy(deep=True)

In [200]:
df_merge.shape

(5000, 8)

In [201]:
df_merge.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,clean_text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",info ha pages mb pdf files wait untill team le...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo s toolbar capture url popups mean...


In [202]:
df_merge.dtypes

id             int64
gender        object
age            int64
topic         object
sign          object
date          object
text          object
clean_text    object
dtype: object

In [0]:
df_merge['labels']='[' + df_merge['gender'] + ',' + df_merge['age'].astype(str)  + ',' + df_merge['topic'] + ',' +df_merge['sign'] + ']' 

In [204]:
df_merge.head(100)

Unnamed: 0,id,gender,age,topic,sign,date,text,clean_text,labels
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",info ha pages mb pdf files wait untill team le...,"[male,15,Student,Leo]"
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,team members drewes van der laag urllink mail ...,"[male,15,Student,Leo]"
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,het kader van kernfusie op aarde maak je eigen...,"[male,15,Student,Leo]"
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing,"[male,15,Student,Leo]"
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo s toolbar capture url popups mean...,"[male,33,InvestmentBanking,Aquarius]"
...,...,...,...,...,...,...,...,...,...
95,4172416,female,25,indUnk,Capricorn,"08,August,2004","urlLink im new to this, ...",urllink im new tell god create link column,"[female,25,indUnk,Capricorn]"
96,4172416,female,25,indUnk,Capricorn,"08,August,2004",Election time has rolled aro...,election time ha rolled spitting venom hope im...,"[female,25,indUnk,Capricorn]"
97,3668238,female,17,Student,Gemini,"30,June,2004",http://www.uploadimages.net/i...,httpwww uploadimages netimagespict jpg httpwww...,"[female,17,Student,Gemini]"
98,3668238,female,17,Student,Gemini,"26,June,2004",it was fun :) Hey dad ...,wa fun hey dad im writing tell hate ask feel f...,"[female,17,Student,Gemini]"


In [0]:
df_modified = pd.DataFrame(df_merge, columns  =['clean_text', 'labels'])
df_modified.rename(columns = {'clean_text':'text'}, inplace = True) 


In [206]:
df_modified.shape

(5000, 2)

In [207]:
df_modified.head()

Unnamed: 0,text,labels
0,info ha pages mb pdf files wait untill team le...,"[male,15,Student,Leo]"
1,team members drewes van der laag urllink mail ...,"[male,15,Student,Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male,15,Student,Leo]"
3,testing testing,"[male,15,Student,Leo]"
4,thanks yahoo s toolbar capture url popups mean...,"[male,33,InvestmentBanking,Aquarius]"


# 4. Separate features and labels, and split the data into training and testing (5 points)

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# split X and y into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(df_modified.text, df_modified.labels, random_state=2)

In [210]:
#Traing data
print(x_train.shape)
print(y_train.shape)

(3750,)
(3750,)


In [211]:
#Test Data
print(x_test.shape)
print(y_test.shape)

(1250,)
(1250,)


In [212]:
x_train

4715    dont fuck implied read urllink press release g...
3576    urllink powell distance bush powell told tim r...
4996    mmm strawberry tea breakfast tomorrow think de...
2556                          bunnys hoppin bunnys hoppin
611     wa peeing urinal yesterday hit prefer street s...
                              ...                        
3335                                                   uh
1099    wow thats im saying kids wow love want need ne...
2514                      fairly inept terminator wouldnt
3606    wa reading book odd trivia day wa list origina...
2575    thanks guys support tee hee hee theyre kind he...
Name: text, Length: 3750, dtype: object

# 5. Vectorize the features (5 points)
## a. Create a Bag of Words using count vectorizer
### i. Use ngram_range=(1, 2)
### ii. Vectorize training and testing features
## b. Print the term-document matrix

In [0]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer(stop_words='english',ngram_range=(1,2),min_df=10,max_df=100)
#cvect = CountVectorizer(ngram_range=(1,2))
#cvect = CountVectorizer(stop_words='english')



In [214]:
#Feed text data to CountVectorizer
cvect.fit(x_train)

#Check the vocablury size
len(cvect.vocabulary_)

3314

In [215]:
#What is there in the vocabulary
cvect.vocabulary_

{'fuck': 1107,
 'press': 2200,
 'release': 2343,
 'comic': 508,
 'youll': 3305,
 'pure': 2255,
 'anger': 89,
 'apparently': 112,
 'peter': 2095,
 'website': 3194,
 'goddamn': 1156,
 'fairly': 977,
 'quality': 2261,
 'spiderman': 2693,
 'fucking': 1108,
 'honestly': 1332,
 'appeal': 113,
 'india': 1442,
 'fashion': 992,
 'sense': 2513,
 'shoes': 2561,
 'completely': 527,
 'crap': 599,
 'read urllink': 2303,
 'thought wa': 2907,
 'distance': 772,
 'bush': 345,
 'meet': 1805,
 'intelligence': 1462,
 'provided': 2245,
 'belief': 228,
 'effort': 865,
 'win': 3227,
 'public': 2248,
 'approval': 125,
 'war': 3163,
 'referring': 2329,
 'appearance': 115,
 'security': 2502,
 'iraq': 1479,
 'developing': 731,
 'including': 1436,
 'weapons': 3188,
 'information': 1448,
 'offered': 1980,
 'administration': 29,
 'turned': 3003,
 'disappointed': 757,
 'regret': 2335,
 'wa wrong': 3146,
 'strawberry': 2748,
 'tea': 2833,
 'breakfast': 302,
 'depressed': 708,
 'lazy': 1623,
 'reminds': 2359,
 'ton': 2

Build Document-term Matrix (DTM)

In [0]:
#Convert Training texts into Count Vectors
X_train_ct = cvect.transform(x_train)

In [217]:
print(cvect.get_feature_names())



***NOTE : System is crashing while printing DTM so commenting the code below***

In [0]:
# #print(X_train_ct.toarray())
# # Create dataFrame
# df_dtm = pd.DataFrame(X_train_ct.toarray().transpose(),
#                    index=cvect.get_feature_names())
 
# # # Change column headers
# # df2.columns = df_dtm.columns
# # print(df2)

# df_dtm.head()

In [219]:
#Size of Document Term Matrix
X_train_ct.shape

(3750, 3314)

In [220]:
#Let's check the first record
X_train_ct[0]

<1x3314 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [221]:
#What's there in sparse matrix
print(X_train_ct[0])

  (0, 89)	1
  (0, 112)	1
  (0, 113)	1
  (0, 508)	2
  (0, 527)	1
  (0, 599)	1
  (0, 977)	1
  (0, 992)	1
  (0, 1107)	1
  (0, 1108)	1
  (0, 1156)	1
  (0, 1332)	1
  (0, 1442)	1
  (0, 2095)	1
  (0, 2200)	2
  (0, 2255)	1
  (0, 2261)	1
  (0, 2303)	1
  (0, 2343)	2
  (0, 2513)	1
  (0, 2561)	1
  (0, 2693)	1
  (0, 2907)	1
  (0, 3194)	1
  (0, 3305)	1


Convert Test SMS also in numerical features

In [0]:
X_test_ct = cvect.transform(x_test)

In [223]:
X_test_ct.shape

(1250, 3314)

# 6. Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label. Check below image for reference (5 points)

In [224]:
df_modified.head()

Unnamed: 0,text,labels
0,info ha pages mb pdf files wait untill team le...,"[male,15,Student,Leo]"
1,team members drewes van der laag urllink mail ...,"[male,15,Student,Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male,15,Student,Leo]"
3,testing testing,"[male,15,Student,Leo]"
4,thanks yahoo s toolbar capture url popups mean...,"[male,33,InvestmentBanking,Aquarius]"


In [225]:
df_modified.shape

(5000, 2)

In [0]:
df_labels = df_modified['labels']


In [227]:
df_labels.head()

0                   [male,15,Student,Leo]
1                   [male,15,Student,Leo]
2                   [male,15,Student,Leo]
3                   [male,15,Student,Leo]
4    [male,33,InvestmentBanking,Aquarius]
Name: labels, dtype: object

In [0]:
label_arr = df_labels.to_numpy()

In [229]:
type(label_arr)

numpy.ndarray

In [230]:
label_arr

array(['[male,15,Student,Leo]', '[male,15,Student,Leo]',
       '[male,15,Student,Leo]', ..., '[female,17,indUnk,Scorpio]',
       '[female,17,indUnk,Scorpio]', '[female,17,indUnk,Scorpio]'],
      dtype=object)

In [231]:
type(label_arr[0])

str

In [232]:
map = {}
count = 0
for label in label_arr:
  s = label[1:len(label)-1]
  a = s.split(',')
  count = count+ 1
  for token in a:
    if(token not in map.keys()):
      map[token] = 1
    else:
      map[token]= map[token] + 1 
print(count)

5000


In [233]:
for token, count in map.items(): 
    print(token, ":", count) 

male : 3294
15 : 339
Student : 569
Leo : 190
33 : 101
InvestmentBanking : 70
Aquarius : 329
female : 1706
14 : 170
indUnk : 1381
Aries : 2483
25 : 268
Capricorn : 84
17 : 331
Gemini : 86
23 : 137
Non-Profit : 47
Cancer : 94
Banking : 16
37 : 19
Sagittarius : 704
26 : 96
24 : 353
Scorpio : 408
27 : 86
Education : 118
45 : 14
Engineering : 119
Libra : 414
Science : 33
34 : 540
41 : 14
Communications-Media : 61
BusinessServices : 87
Sports-Recreation : 75
Virgo : 41
Taurus : 100
Arts : 31
Pisces : 67
44 : 3
16 : 67
Internet : 20
Museums-Libraries : 2
Accounting : 2
39 : 79
35 : 2307
Technology : 2332
36 : 60
Law : 3
46 : 7
Consulting : 16
Automotive : 14
42 : 9
Religion : 4


# 7. Transform the labels - (7.5 points) As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose, it is convenient to use MultiLabelBinarizer from sklearn
## a. Convert your train and test labels using MultiLabelBinarizer

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer


In [235]:
y_train[0]

'[male,15,Student,Leo]'

In [236]:
y_test

3566              [male,35,Technology,Aries]
4252          [female,34,indUnk,Sagittarius]
1918              [male,35,Technology,Aries]
4111          [female,34,indUnk,Sagittarius]
1471              [male,35,Technology,Aries]
                        ...                 
3444              [male,35,Technology,Aries]
2624              [male,35,Technology,Aries]
1691              [male,35,Technology,Aries]
3706    [male,39,Communications-Media,Libra]
4575          [female,34,indUnk,Sagittarius]
Name: labels, Length: 1250, dtype: object

In [0]:
mlb = MultiLabelBinarizer()


y_train_mlb = mlb.fit_transform(y_train)
y_test_mlb = mlb.transform(y_test)


In [238]:
y_train_mlb[100]

array([1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0])

In [239]:
print(y_train_mlb.shape)
print(y_test_mlb.shape)

(3750, 50)
(1250, 50)


# 8. choose a classifier - (5 points) In this task, we suggest using the One-vs-Rest approach, which is implemented in OneVsRestClassifier class. In this approach k classifiers (= number of tags) are trained. As a basic classifier, use LogisticRegression. It is one of the simplest methods, but often it performs good enough in text classification tasks. It might take some time because the number of classifiers to train is large.
## a. Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on every label
## b. As One-vs-Rest approach might not have been discussed in the sessions, we are providing you the code for that

In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression(solver='lbfgs',max_iter=1000)
clf = OneVsRestClassifier(lg)

In [241]:
clf.fit(X_train_ct, y_train_mlb)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [0]:
predicted = clf.predict(X_test_ct)

In [243]:
predicted

array([[1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       ...,
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [0]:
# summarize the fit of the model
model_score = clf.score(X_test_ct, y_test_mlb)


In [245]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test_mlb, predicted)

0.3592

# Fit the classifier, make predictions and get the accuracy (5 points)
## a. Print the following
### i. Accuracy score
### ii. F1 score
### iii. Average precision score
### iv. Average recall score
### v. Tip: Make sure you are familiar with all of them. How would you expect the things to work for the multi-label scenario? Read about micro/macro/weighted averaging

In [246]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
print('Accuracy: ', accuracy_score(y_test_mlb, predicted))
print('Precison: ', precision_score(y_test_mlb, predicted,average='micro'))
print('Recall: ', recall_score(y_test_mlb, predicted,average='micro'))
print('F1: ', f1_score(y_test_mlb, predicted,average='micro'))

Accuracy:  0.3592
Precison:  0.9051040525739321
Recall:  0.8406852771221617
F1:  0.871706154138273


# 10. Print true label and predicted label for any five examples (7.5 points)

In [247]:
#type(y_test)
y_test.head(1)
#print(len(y_test.head(1)))


3566    [male,35,Technology,Aries]
Name: labels, dtype: object

In [248]:
y_test_mlb[0]
y = mlb.inverse_transform(y_test_mlb)
len(y[0])

20

In [249]:
for i in range(0,4):
  print("True Label : " ,y_test_mlb[i])
  print("Predicted Label: " ,predicted[i])

True Label :  [1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 1 0 1 0 1 1 1 0
 1 1 1 1 0 0 1 1 0 0 0 0 1]
Predicted Label:  [1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 1 0 1 0 1 1 1 0
 1 1 1 1 0 0 1 1 0 0 0 0 1]
True Label :  [1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 1 1
 1 1 1 0 0 0 1 1 1 1 0 0 0]
Predicted Label:  [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 1 1 0 1 0 1 0
 1 1 1 1 0 0 1 0 1 1 0 0 0]
True Label :  [1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 1 0 1 0 1 1 1 0
 1 1 1 1 0 0 1 1 0 0 0 0 1]
Predicted Label:  [1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 1 0 1 0 1 1 1 0
 1 1 1 1 0 0 1 1 0 0 0 0 1]
True Label :  [1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 1 1
 1 1 1 0 0 0 1 1 1 1 0 0 0]
Predicted Label:  [1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 1 1
 1 1 1 0 0 0 1 1 1 1 0 0 0]


In [250]:
y = mlb.inverse_transform(y_test_mlb)
y[0]

(',',
 '3',
 '5',
 'A',
 'T',
 '[',
 ']',
 'a',
 'c',
 'e',
 'g',
 'h',
 'i',
 'l',
 'm',
 'n',
 'o',
 'r',
 's',
 'y')

In [251]:
y_pred = mlb.inverse_transform(predicted)
y_pred[0]

(',',
 '3',
 '5',
 'A',
 'T',
 '[',
 ']',
 'a',
 'c',
 'e',
 'g',
 'h',
 'i',
 'l',
 'm',
 'n',
 'o',
 'r',
 's',
 'y')

In [None]:
for i in range(0,4):
  print("True Label : " ,y[i])
  print("Predicted Label: " ,y_pred[i])