In [1140]:
import pandas as pd
import numpy as np

# 1. Load the dataset

In [1141]:
df = pd.read_csv("blogtext.csv")

In [1142]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [1143]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
id        681284 non-null int64
gender    681284 non-null object
age       681284 non-null int64
topic     681284 non-null object
sign      681284 non-null object
date      681284 non-null object
text      681284 non-null object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [1144]:
#check missing values
df.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [1145]:
#no of entries
df.shape

(681284, 7)

In [1146]:
#taking only first 3 lakh entries for processing
#df = df.iloc[0:300000]
#df = df.iloc[0:100000]
df = df.iloc[0:5000]

In [1147]:
df.shape

(5000, 7)

# 2. Pre-process rows of the text column

In [1148]:
!pip install nltk --quiet

In [1149]:
import nltk

In [1150]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nm949133\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1151]:
#before removing the special characters
df['text']

0                  Info has been found (+/- 100 pages,...
1                  These are the team members:   Drewe...
2                  In het kader van kernfusie op aarde...
3                        testing!!!  testing!!!          
4                    Thanks to Yahoo!'s Toolbar I can ...
5                    I had an interesting conversation...
6                    Somehow Coca-Cola has a way of su...
7                    If anything, Korea is a country o...
8                    Take a read of this news article ...
9                    I surf the English news sites a l...
10                   Ah, the Korean language...it look...
11                   If you click on my profile you'll...
12                   Last night was pretty fun...mostl...
13                   There is so much that is differen...
14                    urlLink    Here it is, the super...
15                   One thing I love about Seoul (and...
16                    urlLink    Wonderful oh-gyup-sal...
17            

## Remove Unwanted Characters

In [1152]:
import re

In [1153]:
# we will use regex library to remove the unwanted characters
# replace all characters, except alphabets and numbers, with a space and create a new column as text2
df['text2'] = [re.sub("[^A-Za-z0-9]+"," ",e) for e in df['text']]

In [1154]:
# after removing unwanted characters
df['text2']

0        Info has been found 100 pages and 4 5 MB of p...
1        These are the team members Drewes van der Laa...
2        In het kader van kernfusie op aarde MAAK JE E...
3                                        testing testing 
4        Thanks to Yahoo s Toolbar I can now capture t...
5        I had an interesting conversation with my Dad...
6        Somehow Coca Cola has a way of summing up thi...
7        If anything Korea is a country of extremes Ev...
8        Take a read of this news article from urlLink...
9        I surf the English news sites a lot looking f...
10       Ah the Korean language it looks so difficult ...
11       If you click on my profile you ll make a not ...
12       Last night was pretty fun mostly because of t...
13       There is so much that is different here from ...
14       urlLink Here it is the superfantastic phonebo...
15       One thing I love about Seoul and I mean this ...
16       urlLink Wonderful oh gyup sal at my favorite ...
17       Here 

## convert text to lowercase

In [1155]:
#converting all the words to lowercase using lower() function
df['text3'] = [word.lower() for word in df['text2']]

In [1156]:
df['text3']

0        info has been found 100 pages and 4 5 mb of p...
1        these are the team members drewes van der laa...
2        in het kader van kernfusie op aarde maak je e...
3                                        testing testing 
4        thanks to yahoo s toolbar i can now capture t...
5        i had an interesting conversation with my dad...
6        somehow coca cola has a way of summing up thi...
7        if anything korea is a country of extremes ev...
8        take a read of this news article from urllink...
9        i surf the english news sites a lot looking f...
10       ah the korean language it looks so difficult ...
11       if you click on my profile you ll make a not ...
12       last night was pretty fun mostly because of t...
13       there is so much that is different here from ...
14       urllink here it is the superfantastic phonebo...
15       one thing i love about seoul and i mean this ...
16       urllink wonderful oh gyup sal at my favorite ...
17       here 

## remove unwanted spaces

In [1157]:
# replace all 'more than 1 spaces' as single space and create a new column as text4
df['text4'] = (df['text3'].str.split()).str.join(' ')

In [1158]:
df['text4']

0       info has been found 100 pages and 4 5 mb of pd...
1       these are the team members drewes van der laag...
2       in het kader van kernfusie op aarde maak je ei...
3                                         testing testing
4       thanks to yahoo s toolbar i can now capture th...
5       i had an interesting conversation with my dad ...
6       somehow coca cola has a way of summing up thin...
7       if anything korea is a country of extremes eve...
8       take a read of this news article from urllink ...
9       i surf the english news sites a lot looking fo...
10      ah the korean language it looks so difficult a...
11      if you click on my profile you ll make a not s...
12      last night was pretty fun mostly because of th...
13      there is so much that is different here from a...
14      urllink here it is the superfantastic phonebox...
15      one thing i love about seoul and i mean this a...
16      urllink wonderful oh gyup sal at my favorite p...
17      here i

## Remove stopwords

In [1159]:
#from nltk.corpus import stopwords
#stop = stopwords.words('english')

#df['text5'] = df['text4'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [1160]:
#df['text5']

In [1161]:
#df.head()

# clean_text

In [1162]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [1241]:
df['text5'] = df['text4'].map(lambda com : clean_text(com))
df['text5'][3]

'testing testing'

# Do Lemmatization

In [1164]:
from nltk.stem import WordNetLemmatizer

def get_lemmatized_text(corpus):
    
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

In [1165]:
#Create a new column to hold lemmatized reviews
df['text5'] = get_lemmatized_text(df['text5'].tolist())

In [1166]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,text2,text3,text4,text5
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",Info has been found 100 pages and 4 5 MB of p...,info has been found 100 pages and 4 5 mb of p...,info has been found 100 pages and 4 5 mb of pd...,info ha been found 100 page and 4 5 mb of pdf ...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,These are the team members Drewes van der Laa...,these are the team members drewes van der laa...,these are the team members drewes van der laag...,these are the team member drewes van der laag ...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,In het kader van kernfusie op aarde MAAK JE E...,in het kader van kernfusie op aarde maak je e...,in het kader van kernfusie op aarde maak je ei...,in het kader van kernfusie op aarde maak je ei...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing,testing testing,testing testing,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,Thanks to Yahoo s Toolbar I can now capture t...,thanks to yahoo s toolbar i can now capture t...,thanks to yahoo s toolbar i can now capture th...,thanks to yahoo s toolbar i can now capture th...


In [1167]:
df['text5'][0]

'info ha been found 100 page and 4 5 mb of pdf file now i have to wait untill our team leader ha processed it and learns html'

# Do Stemming

In [1168]:
from nltk.stem import PorterStemmer

#Function to Stem words
def get_stemmed_text(corpus):
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

In [1169]:
df['text5'] = get_stemmed_text(df['text5'].tolist())

In [1170]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,text2,text3,text4,text5
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",Info has been found 100 pages and 4 5 MB of p...,info has been found 100 pages and 4 5 mb of p...,info has been found 100 pages and 4 5 mb of pd...,info ha been found 100 page and 4 5 mb of pdf ...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,These are the team members Drewes van der Laa...,these are the team members drewes van der laa...,these are the team members drewes van der laag...,these are the team member drew van der laag ur...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,In het kader van kernfusie op aarde MAAK JE E...,in het kader van kernfusie op aarde maak je e...,in het kader van kernfusie op aarde maak je ei...,in het kader van kernfusi op aard maak je eige...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing,testing testing,testing testing,test test
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,Thanks to Yahoo s Toolbar I can now capture t...,thanks to yahoo s toolbar i can now capture t...,thanks to yahoo s toolbar i can now capture th...,thank to yahoo s toolbar i can now captur the ...


In [1245]:
df['text5'][4111]

'as i burned my tongue on my oatmeal this morning i started thinking about how my life has lately been a pure example of ying yang a complete balance of good bad examples being first of all i had to cancel a very chi chi dinner i had planned to include a 5 course meal under a full miami moon oceanside with 4 french wines to accompany the meal masquerade gourmet cuisine great people to say the least i was extremely dissapointed in what to me represented complete and utter failure on my part but then on the day of the cancelled dinner someone handed me an article extolling another past event that i put together the writer bless him described the event as cozy and refreshing with the company of gentle folk as he described it exactly as i remember it and i felt really good to know that someone else besides myself really enjoyed the fruits of my labor so i am once again reminded that you take the good with the bad even when i am at my lowest point it takes so little to bring me back to the 

In [1172]:
df_cleantext = df.copy(deep=True)

In [1173]:
df_cleantext['text'] = df_cleantext['text5']
df_cleantext.drop(['text2','text3','text4','text5'],axis=1,inplace=True)

In [1174]:
df_cleantext.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info ha been found 100 page and 4 5 mb of pdf ...
1,2059027,male,15,Student,Leo,"13,May,2004",these are the team member drew van der laag ur...
2,2059027,male,15,Student,Leo,"12,May,2004",in het kader van kernfusi op aard maak je eige...
3,2059027,male,15,Student,Leo,"12,May,2004",test test
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thank to yahoo s toolbar i can now captur the ...


# 3. As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence (7.5 points)
## a. Label columns to merge: “gender”, “age”, “topic”, “sign”
## b. After completing the previous step, there should be only two columns in your data frame i.e. “text” and “labels” as shown in the below image

In [1175]:
df_merge=df_cleantext.copy(deep=True)

In [1176]:
df_merge.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info ha been found 100 page and 4 5 mb of pdf ...
1,2059027,male,15,Student,Leo,"13,May,2004",these are the team member drew van der laag ur...
2,2059027,male,15,Student,Leo,"12,May,2004",in het kader van kernfusi op aard maak je eige...
3,2059027,male,15,Student,Leo,"12,May,2004",test test
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thank to yahoo s toolbar i can now captur the ...


In [1177]:
df_merge.shape

(5000, 7)

In [1178]:
df_merge.dtypes

id         int64
gender    object
age        int64
topic     object
sign      object
date      object
text      object
dtype: object

In [1179]:
#df_merge['labels']='[' + df_merge['gender'] + ',' + df_merge['age'].astype(str)  + ',' + df_merge['topic'] + ',' +df_merge['sign'] + ']' 
df_merge['labels']=df_merge['gender'] + ',' + df_merge['age'].astype(str)  + ',' + df_merge['topic'] + ',' +df_merge['sign']

In [1180]:
df_modified = pd.DataFrame(df_merge, columns  =['text', 'labels'])
df_modified.head()

Unnamed: 0,text,labels
0,info ha been found 100 page and 4 5 mb of pdf ...,"male,15,Student,Leo"
1,these are the team member drew van der laag ur...,"male,15,Student,Leo"
2,in het kader van kernfusi op aard maak je eige...,"male,15,Student,Leo"
3,test test,"male,15,Student,Leo"
4,thank to yahoo s toolbar i can now captur the ...,"male,33,InvestmentBanking,Aquarius"


In [1181]:
df_modified.shape

(5000, 2)

# 4. Separate features and labels, and split the data into training and testing (5 points)

In [1182]:
from sklearn.model_selection import train_test_split

In [1183]:
# split X and y into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(df_modified.text, df_modified.labels, test_size=0.2,random_state=2)

In [1184]:
#Training data
print(x_train.shape)
print(y_train.shape)

(4000,)
(4000,)


In [1185]:
#Test data
print(x_test.shape)
print(y_test.shape)

(1000,)
(1000,)


In [1186]:
x_train

868     ye thi mean that i am back home again actual w...
936     well look at the amount of homework i ve got f...
4034    hey it s been a bit wow king arthur wa the las...
1923    from the secret citi what doe ad free my fello...
3330    i ll probabl be move to canada in decemb we ge...
676     i m back unfortun i got my mattress yesterday ...
1145    music zanzibar suburban legend fall without yo...
1301    spare the rod save the child missouri s youth ...
2458    man wear name tag to make the world a friendli...
1242    everyth s lost everyth fade you have noth and ...
4195    ugh haircut everytim i get my hair cut i m rem...
3670    but is there a cure from an honest to god real...
3160    meanwhil thi headlin from cnn polic await mich...
3970    anoth day confus weav a web around my thought ...
4315    and later that day i receiv a call today from ...
2965    from a post on monster other critic requir sig...
3593             the berat will continu until blog improv
625     i got 

# 5. Vectorize the features (5 points)
## a. Create a Bag of Words using count vectorizer
### i. Use ngram_range=(1, 2)
### ii. Vectorize training and testing features
## b. Print the term-document matrix

In [1187]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer(stop_words='english', ngram_range=(1,2))

In [1188]:
#Feed text data to CountVectorizer
cvect.fit(x_train)

#Check the vocablury size
len(cvect.vocabulary_)

239055

In [1189]:
#What is there in the vocabulary
cvect.vocabulary_

{'ye': 236987,
 'thi': 205116,
 'mean': 130435,
 'home': 97208,
 'actual': 5855,
 'got': 85324,
 'saturday': 177532,
 'night': 141587,
 'pretti': 160964,
 'busi': 30723,
 'trip': 214523,
 'wa': 223308,
 'awesom': 17420,
 'onli': 146221,
 'time': 209291,
 'chicago': 36692,
 'twice': 216040,
 'just': 109172,
 'connector': 43867,
 'flight': 76408,
 'hare': 91942,
 'seen': 180796,
 'citi': 38225,
 'stay': 195028,
 'downtown': 60222,
 'walk': 225734,
 'lake': 114254,
 'shore': 184115,
 'park': 150351,
 'area': 13973,
 'beauti': 20173,
 'unfortun': 217426,
 'traffic': 213138,
 'terribl': 204100,
 'wors': 235263,
 'nyc': 143756,
 'imo': 102060,
 'heard': 93295,
 'la': 113803,
 'realli': 167553,
 'piti': 155664,
 'everyth': 68494,
 'like': 118878,
 'museum': 137571,
 'shedd': 183230,
 'aquarium': 13814,
 'close': 39419,
 'day': 51033,
 'friday': 79311,
 'went': 230152,
 'visit': 222709,
 'campu': 32116,
 'love': 125078,
 'felt': 74032,
 'sort': 191208,
 'prioriti': 161582,
 'want': 226302,
 'p

In [1190]:
#Convert Training into Count Vectors
x_train_ct = cvect.transform(x_train)

In [1191]:
x_train_ct.shape

(4000, 239055)

In [1192]:
x_train_ct[0]

<1x239055 sparse matrix of type '<class 'numpy.int64'>'
	with 545 stored elements in Compressed Sparse Row format>

In [1193]:
#What's there in sparse matrix
print(x_train_ct[0])

  (0, 471)	2
  (0, 530)	1
  (0, 576)	1
  (0, 916)	1
  (0, 919)	1
  (0, 1728)	1
  (0, 1750)	1
  (0, 2500)	1
  (0, 2530)	1
  (0, 2970)	1
  (0, 2977)	1
  (0, 4052)	1
  (0, 4061)	1
  (0, 4085)	1
  (0, 4089)	1
  (0, 5855)	2
  (0, 5992)	1
  (0, 6077)	1
  (0, 8361)	1
  (0, 8365)	1
  (0, 10239)	1
  (0, 10308)	1
  (0, 10995)	1
  (0, 11210)	1
  (0, 13511)	1
  :	:
  (0, 229318)	1
  (0, 229479)	1
  (0, 230152)	1
  (0, 230578)	1
  (0, 232021)	1
  (0, 232033)	1
  (0, 233280)	1
  (0, 233340)	1
  (0, 233455)	1
  (0, 233579)	1
  (0, 235263)	2
  (0, 235300)	1
  (0, 235307)	1
  (0, 235615)	1
  (0, 235625)	1
  (0, 236442)	1
  (0, 236444)	1
  (0, 236828)	1
  (0, 236830)	1
  (0, 236987)	1
  (0, 237203)	1
  (0, 237244)	1
  (0, 237389)	1
  (0, 238649)	1
  (0, 238677)	1


In [1194]:
#convert x_test also into numerical features
x_test_ct = cvect.transform(x_test)

In [1195]:
x_test_ct.shape

(1000, 239055)

# 6. Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label. Check below image for reference (5 points)

In [1196]:
df_labels = df_modified['labels']

In [1197]:
df_labels.head()

0                   male,15,Student,Leo
1                   male,15,Student,Leo
2                   male,15,Student,Leo
3                   male,15,Student,Leo
4    male,33,InvestmentBanking,Aquarius
Name: labels, dtype: object

In [1198]:
label_arr = df_labels.to_numpy()

In [1199]:
label_arr 

array(['male,15,Student,Leo', 'male,15,Student,Leo',
       'male,15,Student,Leo', ..., 'female,17,indUnk,Scorpio',
       'female,17,indUnk,Scorpio', 'female,17,indUnk,Scorpio'],
      dtype=object)

In [1200]:
map = {}
count = 0
for label in label_arr:
  s = label[1:len(label)-1]
  a = s.split(',')
  count = count+ 1
  for token in a:
    if(token not in map.keys()):
      map[token] = 1
    else:
      map[token]= map[token] + 1 
print(count)

5000


In [1201]:
for token, count in map.items(): 
    print(token, ":", count) 

ale : 3294
15 : 339
Student : 569
Le : 190
33 : 101
InvestmentBanking : 70
Aquariu : 329
emale : 1706
14 : 170
indUnk : 1381
Arie : 2483
25 : 268
Capricor : 84
17 : 331
Gemin : 86
23 : 137
Non-Profit : 47
Cance : 94
Banking : 16
37 : 19
Sagittariu : 704
26 : 96
24 : 353
Scorpi : 408
27 : 86
Education : 118
45 : 14
Engineering : 119
Libr : 414
Science : 33
34 : 540
41 : 14
Communications-Media : 61
BusinessServices : 87
Sports-Recreation : 75
Virg : 41
Tauru : 100
Arts : 31
Pisce : 67
44 : 3
16 : 67
Internet : 20
Museums-Libraries : 2
Accounting : 2
39 : 79
35 : 2307
Technology : 2332
36 : 60
Law : 3
46 : 7
Consulting : 16
Automotive : 14
42 : 9
Religion : 4


# 7. Transform the labels - (7.5 points) As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose, it is convenient to use MultiLabelBinarizer from sklearn
## a. Convert your train and test labels using MultiLabelBinarizer

In [1243]:
from sklearn.preprocessing import MultiLabelBinarizer

In [1203]:
# an empty list
y_train_list = [] 

for i in y_train: 
    y_train_list.append(i.split(",")) 
    #print(i)
    #print(i.split(","))
    

In [1204]:
y_train_list

[['male', '17', 'Sports-Recreation', 'Capricorn'],
 ['male', '14', 'Student', 'Scorpio'],
 ['male', '14', 'Student', 'Leo'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '24', 'Engineering', 'Libra'],
 ['female', '15', 'Student', 'Libra'],
 ['male', '39', 'Education', 'Virgo'],
 ['male', '35', 'Technology', 'Aries'],
 ['female', '15', 'Student', 'Aquarius'],
 ['female', '34', 'indUnk', 'Sagittarius'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '35', 'Technology', 'Aries'],
 ['female', '25', 'BusinessServices', 'Aries'],
 ['female', '34', 'indUnk', 'Sagittarius'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '24', 'Engineering', 'Libra'],
 ['male', '26', 'indUnk', 'Gemini'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '35', 'Tec

In [1205]:
# an empty list
y_test_list = [] 

for i in y_test: 
    y_test_list.append(i.split(",")) 
    #print(i)
    #print(i.split(","))
    

In [1206]:
y_test_list

[['male', '35', 'Technology', 'Aries'],
 ['female', '34', 'indUnk', 'Sagittarius'],
 ['male', '35', 'Technology', 'Aries'],
 ['female', '34', 'indUnk', 'Sagittarius'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '14', 'Student', 'Scorpio'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '25', 'Arts', 'Aries'],
 ['female', '15', 'Student', 'Libra'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '35', 'Technology', 'Aries'],
 ['female', '34', 'indUnk', 'Sagittarius'],
 ['female', '14', 'indUnk', 'Aries'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '35', 'Technology', 'Aries'],
 ['female', '24', 'indUnk', 'Scorpio'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '14', 'Student', 'Scorpio'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '33', 'InvestmentBanking', 'Aquarius'],
 ['male', '24', 'Engineering', 'Libra'],
 ['male', '35', 'Technology', 'Aries'],
 ['male', '35', 'Technology', 'Aries'],
 ['female', '17', 'Student',

In [1207]:
mlb = MultiLabelBinarizer()


y_train_mlb = mlb.fit_transform(y_train_list)
y_test_mlb = mlb.transform(y_test_list)

In [1208]:
list(mlb.classes_)

['14',
 '15',
 '16',
 '17',
 '23',
 '24',
 '25',
 '26',
 '27',
 '33',
 '34',
 '35',
 '36',
 '37',
 '39',
 '41',
 '42',
 '44',
 '45',
 '46',
 'Accounting',
 'Aquarius',
 'Aries',
 'Arts',
 'Automotive',
 'Banking',
 'BusinessServices',
 'Cancer',
 'Capricorn',
 'Communications-Media',
 'Consulting',
 'Education',
 'Engineering',
 'Gemini',
 'Internet',
 'InvestmentBanking',
 'Law',
 'Leo',
 'Libra',
 'Museums-Libraries',
 'Non-Profit',
 'Pisces',
 'Religion',
 'Sagittarius',
 'Science',
 'Scorpio',
 'Sports-Recreation',
 'Student',
 'Taurus',
 'Technology',
 'Virgo',
 'female',
 'indUnk',
 'male']

In [1209]:
y_train_mlb[100]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1])

In [1210]:
print(y_train_mlb.shape)
print(y_test_mlb.shape)

(4000, 54)
(1000, 54)


# 8. choose a classifier - (5 points) In this task, we suggest using the One-vs-Rest approach, which is implemented in OneVsRestClassifier class. In this approach k classifiers (= number of tags) are trained. As a basic classifier, use LogisticRegression. It is one of the simplest methods, but often it performs good enough in text classification tasks. It might take some time because the number of classifiers to train is large.
## a. Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on every label
## b. As One-vs-Rest approach might not have been discussed in the sessions, we are providing you the code for that

In [1211]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression(solver='lbfgs',max_iter=1000)
#lg = LogisticRegression(solver='lbfgs')
clf = OneVsRestClassifier(lg)

In [1212]:
clf.fit(x_train_ct, y_train_mlb)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [1213]:
predicted = clf.predict(x_test_ct)

In [1214]:
predicted

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0]])

In [1215]:
# summarize the fit of the model
model_score = clf.score(x_test_ct, y_test_mlb)
print(model_score)

0.552


In [1216]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test_mlb, predicted)

0.552

# Fit the classifier, make predictions and get the accuracy (5 points)
## a. Print the following
### i. Accuracy score
### ii. F1 score
### iii. Average precision score
### iv. Average recall score
### v. Tip: Make sure you are familiar with all of them. How would you expect the things to work for the multi-label scenario? Read about micro/macro/weighted averaging

In [1217]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
print('Accuracy: ', accuracy_score(y_test_mlb, predicted))
print('Precison: ', precision_score(y_test_mlb, predicted,average='micro'))
print('Recall: ', recall_score(y_test_mlb, predicted,average='micro'))
print('F1: ', f1_score(y_test_mlb, predicted,average='micro'))

Accuracy:  0.552
Precison:  0.84435558289757
Recall:  0.68625
F1:  0.7571369466280513


# 10. Print true label and predicted label for any five examples (7.5 points)

In [1235]:
for i in range(0,5):
    print("    True Label : " ,y_test_mlb[i])
    print("Predicted Label: " ,predicted[i])
    print("\n")    

    True Label :  [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
Predicted Label:  [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]


    True Label :  [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0]
Predicted Label:  [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0]


    True Label :  [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
Predicted Label:  [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]


    True Label :  [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0]
Predicted Label:  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [1236]:
y = mlb.inverse_transform(y_test_mlb)
y_pred = mlb.inverse_transform(predicted)

In [1240]:
for i in range(0,5):
    print("    True Label : " ,y[i])
    print("Predicted Label: " ,y_pred[i])
    print("\n")

    True Label :  ('35', 'Aries', 'Technology', 'male')
Predicted Label:  ('35', 'Aries', 'Technology', 'male')


    True Label :  ('34', 'Sagittarius', 'female', 'indUnk')
Predicted Label:  ('34', 'Sagittarius', 'female', 'indUnk')


    True Label :  ('35', 'Aries', 'Technology', 'male')
Predicted Label:  ('35', 'Aries', 'Technology', 'male')


    True Label :  ('34', 'Sagittarius', 'female', 'indUnk')
Predicted Label:  ('female', 'indUnk')


    True Label :  ('35', 'Aries', 'Technology', 'male')
Predicted Label:  ('35', 'Aries', 'Technology', 'male')


