**1. Load the dataset**

In [0]:
import pandas as pd
import numpy as np

In [264]:
# Mount Google Drive at /gdrive so the dataset path used by the read_csv
# cell below resolves. This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [0]:
# Read the blog corpus into pandas. The path is absolute and assumes Google
# Drive is mounted at /gdrive; change it to wherever blogtext.csv lives.
corpus_df = pd.read_csv('/gdrive/My Drive/Colab Notebooks/NLP/blogtext.csv')

In [266]:
corpus_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [267]:
# DataFrame.size counts every cell (rows x columns); it is not the number of rows.
corpus_df.size

4768988

In [0]:
# Work on the first 5,000 rows only, because the full corpus is too large to
# process here. .loc slices by label and includes both end points, so 0:4999
# yields 5,000 rows with the default integer index produced by read_csv.
# .copy() detaches the sample from corpus_df, so the column edits made in the
# following cells can never touch (or warn about) the original frame.
sample_df = corpus_df.loc[0:4999, :].copy()

In [391]:
sample_df.shape

(5000, 7)

In [392]:
sample_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...


**2. Preprocess rows of the “text” column**

In [393]:
sample_df['text']

0       info found pages mb pdf files wait untill team...
1       team members drewes van der laag urllink mail ...
2       het kader van kernfusie op aarde maak je eigen...
3                                         testing testing
4       thanks yahoos toolbar capture urls popupswhich...
                              ...                        
4995    another one dreams last night except time leas...
4996    mmm strawberry tea breakfast tomorrow think de...
4997    yay new layout yeah know need get complicated ...
4998    ok lied fed isnt playing friday night underdog...
4999    well today went church talked music director t...
Name: text, Length: 5000, dtype: object

**A) REMOVE UNWANTED CHARACTERS**

In [0]:
import re
import unicodedata

In [0]:
def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, whitespace or a digit.

    Args:
        text: The string to clean.
        remove_digits: When False (the default) the digits 0-9 are kept;
            when True they are removed together with the punctuation.

    Returns:
        A copy of ``text`` with the disallowed characters deleted. Nothing is
        substituted in their place, so surrounding whitespace is preserved.
    """
    # The upper-case range has to be ``A-Z``. ``A-z`` additionally spans
    # ``[ \ ] ^ _`` and the backtick, which would survive the clean-up.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [0]:
# Clean every post. Digits are stripped along with the punctuation, which is
# requested explicitly through the keyword that Series.apply forwards.
sample_df['text'] = sample_df['text'].apply(remove_special_characters, remove_digits=True)

In [397]:
sample_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...


**B) Convert text to lowercase**

In [0]:
# Lower-case every post through pandas' vectorised string accessor.
sample_df['text'] = sample_df['text'].str.lower()

In [399]:
sample_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...


**C) Remove unwanted space**

In [0]:
 # Replace each run of carriage returns / line feeds with one space, then
 # squeeze runs of spaces down to a single space. The character class is
 # [\r\n] only: inside [...] a '|' is a literal pipe, not an alternation,
 # so the previous pattern also matched '|' characters.
sample_df['text'] = (
    sample_df['text']
    .str.replace(r'[\r\n]+', ' ', regex=True)
    .str.replace(r' +', ' ', regex=True)
)

In [401]:
sample_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...


**D.) Remove stopwords** 

In [402]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the comprehension reloads the list for every single word and then scans it
# linearly; a set built up front gives the same result with O(1) membership tests.
english_stopwords = set(stopwords.words('english'))
sample_df['text'] = sample_df['text'].apply(
    lambda post: ' '.join(word for word in post.split() if word not in english_stopwords)
)


In [404]:
sample_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...


3. As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence 

**A)  Label columns to merge: “gender”, “age”, “topic”, “sign”** 

In [0]:
# MultiLabelBinarizer is applied to the merged labels later on, so the numeric
# age is converted to text to match the other (string) label columns.
sample_df['age'] = sample_df['age'].map(str)

In [0]:
# Collapse the four label columns into one list of labels per row.
label_columns = ['gender', 'age', 'topic', 'sign']
sample_df['labels'] = sample_df[label_columns].to_numpy().tolist()

In [407]:
sample_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,labels
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing,"[male, 15, Student, Leo]"
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...,"[male, 33, InvestmentBanking, Aquarius]"


b.) After completing the previous step, there should be only two columns in your data frame i.e. “text” and “labels” 

In [0]:
# Drop the individual label columns and the unused metadata; what remains
# is the cleaned text and the merged labels.
sample_df_mod = sample_df.drop(columns=['gender', 'age', 'sign', 'topic', 'id', 'date'])

In [409]:
sample_df_mod.head()

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoos toolbar capture urls popupswhich...,"[male, 33, InvestmentBanking, Aquarius]"


In [0]:
# Store each row's labels as a tuple; these per-row collections are what
# MultiLabelBinarizer is fitted on further down.
sample_df_mod['labels'] = sample_df_mod['labels'].map(tuple)

In [411]:
sample_df_mod.head()

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"(male, 15, Student, Leo)"
1,team members drewes van der laag urllink mail ...,"(male, 15, Student, Leo)"
2,het kader van kernfusie op aarde maak je eigen...,"(male, 15, Student, Leo)"
3,testing testing,"(male, 15, Student, Leo)"
4,thanks yahoos toolbar capture urls popupswhich...,"(male, 33, InvestmentBanking, Aquarius)"


**4. Separate features and labels, and split the data into training and testing** 

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Features are the cleaned posts and targets are the label tuples. Split them
# into training and test sets with a fixed seed so the split is repeatable.
X = sample_df_mod['text'].to_numpy()
Y = sample_df_mod['labels'].to_numpy()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=2)

In [414]:
# Training data: number of documents and number of label tuples (must match)
print(X_train.shape)
print(Y_train.shape)

(3750,)
(3750,)


In [415]:
# Test data: number of documents and number of label tuples (must match)
print(X_test.shape)
print(Y_test.shape)

(1250,)
(1250,)


**5. Vectorize the features**

 **a. Create a Bag of Words using count vectorizer**

**i.) Use n_gram(1,2)**

In [0]:
# Import CountVectorizer and configure it to emit both unigrams and bigrams
# (ngram_range=(1, 2)); every other parameter keeps its default.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1, 2)) # unigrams + bigrams

In [417]:
# Learn the vocabulary from the training documents only
vect.fit(X_train)
# Check the vocabulary size
len(vect.vocabulary_)

261460

**ii) Vectorize training and testing features**

In [0]:
X_train_ct = vect.transform(X_train) #This creates the document term matrix

In [419]:
#Size of Document Term Matrix
X_train_ct.shape

(3750, 261460)

In [0]:
X_test_ct = vect.transform(X_test)

In [421]:
X_test_ct.shape

(1250, 261460)

**b). Print the term-document matrix**

In [422]:
#the first record
# The data will be stored in terms of a sparse matrix
X_train_ct[0]

<1x261460 sparse matrix of type '<class 'numpy.int64'>'
	with 113 stored elements in Compressed Sparse Row format>

In [423]:
print(X_train_ct[0]) 

  (0, 7750)	1
  (0, 7753)	1
  (0, 10141)	1
  (0, 10190)	1
  (0, 10224)	1
  (0, 10228)	1
  (0, 12361)	1
  (0, 12364)	1
  (0, 20900)	1
  (0, 20978)	1
  (0, 29805)	2
  (0, 29842)	1
  (0, 29992)	1
  (0, 40849)	2
  (0, 40855)	1
  (0, 40856)	1
  (0, 41914)	1
  (0, 41948)	1
  (0, 47263)	1
  (0, 55315)	1
  (0, 55316)	1
  (0, 59179)	1
  (0, 59265)	1
  (0, 68376)	2
  (0, 68514)	1
  :	:
  (0, 200726)	1
  (0, 207182)	1
  (0, 207226)	1
  (0, 207480)	1
  (0, 207635)	1
  (0, 210534)	1
  (0, 210537)	1
  (0, 210901)	1
  (0, 210902)	1
  (0, 225067)	1
  (0, 225128)	1
  (0, 227929)	1
  (0, 228178)	1
  (0, 234060)	1
  (0, 234061)	1
  (0, 235873)	1
  (0, 235882)	1
  (0, 238268)	1
  (0, 238317)	1
  (0, 239925)	1
  (0, 240632)	1
  (0, 248630)	1
  (0, 248650)	1
  (0, 260536)	1
  (0, 260582)	1


**6. Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label.**

In [0]:
dict_labels={}

In [0]:
# Count how often each value occurs in every label column and merge the
# per-column counts into the single label -> count dictionary.
for label_column in ['age', 'gender', 'topic', 'sign']:
    column_counts = sample_df[label_column].value_counts()
    dict_labels.update(dict(column_counts))

In [426]:
dict_labels

{'14': 170,
 '15': 339,
 '16': 67,
 '17': 331,
 '23': 137,
 '24': 353,
 '25': 268,
 '26': 96,
 '27': 86,
 '33': 101,
 '34': 540,
 '35': 2307,
 '36': 60,
 '37': 19,
 '39': 79,
 '41': 14,
 '42': 9,
 '44': 3,
 '45': 14,
 '46': 7,
 'Accounting': 2,
 'Aquarius': 329,
 'Aries': 2483,
 'Arts': 31,
 'Automotive': 14,
 'Banking': 16,
 'BusinessServices': 87,
 'Cancer': 94,
 'Capricorn': 84,
 'Communications-Media': 61,
 'Consulting': 16,
 'Education': 118,
 'Engineering': 119,
 'Gemini': 86,
 'Internet': 20,
 'InvestmentBanking': 70,
 'Law': 3,
 'Leo': 190,
 'Libra': 414,
 'Museums-Libraries': 2,
 'Non-Profit': 47,
 'Pisces': 67,
 'Religion': 4,
 'Sagittarius': 704,
 'Science': 33,
 'Scorpio': 408,
 'Sports-Recreation': 75,
 'Student': 569,
 'Taurus': 100,
 'Technology': 2332,
 'Virgo': 41,
 'female': 1706,
 'indUnk': 1381,
 'male': 3294}

**7.  Transform the labels:**

   **a. Convert your train and test labels using MultiLabelBinarizer** 

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [428]:
mlb.fit(Y_train)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [429]:
mlb.classes_

array(['14', '15', '16', '17', '23', '24', '25', '26', '27', '33', '34',
       '35', '36', '37', '39', '41', '42', '44', '45', '46', 'Accounting',
       'Aquarius', 'Aries', 'Arts', 'Automotive', 'Banking',
       'BusinessServices', 'Cancer', 'Capricorn', 'Communications-Media',
       'Consulting', 'Education', 'Engineering', 'Gemini', 'Internet',
       'InvestmentBanking', 'Law', 'Leo', 'Libra', 'Museums-Libraries',
       'Non-Profit', 'Pisces', 'Religion', 'Sagittarius', 'Science',
       'Scorpio', 'Sports-Recreation', 'Student', 'Taurus', 'Technology',
       'Virgo', 'female', 'indUnk', 'male'], dtype=object)

In [0]:
Y_train_mlb=mlb.transform(Y_train)

In [431]:
Y_train_mlb

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [0]:
Y_test_mlb=mlb.transform(Y_test)

In [433]:
Y_test_mlb

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0]])

In [434]:
Y_test_mlb.shape

(1250, 54)

**8.  Choose a classifier**

**Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on every label** 

In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [0]:
# One independent binary logistic regression per label (one-vs-rest),
# using the lbfgs solver with max_iter set to 1000.
clf = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000))

**9.  Fit the classifier, make predictions and get the accuracy**

In [437]:
# fit the model with data (occurs in-place)
clf.fit(X_train_ct, Y_train_mlb)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [438]:
from sklearn import metrics
# Accuracy on the training data. For a multilabel indicator matrix,
# accuracy_score is subset accuracy: a row counts as correct only when
# every one of its labels is predicted exactly.
metrics.accuracy_score(Y_train_mlb, clf.predict(X_train_ct))

0.9768

In [439]:
Y_predict=clf.predict(X_test_ct)
Y_predict

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0]])

**i. Accuracy score** 

In [440]:
from sklearn import metrics
# Accuracy on the test data. For a multilabel indicator matrix this is
# subset accuracy: a row counts as correct only when every one of its
# labels is predicted exactly.
metrics.accuracy_score(Y_test_mlb, Y_predict)

0.5296

**ii) F1 score**

In [441]:
from sklearn.metrics import f1_score
# Report F1 on the test predictions under each averaging scheme:
# macro, micro and weighted.
for caption, averaging in (
    ('F1_score_macro', 'macro'),
    ('F1_score_micro', 'micro'),
    ('F1_score_weighted', 'weighted'),
):
    print(caption, f1_score(Y_test_mlb, Y_predict, average=averaging))

F1_score_macro 0.24919729374885535
F1_score_micro 0.7393818100956193
F1_score_weighted 0.6894989962856233


  average, "true nor predicted", 'F-score is', len(true_sum)


In [0]:
from sklearn.metrics import confusion_matrix,precision_score,recall_score

**iii). Average precision score**

In [443]:
# average=None returns one precision value per label (columns follow
# mlb.classes_) rather than a single averaged score. Labels that are never
# predicted are reported as 0 and raise the undefined-metric warning below.
precision_score(Y_test_mlb,Y_predict,average=None)

  _warn_prf(average, modifier, msg_start, len(result))


array([0.57142857, 0.82352941, 0.        , 0.92307692, 0.        ,
       0.93939394, 0.35714286, 0.        , 0.5       , 0.8       ,
       0.96521739, 0.82985554, 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.71428571, 0.81268882, 0.        , 0.        ,
       0.        , 0.85714286, 1.        , 1.        , 0.        ,
       0.        , 0.5       , 0.875     , 0.        , 0.        ,
       0.71428571, 0.        , 1.        , 0.79166667, 0.        ,
       0.        , 0.        , 0.        , 0.94262295, 0.        ,
       0.81818182, 1.        , 0.76923077, 0.        , 0.82389937,
       0.        , 0.87134503, 0.79032258, 0.83700441])

**iv) Average recall score**

In [444]:
# average=None returns one recall value per label (columns follow
# mlb.classes_) rather than a single averaged score. Labels with no true
# samples in the test set are reported as 0 and raise the warning below.
recall_score(Y_test_mlb,Y_predict,average=None)

  _warn_prf(average, modifier, msg_start, len(result))


array([0.08163265, 0.35897436, 0.        , 0.3       , 0.        ,
       0.36046512, 0.09615385, 0.        , 0.08      , 0.28571429,
       0.75      , 0.91504425, 0.07142857, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.24390244, 0.87337662, 0.        , 0.        ,
       0.        , 0.26086957, 0.23529412, 0.36363636, 0.        ,
       0.        , 0.08571429, 0.22580645, 0.        , 0.        ,
       0.27777778, 0.        , 0.10909091, 0.35185185, 0.        ,
       0.        , 0.        , 0.        , 0.60526316, 0.        ,
       0.17821782, 0.4       , 0.2739726 , 0.        , 0.92091388,
       0.        , 0.66816143, 0.55681818, 0.94527363])

10.  Print true label and predicted label for any five examples 

In [445]:
print(Y_test_mlb[0:5])

[[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]]


In [446]:
print(Y_predict[0:5])

[[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]]


Decoding the binarized rows back to the original label names

In [447]:
#original test data for first 5 records
mlb.inverse_transform(Y_test_mlb[0:5])

[('35', 'Aries', 'Technology', 'male'),
 ('34', 'Sagittarius', 'female', 'indUnk'),
 ('35', 'Aries', 'Technology', 'male'),
 ('34', 'Sagittarius', 'female', 'indUnk'),
 ('35', 'Aries', 'Technology', 'male')]

In [448]:
#Predicted values for the test data
mlb.inverse_transform(Y_predict[0:5])

[('35', 'Aries', 'Technology', 'male'),
 ('34', 'Sagittarius', 'female', 'indUnk'),
 ('35', 'Aries', 'Technology', 'male'),
 ('34', 'Sagittarius', 'female', 'indUnk'),
 ('35', 'Aries', 'Technology', 'male')]