In [1]:
% tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
import tensorflow as tf

In [3]:
tf.__version__

'2.1.0'

In [0]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

In [0]:
path = '/content/drive/My Drive/NLP/'

In [0]:
os.chdir(path)

In [7]:
os.listdir

<function posix.listdir>

In [0]:
zip_path = path + 'blog-authorship-corpus.zip'

In [0]:
from zipfile import ZipFile
with ZipFile(zip_path , 'r') as z:
  z.extractall()

In [10]:
os.listdir()

['blog-authorship-corpus.zip', 'blogtext.csv']

In [0]:
df=pd.read_csv('blogtext.csv')

In [12]:
df.shape

(681284, 7)

In [0]:
df=df.iloc[:100000,:]

In [14]:
df.shape

(100000, 7)

In [16]:
df.head(1)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."


In [17]:
df.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [18]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Data Cleaning

In [0]:
import re, string

def clean_str(string):
    

    try:
        string=re.sub(r'^https?:\/\/<>.*[\r\n]*','',string,flags=re.MULTILINE) 
        string=re.sub(r"[^A-Za-z]"," ",string) 
        words=string.strip().lower().split() 
        words=[w for w in words if not w in stop_words] 
        return " ".join(words) 
    except:
        return ""

In [0]:
df['processed_text'] = df['text'].map(clean_str)

Merging labels

In [0]:
df['labels'] = df['gender'] +',' + df['age'].astype(str) + ',' + df['topic'] + ',' + df['sign']

In [24]:
df['labels'].head()

0                   male,15,Student,Leo
1                   male,15,Student,Leo
2                   male,15,Student,Leo
3                   male,15,Student,Leo
4    male,33,InvestmentBanking,Aquarius
Name: labels, dtype: object

In [0]:
df = df[['processed_text','labels']]

In [29]:
df.head()

Unnamed: 0,processed_text,labels
0,info found pages mb pdf files wait untill team...,"male,15,Student,Leo"
1,team members drewes van der laag urllink mail ...,"male,15,Student,Leo"
2,het kader van kernfusie op aarde maak je eigen...,"male,15,Student,Leo"
3,testing testing,"male,15,Student,Leo"
4,thanks yahoo toolbar capture urls popups means...,"male,33,InvestmentBanking,Aquarius"


Separate features and labels, and split the data into training and testing

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X = df['processed_text']
y = df ['labels']

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=1)

Vectorize the features

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
vectorizer = CountVectorizer(ngram_range=(1, 2),min_df=2)

In [35]:
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=2,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [36]:
X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(80000, 901917)
(20000, 901917)


Create a dictionary to get the count of every label 

In [0]:
y_ch = y.iloc[:10]
myDict = dict()

In [38]:
for i, j in enumerate(y):
  my_list = j.split(',')
  
  for item in my_list:
    if (item in myDict): 
        myDict[item] += 1
    else: 
         myDict[item] = 1 
    
for key, value in myDict.items(): 
      print ("% s : % d"%(key, value))

male :  53358
15 :  6532
Student :  22122
Leo :  8230
33 :  2835
InvestmentBanking :  244
Aquarius :  9050
female :  46642
14 :  3540
indUnk :  33097
Aries :  10637
25 :  8660
Capricorn :  8723
17 :  12755
Gemini :  9225
23 :  10757
Non-Profit :  1326
Cancer :  9253
Banking :  354
37 :  863
Sagittarius :  7366
26 :  8059
24 :  11814
Scorpio :  7049
27 :  8007
Education :  5553
45 :  906
Engineering :  2332
Libra :  7250
Science :  1090
34 :  2388
41 :  772
Communications-Media :  2830
BusinessServices :  626
Sports-Recreation :  406
Virgo :  7134
Taurus :  8530
Arts :  5031
Pisces :  7553
44 :  76
16 :  8406
Internet :  2251
Museums-Libraries :  308
Accounting :  528
39 :  568
35 :  4720
Technology :  8484
36 :  3045
Law :  360
46 :  914
Consulting :  905
Automotive :  124
42 :  156
Religion :  1081
13 :  1497
Fashion :  1898
38 :  801
43 :  505
Publishing :  1079
40 :  513
Marketing :  726
LawEnforcement-Security :  368
HumanResources :  209
Telecommunications :  165
Military :  798
G

 Transform the labels

In [0]:
list_class = [] 
for key in myDict.keys(): 
    list_class.append(key) 
list_class_array=np.array(list_class)

In [0]:
y_train_pass = [set(i.split(',')) for i in y_train]
y_test_pass = [set(i.split(',')) for i in y_test]

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer

In [42]:
mlb = MultiLabelBinarizer()
mlb.fit(y_train_pass)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [43]:
mlb.classes_

array(['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', 'Accounting', 'Advertising', 'Agriculture',
       'Aquarius', 'Architecture', 'Aries', 'Arts', 'Automotive',
       'Banking', 'Biotech', 'BusinessServices', 'Cancer', 'Capricorn',
       'Chemicals', 'Communications-Media', 'Construction', 'Consulting',
       'Education', 'Engineering', 'Environment', 'Fashion', 'Gemini',
       'Government', 'HumanResources', 'Internet', 'InvestmentBanking',
       'Law', 'LawEnforcement-Security', 'Leo', 'Libra', 'Manufacturing',
       'Maritime', 'Marketing', 'Military', 'Museums-Libraries',
       'Non-Profit', 'Pisces', 'Publishing', 'RealEstate', 'Religion',
       'Sagittarius', 'Science', 'Scorpio', 'Sports-Recreation',
       'Student', 'Taurus', 'Technology', 'Telecommunications', 'Tourism',
       'Transportation', 'Virgo', 'female', 'indUnk', 'male'],
      dtype

In [0]:
y_trn_mlb = mlb.transform(y_train_pass)

In [0]:
y_test_mlb =mlb.transform(y_test_pass)

Choose a classifier

In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver = 'lbfgs',max_iter=20)
clf = OneVsRestClassifier(clf)

Fit the classifier, make predictions and get the accuracy

In [58]:
clf.fit(X_train_dtm, y_trn_mlb)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=20,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [0]:
y_pred_class = clf.predict (X_test_dtm)

In [60]:
from sklearn import metrics
metrics.accuracy_score(y_test_mlb, y_pred_class)

0.04965

In [61]:
print(metrics.classification_report(y_test_mlb, y_pred_class))

              precision    recall  f1-score   support

           0       0.67      0.20      0.31       300
           1       0.49      0.10      0.17       730
           2       0.58      0.11      0.18      1329
           3       0.73      0.26      0.38      1732
           4       0.57      0.17      0.26      2511
           5       0.67      0.08      0.14      2180
           6       0.62      0.13      0.22      2386
           7       0.45      0.06      0.10      1685
           8       0.56      0.15      0.24      1538
           9       0.57      0.11      0.19      1637
          10       0.47      0.09      0.15       604
          11       0.89      0.44      0.59       491
          12       0.67      0.19      0.30       958
          13       0.77      0.29      0.42       562
          14       0.31      0.07      0.11       170
          15       0.84      0.25      0.38       154
          16       0.42      0.10      0.16        98
          17       0.28    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [0]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [63]:
print("Accuracy Score: " , (accuracy_score(y_test_mlb, y_pred_class))) 
print("F1 Score: " , (f1_score(y_test_mlb, y_pred_class, average='micro')))
print("Average Precision: " ,(average_precision_score(y_test_mlb, y_pred_class, average='micro')))
print("Average Recall: " , recall_score(y_test_mlb, y_pred_class, average='micro'))


Accuracy Score:  0.04965
F1 Score:  0.3985144829156776
Average Precision:  0.22510443382525094
Average Recall:  0.2843625


Print true label and predicted label for any five examples

In [64]:
y_test_pred_inversed = mlb.inverse_transform(y_pred_class)
y_test_inversed = mlb.inverse_transform(y_test_mlb)
for i in range(15,20):
    print( 'True labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        
        ','.join(y_test_inversed[i]),
        ','.join(y_test_pred_inversed[i])
    ))

True labels:	25,Law,Taurus,male
Predicted labels:	male


True labels:	24,Gemini,Student,male
Predicted labels:	female


True labels:	17,Cancer,indUnk,male
Predicted labels:	male


True labels:	35,Libra,female,indUnk
Predicted labels:	35,female


True labels:	15,Taurus,female,indUnk
Predicted labels:	female


