In [45]:
# Mounting the drive 
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#importing tensorflow 2 
% tensorflow_version 2.x
import tensorflow as tf
# Importing required Libraries 
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline

## Load the dataset

In [0]:
# defining project path
project_path = '/content/drive/My Drive/NLP/'

In [0]:
# Changing directory location to project path
os.chdir(project_path)

In [116]:
# checking the contents in the project_path 
os.listdir()

['blog-authorship-corpus.zip', 'SNLP_R9_project1_TEST.ipynb', 'blogtext.csv']

In [0]:
# specifing the zip_path 
zip_path = project_path + 'blog-authorship-corpus.zip'

In [0]:
# Extracting the zip file 
from zipfile import ZipFile
with ZipFile(zip_path , 'r') as z:
  z.extractall()

In [0]:
# reading blogtext.csv 
df_full = pd.read_csv('blogtext.csv')

In [120]:
# checking the shape of blogtext.csv
df_full = df_full.iloc[:10000 , :]
df_full.shape

(10000, 7)

In [121]:
# Fetching the columns from blogtext.csv
df_full.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [122]:
# checking memory consuptiom by blogtext.csv
print ('Aprox df_full occupies ',df_full.memory_usage().sum()/1000000 , 'MB of memory' )

Aprox df_full occupies  0.560128 MB of memory


In [123]:
# checking first 10 entries of df 
df_full.head(10)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
5,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",I had an interesting conversation...
6,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Somehow Coca-Cola has a way of su...
7,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004","If anything, Korea is a country o..."
8,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Take a read of this news article ...
9,3581210,male,33,InvestmentBanking,Aquarius,"09,June,2004",I surf the English news sites a l...


In [124]:
# checking for null values 
print ('NAN values present in dataset :',df_full.isnull().sum().sum())

NAN values present in dataset : 0


In [125]:
# checking unique value of sign column
df_full.sign.unique()

array(['Leo', 'Aquarius', 'Aries', 'Capricorn', 'Gemini', 'Cancer',
       'Sagittarius', 'Scorpio', 'Libra', 'Virgo', 'Taurus', 'Pisces'],
      dtype=object)

In [126]:
# checking unique value of age column
df_full.age.unique()

array([15, 33, 14, 25, 17, 23, 37, 26, 24, 27, 45, 34, 41, 44, 16, 39, 35,
       36, 46, 42, 13, 38, 43, 40])

In [127]:
# checking unique value of topic column
df_full.topic.unique()

array(['Student', 'InvestmentBanking', 'indUnk', 'Non-Profit', 'Banking',
       'Education', 'Engineering', 'Science', 'Communications-Media',
       'BusinessServices', 'Sports-Recreation', 'Arts', 'Internet',
       'Museums-Libraries', 'Accounting', 'Technology', 'Law',
       'Consulting', 'Automotive', 'Religion', 'Fashion', 'Publishing',
       'Marketing', 'LawEnforcement-Security', 'HumanResources',
       'Telecommunications'], dtype=object)

In [0]:
# checking last 5 values of text column 
df_full.text.tail(5)

In [128]:
# Need to download stop words from nltk 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [129]:
# checking length of stop word 
len(stop_words)

179

In [0]:
#data cleaning
#  text cleaning using regex
import re, string

def clean_str(string):
    

    try:
        string=re.sub(r'^https?:\/\/<>.*[\r\n]*','',string,flags=re.MULTILINE) #Removing Special characters characters
        string=re.sub(r"[^A-Za-z]"," ",string) # Keeping A-Z and a-z removing rest  
        words=string.strip().lower().split() # Converting text to lowercase
        words=[w for w in words if not w in stop_words] #Remove stopwords
        return " ".join(words) # removes space 
    except:
        return ""

In [0]:
# Applying text cleaning function on df
df_full['processed_text'] = df_full['text'].map(clean_str) 

In [132]:
# comparing orignal text with processed text
df_full[['processed_text','text']].head()

Unnamed: 0,processed_text,text
0,info found pages mb pdf files wait untill team...,"Info has been found (+/- 100 pages,..."
1,team members drewes van der laag urllink mail ...,These are the team members: Drewe...
2,het kader van kernfusie op aarde maak je eigen...,In het kader van kernfusie op aarde...
3,testing testing,testing!!! testing!!!
4,thanks yahoo toolbar capture urls popups means...,Thanks to Yahoo!'s Toolbar I can ...


In [0]:
# Merging the gender , age, topic and sign columns into a new lables column
df_full['labels'] = df_full['gender'] +',' + df_full['age'].astype(str) + ',' + df_full['topic'] + ',' + df_full['sign']

In [0]:
# checking the label column
df_full['labels'].head()

In [0]:
# creating new df consisting of processed text and lables 
df = df_full[['processed_text','labels']]

In [135]:
# Checking the head of newly created datset 
df.head()

Unnamed: 0,processed_text,labels
0,info found pages mb pdf files wait untill team...,"male,15,Student,Leo"
1,team members drewes van der laag urllink mail ...,"male,15,Student,Leo"
2,het kader van kernfusie op aarde maak je eigen...,"male,15,Student,Leo"
3,testing testing,"male,15,Student,Leo"
4,thanks yahoo toolbar capture urls popups means...,"male,33,InvestmentBanking,Aquarius"


## Separate features and labels, and split the data into training and testing

In [0]:
X = df['processed_text']
y = df ['labels']

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

## Vectorize the features

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
vectorizer = CountVectorizer(ngram_range=(1, 2),min_df=2)

In [141]:
X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(8000, 80910)
(2000, 80910)


In [142]:
# last 50 features
print(vectorizer.get_feature_names()[-50:])

['zelo', 'zen', 'zenith', 'zeppelin', 'zero', 'zest', 'zeta', 'zeta jones', 'zhadoxero', 'zhang', 'zhao', 'zheng', 'zhentil', 'zhentil keep', 'ziggy', 'ziggy stardust', 'zillion', 'zimbabwe', 'zion', 'zionist', 'zip', 'zip code', 'zipper', 'zipping', 'zipping around', 'zmx', 'zmx dont', 'zmx hate', 'zmx thing', 'zmx think', 'zodiac', 'zodiac forecasts', 'zodiac sign', 'zombie', 'zone', 'zone natural', 'zones', 'zoo', 'zoo animal', 'zoo tv', 'zookx', 'zoomed', 'zooming', 'zooms', 'zooropa', 'zoos', 'zy', 'zzz', 'zzzz', 'zzzzzz']


## Print the term-document matrix

In [143]:
X_test_dtm

<2000x80910 sparse matrix of type '<class 'numpy.int64'>'
	with 151352 stored elements in Compressed Sparse Row format>

In [0]:
# Creating empty dictionary 
y_ch = y.iloc[:10]
myDict = dict() # creating empty dict 


In [145]:
for i, j in enumerate(y):
  my_list = j.split(',')
  
  for item in my_list:
    #myDict[item] = my_list.count(item)
    if (item in myDict): 
        myDict[item] += 1
    else: 
         myDict[item] = 1 
    
for key, value in myDict.items(): 
      print ("% s : % d"%(key, value))
    



male :  5916
15 :  602
Student :  1137
Leo :  301
33 :  136
InvestmentBanking :  70
Aquarius :  571
female :  4084
14 :  212
indUnk :  3287
Aries :  4198
25 :  386
Capricorn :  215
17 :  1185
Gemini :  150
23 :  253
Non-Profit :  71
Cancer :  504
Banking :  16
37 :  33
Sagittarius :  1097
26 :  234
24 :  655
Scorpio :  971
27 :  1054
Education :  270
45 :  16
Engineering :  127
Libra :  491
Science :  63
34 :  553
41 :  20
Communications-Media :  99
BusinessServices :  91
Sports-Recreation :  80
Virgo :  236
Taurus :  812
Arts :  45
Pisces :  454
44 :  3
16 :  440
Internet :  118
Museums-Libraries :  17
Accounting :  4
39 :  79
35 :  2315
Technology :  2654
36 :  1708
Law :  11
46 :  7
Consulting :  21
Automotive :  14
42 :  14
Religion :  9
13 :  42
Fashion :  1622
38 :  46
43 :  6
Publishing :  4
40 :  1
Marketing :  156
LawEnforcement-Security :  10
HumanResources :  2
Telecommunications :  2


## Transform the labels

In [0]:
list_class = [] 
for key in myDict.keys(): 
    list_class.append(key) 
list_class_array=np.array(list_class)

In [0]:
# transform to dictionary as Acceptable format of MultiLabelBinarizer
y_train_pass = [set(i.split(',')) for i in y_train]
y_test_pass = [set(i.split(',')) for i in y_test]

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer

In [149]:
mlb = MultiLabelBinarizer()
mlb.fit(y_train_pass)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [150]:
len(y_train_pass)

8000

In [151]:
# retriving the lables 
mlb.classes_

array(['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '41', '42', '43', '44', '45',
       '46', 'Accounting', 'Aquarius', 'Aries', 'Arts', 'Automotive',
       'Banking', 'BusinessServices', 'Cancer', 'Capricorn',
       'Communications-Media', 'Consulting', 'Education', 'Engineering',
       'Fashion', 'Gemini', 'HumanResources', 'Internet',
       'InvestmentBanking', 'Law', 'LawEnforcement-Security', 'Leo',
       'Libra', 'Marketing', 'Museums-Libraries', 'Non-Profit', 'Pisces',
       'Publishing', 'Religion', 'Sagittarius', 'Science', 'Scorpio',
       'Sports-Recreation', 'Student', 'Taurus', 'Technology',
       'Telecommunications', 'Virgo', 'female', 'indUnk', 'male'],
      dtype=object)

In [0]:
y_trn_mlb = mlb.transform(y_train_pass)

In [153]:
print(y_trn_mlb)

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]


In [154]:
y_test_mlb =mlb.transform(y_test_pass)

  .format(sorted(unknown, key=str)))


In [155]:
print(y_test_mlb)

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 1 1]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 1 ... 1 0 0]
 [0 0 0 ... 0 0 1]]


## Choose a classifier

In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [0]:
clf = LogisticRegression(solver = 'lbfgs',max_iter=10000)
clf = OneVsRestClassifier(clf)

##  Fit the classifier, make predictions and get the accuracy

In [169]:
# fitting the classifier 
clf.fit(X_train_dtm, y_trn_mlb)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=10000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [0]:
# Predicting on test data 
y_pred_class = clf.predict (X_test_dtm)

In [172]:
 #calculate accuracy of class predictions
from sklearn import metrics

metrics.accuracy_score(y_test_mlb, y_pred_class)

0.333

In [173]:
# Classification report  
print(metrics.classification_report(y_test_mlb, y_pred_class))

              precision    recall  f1-score   support

           0       1.00      0.25      0.40        16
           1       0.56      0.12      0.20        42
           2       0.76      0.35      0.48       113
           3       0.73      0.39      0.51        85
           4       0.70      0.34      0.45       217
           5       0.20      0.02      0.04        41
           6       0.67      0.20      0.31       140
           7       0.60      0.11      0.18        84
           8       0.33      0.02      0.04        50
           9       0.72      0.42      0.53       203
          10       0.91      0.29      0.43        35
          11       0.97      0.77      0.86       111
          12       0.76      0.71      0.73       468
          13       0.93      0.53      0.68       352
          14       0.00      0.00      0.00         0
          15       0.67      0.15      0.25        13
          16       1.00      0.08      0.15        12
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [0]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [175]:

print("F1: " , (f1_score(y_test_mlb, y_pred_class, average='micro')))
print("F1_macro: " , (f1_score(y_test_mlb, y_pred_class, average='macro')))
print("Recall micro: " , recall_score(y_test_mlb, y_pred_class, average='micro'))
print("F1_micro: " , (f1_score(y_test_mlb, y_pred_class, average='micro')))
print("Recall macro: " , recall_score(y_test_mlb, y_pred_class, average='macro'))
print("Average Precision: " ,(average_precision_score(y_test_mlb, y_pred_class, average='micro')))
print("Accuracy:" , (accuracy_score(y_test_mlb, y_pred_class))) 

F1:  0.6676754575587261
F1_macro:  0.30297246584068577
Recall micro:  0.5791973996749594
F1_micro:  0.6676754575587261
Recall macro:  0.23686904866948197
Average Precision:  0.48315612151868903
Accuracy: 0.333


  average, "true nor predicted", 'F-score is', len(true_sum)
  _warn_prf(average, modifier, msg_start, len(result))


## Print true label and predicted label for any five examples

In [180]:
y_test_pred_inversed = mlb.inverse_transform(y_pred_class)
y_test_inversed = mlb.inverse_transform(y_test_mlb)
for i in range(120,125):
    print( 'True labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        
        ','.join(y_test_inversed[i]),
        ','.join(y_test_pred_inversed[i])
    ))

True labels:	36,Aries,Fashion,male
Predicted labels:	male


True labels:	34,Sagittarius,female,indUnk
Predicted labels:	34,Sagittarius,female,indUnk


True labels:	38,Internet,Sagittarius,female
Predicted labels:	Sagittarius,male


True labels:	27,Taurus,female,indUnk
Predicted labels:	Taurus,female,indUnk


True labels:	35,Aries,Technology,male
Predicted labels:	35,Aries,Technology,male


