In [52]:
# Mounting the drive 
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#importing tensorflow 2 
% tensorflow_version 2.x
import tensorflow as tf
# Importing required Libraries 
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline

## Load the dataset

In [0]:
# Setting project path
project_path = '/content/drive/My Drive/Deep Learning/R9_SNLP_Project/ANN -Statistical NLP/'

In [0]:
# setting default path as project path
os.chdir(project_path)

In [112]:
# checking the contents in the project_path 
os.listdir()

['blog-authorship-corpus.zip', 'blogtext.csv', 'SNLP_R9_project1.ipynb']

In [0]:
zip_path = project_path + 'blog-authorship-corpus.zip'

In [0]:
# Extracting the zip file 
from zipfile import ZipFile
with ZipFile(zip_path , 'r') as z:
  z.extractall()

In [0]:
# reading blogtext.csv 
df_full = pd.read_csv('blogtext.csv')

In [116]:
df_full.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [117]:
#shape of blogtext.csv
df_full = df_full.iloc[:300000 , :]
df_full.shape

(300000, 7)

In [118]:
# columns of blogtext.csv
df_full.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [119]:
# checking memory consuptiom by blogtext.csv
print ('memory consumption ',df_full.memory_usage().sum()/1000000 , 'mb' )

memory consumption  16.800128 mb


In [120]:
df_full.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [121]:
df_full.isnull().sum() # No Null values

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [122]:
# checking unique astrological sign
df_full.sign.unique()

array(['Leo', 'Aquarius', 'Aries', 'Capricorn', 'Gemini', 'Cancer',
       'Sagittarius', 'Scorpio', 'Libra', 'Virgo', 'Taurus', 'Pisces'],
      dtype=object)

In [123]:
# checking age column range
df_full.age.unique()

array([15, 33, 14, 25, 17, 23, 37, 26, 24, 27, 45, 34, 41, 44, 16, 39, 35,
       36, 46, 42, 13, 38, 43, 40, 47, 48])

In [124]:
# checking unique value of topic column
df_full.topic.unique()

array(['Student', 'InvestmentBanking', 'indUnk', 'Non-Profit', 'Banking',
       'Education', 'Engineering', 'Science', 'Communications-Media',
       'BusinessServices', 'Sports-Recreation', 'Arts', 'Internet',
       'Museums-Libraries', 'Accounting', 'Technology', 'Law',
       'Consulting', 'Automotive', 'Religion', 'Fashion', 'Publishing',
       'Marketing', 'LawEnforcement-Security', 'HumanResources',
       'Telecommunications', 'Military', 'Government', 'Transportation',
       'Architecture', 'Advertising', 'Agriculture', 'Biotech',
       'RealEstate', 'Manufacturing', 'Construction', 'Chemicals',
       'Maritime', 'Tourism', 'Environment'], dtype=object)

In [125]:
df_full.text.tail(5)

299995             meantime girl   Sheâ€™s the one you cal...
299996               When someone breaks your heart, you...
299997               Love can't do anything for you.  Lo...
299998           I may have found the abstract side of m...
299999           Before I continue, who is TonyBlair??? ...
Name: text, dtype: object

In [126]:
# downloading nltk stop words
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [127]:
len(stop_words)

179

In [0]:
#data preprocessing , cleaning
import re, string

def clean_str(string):
    try:
        string=re.sub(r'^https?:\/\/<>.*[\r\n]*','',string,flags=re.MULTILINE) #Removing Special characters characters
        string=re.sub(r"[^A-Za-z]"," ",string) # Keeping A-Z and a-z removing rest  
        words=string.strip().lower().split() # Converting text to lowercase
        words=[w for w in words if not w in stop_words] #Remove stopwords
        return " ".join(words) # removes space 
    except:
        return ""

In [0]:
# Applying text cleaning function on df
df_full['processed_text'] = df_full['text'].map(clean_str) 

In [130]:
# comparing orignal text with processed text
df_full[['processed_text','text']].head()

Unnamed: 0,processed_text,text
0,info found pages mb pdf files wait untill team...,"Info has been found (+/- 100 pages,..."
1,team members drewes van der laag urllink mail ...,These are the team members: Drewe...
2,het kader van kernfusie op aarde maak je eigen...,In het kader van kernfusie op aarde...
3,testing testing,testing!!! testing!!!
4,thanks yahoo toolbar capture urls popups means...,Thanks to Yahoo!'s Toolbar I can ...


In [0]:
# Merging the gender , age, topic and sign columns into a new lables column
df_full['labels'] = df_full['gender'] +',' + df_full['age'].astype(str) + ',' + df_full['topic'] + ',' + df_full['sign']

In [132]:
df_full['labels'].head()

0                   male,15,Student,Leo
1                   male,15,Student,Leo
2                   male,15,Student,Leo
3                   male,15,Student,Leo
4    male,33,InvestmentBanking,Aquarius
Name: labels, dtype: object

In [0]:
# creating new df consisting of processed text and lables
df = df_full[['processed_text','labels']]

In [134]:
df.head()

Unnamed: 0,processed_text,labels
0,info found pages mb pdf files wait untill team...,"male,15,Student,Leo"
1,team members drewes van der laag urllink mail ...,"male,15,Student,Leo"
2,het kader van kernfusie op aarde maak je eigen...,"male,15,Student,Leo"
3,testing testing,"male,15,Student,Leo"
4,thanks yahoo toolbar capture urls popups means...,"male,33,InvestmentBanking,Aquarius"


## 4. Separate features and labels, and split the data into training and testing

In [0]:
X = df['processed_text']
y = df ['labels']

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

## Vectorize the features

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
vectorizer = CountVectorizer(ngram_range=(1, 2),min_df=2) # Creating bag of words.  

In [140]:
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=2,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [141]:
X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(240000, 2524255)
(60000, 2524255)


In [142]:
# last 50 features
print(vectorizer.get_feature_names()[20:])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Print the term-document matrix

In [143]:
X_test_dtm

<60000x2524255 sparse matrix of type '<class 'numpy.int64'>'
	with 7618190 stored elements in Compressed Sparse Row format>

In [0]:
# Creating empty dictionary 
y_ch = y.iloc[:10]
myDict = dict() # creating empty dict 


In [145]:
for i, j in enumerate(y):
  my_list = j.split(',')
  
  for item in my_list:
    #myDict[item] = my_list.count(item)
    if (item in myDict): 
        myDict[item] += 1
    else: 
         myDict[item] = 1 
    
for key, value in myDict.items(): 
      print ("% s : % d"%(key, value))
    



male :  158404
15 :  17320
Student :  61869
Leo :  27437
33 :  7104
InvestmentBanking :  558
Aquarius :  22603
female :  141596
14 :  11168
indUnk :  110731
Aries :  28651
25 :  26747
Capricorn :  23121
17 :  35558
Gemini :  24647
23 :  29292
Non-Profit :  7333
Cancer :  26205
Banking :  933
37 :  4328
Sagittarius :  21951
26 :  25733
24 :  37988
Scorpio :  23529
27 :  20632
Education :  15414
45 :  1764
Engineering :  6771
Libra :  24739
Science :  3109
34 :  11195
41 :  1441
Communications-Media :  9200
BusinessServices :  1909
Sports-Recreation :  1086
Virgo :  27403
Taurus :  27156
Arts :  14232
Pisces :  22558
44 :  1035
16 :  28702
Internet :  7271
Museums-Libraries :  1294
Accounting :  908
39 :  2969
35 :  10017
Technology :  22496
36 :  7460
Law :  2302
46 :  1105
Consulting :  2988
Automotive :  430
42 :  1462
Religion :  2181
13 :  6687
Fashion :  2772
38 :  3841
43 :  2426
Publishing :  2988
40 :  2179
Marketing :  2637
LawEnforcement-Security :  906
HumanResources :  900
T

## Transform the labels

In [0]:
list_class = [] 
for key in myDict.keys(): 
    list_class.append(key) 
list_class_array=np.array(list_class)

In [0]:
# transform to dictionary as Acceptable format of MultiLabelBinarizer
y_train_pass = [set(i.split(',')) for i in y_train]
y_test_pass = [set(i.split(',')) for i in y_test]

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer

In [149]:
mlb = MultiLabelBinarizer()
mlb.fit(y_train_pass)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [150]:
len(y_train_pass)

240000

In [151]:
# retriving the lables 
mlb.classes_

array(['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', 'Accounting', 'Advertising', 'Agriculture',
       'Aquarius', 'Architecture', 'Aries', 'Arts', 'Automotive',
       'Banking', 'Biotech', 'BusinessServices', 'Cancer', 'Capricorn',
       'Chemicals', 'Communications-Media', 'Construction', 'Consulting',
       'Education', 'Engineering', 'Environment', 'Fashion', 'Gemini',
       'Government', 'HumanResources', 'Internet', 'InvestmentBanking',
       'Law', 'LawEnforcement-Security', 'Leo', 'Libra', 'Manufacturing',
       'Maritime', 'Marketing', 'Military', 'Museums-Libraries',
       'Non-Profit', 'Pisces', 'Publishing', 'RealEstate', 'Religion',
       'Sagittarius', 'Science', 'Scorpio', 'Sports-Recreation',
       'Student', 'Taurus', 'Technology', 'Telecommunications', 'Tourism',
       'Transportation', 'Virgo', 'female', 'indUnk', 'male'],
      dtype

In [0]:
y_trn_mlb = mlb.transform(y_train_pass)

In [0]:
y_test_mlb =mlb.transform(y_test_pass)

## Choose a classifier

In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [0]:
clf = LogisticRegression(solver='lbfgs')
clf = OneVsRestClassifier(clf)

##  Fit the classifier, make predictions and get the accuracy

In [156]:
# fitting the classifier 
clf.fit(X_train_dtm, y_trn_mlb) 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [0]:
# Predicting on test data 
y_pred_class = clf.predict (X_test_dtm)

In [158]:
 #calculate accuracy of class predictions
from sklearn import metrics
# Score CNN model
metrics.accuracy_score(y_test_mlb, y_pred_class)

0.07916666666666666

In [159]:
# Classification report  CNN model
print(metrics.classification_report(y_test_mlb, y_pred_class))

              precision    recall  f1-score   support

           0       0.75      0.28      0.41      1328
           1       0.66      0.25      0.36      2210
           2       0.62      0.25      0.35      3484
           3       0.63      0.28      0.39      5761
           4       0.61      0.30      0.40      7135
           5       0.50      0.18      0.27      5824
           6       0.55      0.22      0.31      7609
           7       0.56      0.20      0.29      5314
           8       0.57      0.17      0.27      5116
           9       0.55      0.17      0.26      4135
          10       0.51      0.10      0.17      1421
          11       0.73      0.27      0.39      2239
          12       0.76      0.23      0.36      2024
          13       0.76      0.22      0.35      1493
          14       0.71      0.20      0.31       863
          15       0.69      0.16      0.25       769
          16       0.50      0.06      0.11       554
          17       0.43    

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [161]:

print("F1: " , (f1_score(y_test_mlb, y_pred_class, average='micro')))
print("F1_macro: " , (f1_score(y_test_mlb, y_pred_class, average='macro')))
print("Recall micro: " , recall_score(y_test_mlb, y_pred_class, average='micro'))
print("F1_micro: " , (f1_score(y_test_mlb, y_pred_class, average='micro')))
print("Recall macro: " , recall_score(y_test_mlb, y_pred_class, average='macro'))
print("Average Precision: " ,(average_precision_score(y_test_mlb, y_pred_class, average='micro')))
print("Accuracy:" , (accuracy_score(y_test_mlb, y_pred_class))) 

F1:  0.447935027435193
F1_macro:  0.25218825453484633
Recall micro:  0.3467791666666667
F1_micro:  0.447935027435193
Recall macro:  0.1726756957194821
Average Precision:  0.2519675141951552
Accuracy: 0.07916666666666666


## Print true label and predicted label for any five examples

In [162]:
y_test_pred_inversed = mlb.inverse_transform(y_pred_class)
y_test_inversed = mlb.inverse_transform(y_test_mlb)
for i in range(15,20):
    print( 'True labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        
        ','.join(y_test_inversed[i]),
        ','.join(y_test_pred_inversed[i])
    ))

True labels:	34,Aries,indUnk,male
Predicted labels:	34,Aries,indUnk,male


True labels:	15,Scorpio,indUnk,male
Predicted labels:	indUnk,male


True labels:	23,Aquarius,Student,female
Predicted labels:	female


True labels:	26,Aquarius,Law,male
Predicted labels:	indUnk,male


True labels:	15,Virgo,female,indUnk
Predicted labels:	Virgo,male


