In [1]:
# Mounting the drive 
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
#importing tensorflow 2 
% tensorflow_version 2.x
import tensorflow as tf
# Importing required Libraries 
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline

TensorFlow 2.x selected.


## Load the dataset

In [0]:
# defining project path
project_path = '/content/drive/My Drive/ANN DL/NLP_project_60/'

In [0]:
# Changing directory location to project path
os.chdir(project_path)

In [5]:
# checking the contents in the project_path 
os.listdir()

['blog-authorship-corpus.zip', 'blogtext.csv', 'SNLP_R9_project1.ipynb.ipynb']

In [0]:
# specifing the zip_path 
zip_path = project_path + 'blog-authorship-corpus.zip'

In [0]:
# Extracting the zip file 
from zipfile import ZipFile
with ZipFile(zip_path , 'r') as z:
  z.extractall()

In [0]:
# reading blogtext.csv 
df_full = pd.read_csv('blogtext.csv')

In [9]:
df_full.shape

(681284, 7)

In [10]:
# checking the shape of blogtext.csv
df_full = df_full.iloc[:100000 , :]
df_full.shape

(100000, 7)

In [11]:
# Fetching the columns from blogtext.csv
df_full.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [12]:
# checking memory consuptiom by blogtext.csv
print ('Aprox df_full occupies ',df_full.memory_usage().sum()/1000000 , 'MB of memory' )

Aprox df_full occupies  5.600128 MB of memory


In [13]:
# checking first 5 entries of df 
df_full.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [14]:
# checking for null values 
print ('NAN values present in dataset :',df_full.isnull().sum().sum())

NAN values present in dataset : 0


In [15]:
# checking unique value of sign column
df_full.sign.unique()

array(['Leo', 'Aquarius', 'Aries', 'Capricorn', 'Gemini', 'Cancer',
       'Sagittarius', 'Scorpio', 'Libra', 'Virgo', 'Taurus', 'Pisces'],
      dtype=object)

In [16]:
# checking unique value of age column
df_full.age.unique()

array([15, 33, 14, 25, 17, 23, 37, 26, 24, 27, 45, 34, 41, 44, 16, 39, 35,
       36, 46, 42, 13, 38, 43, 40, 47, 48])

In [17]:
# checking unique value of topic column
df_full.topic.unique()

array(['Student', 'InvestmentBanking', 'indUnk', 'Non-Profit', 'Banking',
       'Education', 'Engineering', 'Science', 'Communications-Media',
       'BusinessServices', 'Sports-Recreation', 'Arts', 'Internet',
       'Museums-Libraries', 'Accounting', 'Technology', 'Law',
       'Consulting', 'Automotive', 'Religion', 'Fashion', 'Publishing',
       'Marketing', 'LawEnforcement-Security', 'HumanResources',
       'Telecommunications', 'Military', 'Government', 'Transportation',
       'Architecture', 'Advertising', 'Agriculture', 'Biotech',
       'RealEstate', 'Manufacturing', 'Construction', 'Chemicals',
       'Maritime', 'Tourism', 'Environment'], dtype=object)

In [18]:
# checking last 5 values of text column 
df_full.text.tail(5)

99995                THE HINDU - 125 YEARS             ...
99996                DILBERT & IIT-ans                 ...
99997                Case Study : How HP won $3 billion...
99998                Championing Chennai               ...
99999                WEEKEND                         It...
Name: text, dtype: object

In [19]:
# Need to download stop words from nltk 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [20]:
# checking length of stop word 
len(stop_words)

179

## Preprocess rows of the “text” column

In [0]:
#data cleaning
#  text cleaning using regex
import re, string

def clean_str(string):
    

    try:
        string=re.sub(r'^https?:\/\/<>.*[\r\n]*','',string,flags=re.MULTILINE) #Removing Special characters characters
        string=re.sub(r"[^A-Za-z]"," ",string) # Keeping A-Z and a-z removing rest  
        words=string.strip().lower().split() # Converting text to lowercase
        words=[w for w in words if not w in stop_words] #Remove stopwords
        return " ".join(words) # removes space 
    except:
        return ""

In [0]:
# Applying text cleaning function on df
df_full['processed_text'] = df_full['text'].map(clean_str) 

In [23]:
# comparing orignal text with processed text
df_full[['processed_text','text']].head()

Unnamed: 0,processed_text,text
0,info found pages mb pdf files wait untill team...,"Info has been found (+/- 100 pages,..."
1,team members drewes van der laag urllink mail ...,These are the team members: Drewe...
2,het kader van kernfusie op aarde maak je eigen...,In het kader van kernfusie op aarde...
3,testing testing,testing!!! testing!!!
4,thanks yahoo toolbar capture urls popups means...,Thanks to Yahoo!'s Toolbar I can ...


## Label columns to merge: “gender”, “age”, “topic”, “sign”

In [0]:
# Merging the gender , age, topic and sign columns into a new lables column
df_full['labels'] = df_full['gender'] +',' + df_full['age'].astype(str) + ',' + df_full['topic'] + ',' + df_full['sign']

In [25]:
# checking the label column
df_full['labels'].head()

0                   male,15,Student,Leo
1                   male,15,Student,Leo
2                   male,15,Student,Leo
3                   male,15,Student,Leo
4    male,33,InvestmentBanking,Aquarius
Name: labels, dtype: object

In [0]:
# creating new df consisting of processed text and lables 
df = df_full[['processed_text','labels']]

In [27]:
# Checking the head of newly created datset 
df.head()

Unnamed: 0,processed_text,labels
0,info found pages mb pdf files wait untill team...,"male,15,Student,Leo"
1,team members drewes van der laag urllink mail ...,"male,15,Student,Leo"
2,het kader van kernfusie op aarde maak je eigen...,"male,15,Student,Leo"
3,testing testing,"male,15,Student,Leo"
4,thanks yahoo toolbar capture urls popups means...,"male,33,InvestmentBanking,Aquarius"


## Separate features and labels, and split the data into training and testing

In [0]:
X = df['processed_text']
y = df ['labels']

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

## Vectorize the features

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
vectorizer = CountVectorizer(ngram_range=(1, 2),min_df=2)

In [33]:
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=2,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [34]:
X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(80000, 898184)
(20000, 898184)


## Print the term-document matrix

In [35]:
X_test_dtm

<20000x898184 sparse matrix of type '<class 'numpy.int64'>'
	with 2319276 stored elements in Compressed Sparse Row format>

## Create a dictionary to get the count of every label

In [0]:
# Creating empty dictionary 
myDict = dict() # creating empty dict 
#count_vect.vocabulary_

In [37]:
for i, j in enumerate(y):
  my_list = j.split(',')
  
  for item in my_list:
    #myDict[item] = my_list.count(item)
    if (item in myDict): 
        myDict[item] += 1
    else: 
         myDict[item] = 1 
    
for key, value in myDict.items(): 
      print ("% s : % d"%(key, value))
    



male :  53358
15 :  6532
Student :  22122
Leo :  8230
33 :  2835
InvestmentBanking :  244
Aquarius :  9050
female :  46642
14 :  3540
indUnk :  33097
Aries :  10637
25 :  8660
Capricorn :  8723
17 :  12755
Gemini :  9225
23 :  10757
Non-Profit :  1326
Cancer :  9253
Banking :  354
37 :  863
Sagittarius :  7366
26 :  8059
24 :  11814
Scorpio :  7049
27 :  8007
Education :  5553
45 :  906
Engineering :  2332
Libra :  7250
Science :  1090
34 :  2388
41 :  772
Communications-Media :  2830
BusinessServices :  626
Sports-Recreation :  406
Virgo :  7134
Taurus :  8530
Arts :  5031
Pisces :  7553
44 :  76
16 :  8406
Internet :  2251
Museums-Libraries :  308
Accounting :  528
39 :  568
35 :  4720
Technology :  8484
36 :  3045
Law :  360
46 :  914
Consulting :  905
Automotive :  124
42 :  156
Religion :  1081
13 :  1497
Fashion :  1898
38 :  801
43 :  505
Publishing :  1079
40 :  513
Marketing :  726
LawEnforcement-Security :  368
HumanResources :  209
Telecommunications :  165
Military :  798
G

## Transform the labels

In [0]:
list_class = [] 
for key in myDict.keys(): 
    list_class.append(key) 
list_class_array=np.array(list_class)

In [0]:
# transform to dictionary as Acceptable format of MultiLabelBinarizer
y_train_pass = [set(i.split(',')) for i in y_train]
y_test_pass = [set(i.split(',')) for i in y_test]

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer

In [41]:
mlb = MultiLabelBinarizer()
mlb.fit(y_train_pass)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [42]:
len(y_train_pass)

80000

In [43]:
# retriving the lables 
mlb.classes_

array(['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', 'Accounting', 'Advertising', 'Agriculture',
       'Aquarius', 'Architecture', 'Aries', 'Arts', 'Automotive',
       'Banking', 'Biotech', 'BusinessServices', 'Cancer', 'Capricorn',
       'Chemicals', 'Communications-Media', 'Construction', 'Consulting',
       'Education', 'Engineering', 'Environment', 'Fashion', 'Gemini',
       'Government', 'HumanResources', 'Internet', 'InvestmentBanking',
       'Law', 'LawEnforcement-Security', 'Leo', 'Libra', 'Manufacturing',
       'Maritime', 'Marketing', 'Military', 'Museums-Libraries',
       'Non-Profit', 'Pisces', 'Publishing', 'RealEstate', 'Religion',
       'Sagittarius', 'Science', 'Scorpio', 'Sports-Recreation',
       'Student', 'Taurus', 'Technology', 'Telecommunications', 'Tourism',
       'Transportation', 'Virgo', 'female', 'indUnk', 'male'],
      dtype

In [0]:
y_trn_mlb = mlb.transform(y_train_pass)

In [0]:
y_test_mlb =mlb.transform(y_test_pass)

## Choose a classifier

In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [0]:
clf = LogisticRegression()
clf = OneVsRestClassifier(clf)

##  Fit the classifier, make predictions and get the accuracy

In [48]:
# fitting the classifier 
clf.fit(X_train_dtm, y_trn_mlb)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [0]:
# Predicting on test data 
y_pred_class = clf.predict (X_test_dtm)

In [50]:
 #calculate accuracy of class predictions
from sklearn import metrics
# Score CNN model
print ('test set accuracy ',metrics.accuracy_score(y_test_mlb, y_pred_class))

test set accuracy  0.1293


In [51]:
y_pred_class_trn = clf.predict (X_train_dtm)
print ('train set accuracy ',metrics.accuracy_score(y_trn_mlb,y_pred_class_trn ))

train set accuracy  0.7804375


In [52]:
# Classification report  CNN model
print(metrics.classification_report(y_test_mlb, y_pred_class))

              precision    recall  f1-score   support

           0       0.73      0.20      0.32       337
           1       0.75      0.22      0.34       708
           2       0.74      0.29      0.42      1303
           3       0.77      0.37      0.50      1651
           4       0.68      0.31      0.42      2512
           5       0.63      0.21      0.32      2129
           6       0.68      0.30      0.41      2367
           7       0.65      0.22      0.33      1758
           8       0.67      0.25      0.36      1625
           9       0.65      0.24      0.35      1619
          10       0.51      0.10      0.17       539
          11       0.92      0.49      0.64       486
          12       0.75      0.25      0.37       959
          13       0.85      0.37      0.52       663
          14       0.35      0.09      0.14       143
          15       0.83      0.26      0.39       167
          16       0.71      0.17      0.27       120
          17       0.44    

  _warn_prf(average, modifier, msg_start, len(result))


In [0]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [54]:

print("F1: " , (f1_score(y_test_mlb, y_pred_class, average='micro')))
print("F1_macro: " , (f1_score(y_test_mlb, y_pred_class, average='macro')))
print("Recall micro: " , recall_score(y_test_mlb, y_pred_class, average='micro'))
print("F1_micro: " , (f1_score(y_test_mlb, y_pred_class, average='micro')))
print("Recall macro: " , recall_score(y_test_mlb, y_pred_class, average='macro'))
print("Average Precision: " ,(average_precision_score(y_test_mlb, y_pred_class, average='micro')))
print("Accuracy:" , (accuracy_score(y_test_mlb, y_pred_class))) 

F1:  0.5051713435819717
F1_macro:  0.30538949765367895
Recall micro:  0.3916625
F1_micro:  0.5051713435819717
Recall macro:  0.2120484063104803
Average Precision:  0.3090148252939908
Accuracy: 0.1293


## Print true label and predicted label for any five examples

In [55]:
y_test_pred_inversed = mlb.inverse_transform(y_pred_class)
y_test_inversed = mlb.inverse_transform(y_test_mlb)
for i in range(15,20):
    print (i)
    print( 'True labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        
        ','.join(y_test_inversed[i]),
        ','.join(y_test_pred_inversed[i])
    ))

15
True labels:	36,Pisces,Technology,male
Predicted labels:	Pisces,male


16
True labels:	25,Cancer,Non-Profit,male
Predicted labels:	male


17
True labels:	35,Aries,Technology,male
Predicted labels:	male


18
True labels:	25,Gemini,indUnk,male
Predicted labels:	male


19
True labels:	17,Virgo,indUnk,male
Predicted labels:	17,Virgo,indUnk,male


