## 1. Load the dataset  

In [0]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
df = pd.read_csv('/content/drive/My Drive/Statistical NLP Project /blog-authorship-corpus.zip')

In [4]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [5]:
df.shape

(681284, 7)

In [0]:
df= df.iloc[0:50000, : ]

In [7]:
df.shape

(50000, 7)

## 2. Preprocess rows of the “text” column  
a. Remove unwanted characters 
b. Convert text to lowercase 
c. Remove unwanted spaces 
d. Remove stopwords  

In [0]:
#Preprocessing data 

import re, string

def clean_str(string):
    
    
    """
    text cleaning using regex
    """
    try:
        string=re.sub(r'^https?:\/\/<>.*[\r\n]*','',string,flags=re.MULTILINE)
        string=re.sub(r"[^A-Za-z]"," ",string)
        words=string.strip().lower().split()
        words=[w for w in words if len(w) >= 1]
        return " ".join(words)
    except:
        return ""

In [0]:
df['text']=df['text'].map(clean_str)

In [10]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info has been found pages and mb of pdf files ...
1,2059027,male,15,Student,Leo,"13,May,2004",these are the team members drewes van der laag...
2,2059027,male,15,Student,Leo,"12,May,2004",in het kader van kernfusie op aarde maak je ei...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks to yahoo s toolbar i can now capture th...


Stopword Removal

In [11]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
from nltk.corpus import stopwords
 
stopwords.words('english')

In [0]:
clean_words = df['text']
 
for text in df:
 
    if text in stopwords.words('english'):
 
        clean_words.remove(text)

In [15]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info has been found pages and mb of pdf files ...
1,2059027,male,15,Student,Leo,"13,May,2004",these are the team members drewes van der laag...
2,2059027,male,15,Student,Leo,"12,May,2004",in het kader van kernfusie op aarde maak je ei...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks to yahoo s toolbar i can now capture th...


## 3.a. Label columns to merge: “gender”, “age”, “topic”, “sign”  

In [0]:
new= df[['age','topic','sign']].copy()

In [0]:
df['labels']= df['gender'].str.cat(new.values.astype(str), sep =", ")

## 3.b. After completing the previous step, there should be only two columns in your data  frame i.e. “text” and “labels”

In [18]:
df_new=df.iloc[:,6:8]
df_new.head()

Unnamed: 0,text,labels
0,info has been found pages and mb of pdf files ...,"male, 15, Student, Leo"
1,these are the team members drewes van der laag...,"male, 15, Student, Leo"
2,in het kader van kernfusie op aarde maak je ei...,"male, 15, Student, Leo"
3,testing testing,"male, 15, Student, Leo"
4,thanks to yahoo s toolbar i can now capture th...,"male, 33, InvestmentBanking, Aquarius"


## 4. Separate features and labels, and split the data into training and testing 

In [0]:
from sklearn.model_selection import train_test_split
# define X and y
X = df_new.text
y = df_new.labels

# split the new DataFrame into training and testing sets [Default test size = 25%]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [20]:
y_test.shape

(12500,)

# 5.Vectorize the features 
a. Create a Bag of Words using count vectorizer
 i. Use ngram_range=(1, 2)

In [0]:
# use CountVectorizer to create document-term matrices from X_train and X_test
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [22]:
X_train_dtm.shape

(37500, 117547)

In [23]:
# show vectorizer options
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [24]:
# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape

(37500, 1954166)

ii. Vectorize training and testing features

In [0]:
# create document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

b. Print the term-document matrix

In [26]:
# create a document-term matrix using TF-IDF
vect = TfidfVectorizer(stop_words='english')
dtm = vect.fit_transform(df_new.text)
features = vect.get_feature_names()
dtm.shape

(50000, 135342)

# 6. Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label.

In [27]:
myDict ={}
for i, j in enumerate(y):
  my_list = j.split(',')
  
  for item in my_list:
      #myDict[item] = my_list.count(item)
      if (item in myDict): 
        myDict[item] += 1
      else: 
         myDict[item] = 1 
    
for key, value in myDict.items(): 
      print ("% s : % d"%(key, value))

male :  25815
 15 :  3508
 Student :  10660
 Leo :  3904
 33 :  1654
 InvestmentBanking :  85
 Aquarius :  4784
female :  24185
 14 :  2043
 indUnk :  17560
 Aries :  7795
 25 :  2837
 Capricorn :  3819
 17 :  6859
 Gemini :  2558
 23 :  5518
 Non-Profit :  491
 Cancer :  4589
 Banking :  283
 37 :  310
 Sagittarius :  4571
 26 :  2869
 24 :  5746
 Scorpio :  3243
 27 :  4094
 Education :  2646
 45 :  93
 Engineering :  1402
 Libra :  4378
 Science :  705
 34 :  1886
 41 :  394
 Communications-Media :  1603
 BusinessServices :  416
 Sports-Recreation :  120
 Virgo :  2827
 Taurus :  3390
 Arts :  1817
 Pisces :  4142
 44 :  38
 16 :  4156
 Internet :  1420
 Museums-Libraries :  285
 Accounting :  364
 39 :  412
 35 :  3365
 Technology :  4379
 36 :  1985
 Law :  308
 46 :  330
 Consulting :  243
 Automotive :  116
 42 :  96
 Religion :  258
 13 :  745
 Fashion :  1805
 38 :  196
 43 :  150
 Publishing :  207
 40 :  192
 Marketing :  414
 LawEnforcement-Security :  125
 HumanResources :

## 7. Transform the labels 

In [0]:
list_class = [] 
for key in myDict.keys(): 
    list_class.append(key) 
list_class_array=np.array(list_class)

In [0]:
# transform to dictionary as Acceptable format of MultiLabelBinarizer
y_train_pass = [set(i.split(',')) for i in y_train]
y_test_pass = [set(i.split(',')) for i in y_test]

In [0]:
 from sklearn.preprocessing import MultiLabelBinarizer
 mlb = MultiLabelBinarizer()

In [31]:

mlb.fit(y_train_pass)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [32]:
len(y_train_pass)

37500

In [33]:
mlb.classes_

array([' 13', ' 14', ' 15', ' 16', ' 17', ' 23', ' 24', ' 25', ' 26',
       ' 27', ' 33', ' 34', ' 35', ' 36', ' 37', ' 38', ' 39', ' 40',
       ' 41', ' 42', ' 43', ' 44', ' 45', ' 46', ' 47', ' 48',
       ' Accounting', ' Advertising', ' Agriculture', ' Aquarius',
       ' Architecture', ' Aries', ' Arts', ' Automotive', ' Banking',
       ' Biotech', ' BusinessServices', ' Cancer', ' Capricorn',
       ' Chemicals', ' Communications-Media', ' Construction',
       ' Consulting', ' Education', ' Engineering', ' Environment',
       ' Fashion', ' Gemini', ' Government', ' HumanResources',
       ' Internet', ' InvestmentBanking', ' Law',
       ' LawEnforcement-Security', ' Leo', ' Libra', ' Manufacturing',
       ' Maritime', ' Marketing', ' Military', ' Museums-Libraries',
       ' Non-Profit', ' Pisces', ' Publishing', ' RealEstate',
       ' Religion', ' Sagittarius', ' Science', ' Scorpio',
       ' Sports-Recreation', ' Student', ' Taurus', ' Technology',
       ' Telecommuni

In [0]:
y_trn_trans = mlb.transform(y_train_pass)

In [0]:
y_test_trans =mlb.transform(y_test_pass)

## 8 a. Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on  every label

In [0]:
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.linear_model import LogisticRegression

clf= LogisticRegression(solver='lbfgs')
clf= OneVsRestClassifier(clf)

# 9. Fit the classifier, make predictions and get the accuracy 
 a. Print the following  i. Accuracy score  ii. F1 score  iii. Average precision score  iv. Average recall score 
 

In [37]:
clf.fit(X_train_dtm,y_trn_trans)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [0]:
Y_preds = clf.predict(X_test_dtm)

In [40]:
#calculate accuracy of class predictions
from sklearn import metrics
# Score CNN model
metrics.accuracy_score(y_test_trans, Y_preds)

0.14088

In [42]:
# Classification report  CNN model
print(metrics.classification_report(y_test_trans, Y_preds))

              precision    recall  f1-score   support

           0       0.73      0.16      0.27       166
           1       0.75      0.23      0.35       497
           2       0.75      0.23      0.35       841
           3       0.77      0.31      0.44      1050
           4       0.70      0.34      0.46      1646
           5       0.70      0.27      0.39      1375
           6       0.78      0.37      0.50      1481
           7       0.49      0.07      0.12       726
           8       0.65      0.13      0.22       711
           9       0.69      0.24      0.35      1051
          10       0.57      0.09      0.16       432
          11       0.92      0.61      0.73       446
          12       0.72      0.24      0.36       878
          13       0.89      0.36      0.51       521
          14       0.36      0.04      0.07        98
          15       1.00      0.04      0.08        48
          16       0.87      0.13      0.22       102
          17       0.33    

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [46]:
print("F1: " , (f1_score(y_test_trans, Y_preds, average='micro')))
print("F1_macro: " , (f1_score(y_test_trans, Y_preds, average='macro')))
print("Recall micro: " , recall_score(y_test_trans, Y_preds, average='micro'))
print("F1_micro: " , (f1_score(y_test_trans, Y_preds, average='micro')))
print("Recall macro: " , recall_score(y_test_trans, Y_preds, average='macro'))
print("Average Precision: " ,(average_precision_score(y_test_trans, Y_preds, average='micro')))
print("Accuracy:" , (accuracy_score(y_test_trans, Y_preds))) 

F1:  0.5190412301976366
F1_macro:  0.2838486567280153
Recall micro:  0.40234
F1_micro:  0.5190412301976366
Recall macro:  0.20021389590225316
Average Precision:  0.3240345401947957
Accuracy: 0.14088


## 10.Print true label and predicted label for any five examples

In [48]:
y_test_pred_inversed = mlb.inverse_transform(Y_preds)
y_test_inversed = mlb.inverse_transform(y_test_trans)
for i in range(15,20):
    print( 'True labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        
        ','.join(y_test_inversed[i]),
        ','.join(y_test_pred_inversed[i])
    ))

True labels:	 26, Sagittarius, Science,female
Predicted labels:	 indUnk,male


True labels:	 24, Chemicals, Libra,male
Predicted labels:	 Student,female


True labels:	 25, Aries, indUnk,male
Predicted labels:	 25,male


True labels:	 23, Pisces, Student,female
Predicted labels:	 23, Pisces, Student,female


True labels:	 17, Aquarius, indUnk,male
Predicted labels:	 indUnk,male


