In [0]:
# Importing the necessary packages

from zipfile import ZipFile

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

import warnings
warnings.filterwarnings("ignore")

import nltk
import re
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.metrics import confusion_matrix, classification_report,f1_score, accuracy_score, recall_score, precision_score

In [2]:
# Mounting the google drive

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Defining the drive project path
project_path = "/content/gdrive/My Drive/Colab Notebooks/Assignments/NLP/"

In [0]:
# Changing the current directory
os.chdir(project_path)

# 1. Load the dataset (5 points)
Tip: As the dataset is large, use fewer rows. Check what is working well on your machine and decide accordingly.

In [0]:
# Extracting the zip file

with ZipFile(project_path + "blog-authorship-corpus.zip") as zip_file:
  zip_file.extractall()

In [6]:
# Reading the data

blogs_df = pd.read_csv("blogtext.csv")
blogs_df.shape

(681284, 7)

In [7]:
# Using first 10,000 rows of the data

blogs_df_cropped = blogs_df[0:1000]
blogs_df_cropped.shape

(1000, 7)

In [8]:
blogs_df_cropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
id        1000 non-null int64
gender    1000 non-null object
age       1000 non-null int64
topic     1000 non-null object
sign      1000 non-null object
date      1000 non-null object
text      1000 non-null object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


# 2. Preprocess rows of the “text” column (7.5 points)
a. Remove unwanted characters, b. Convert text to lowercase, c. Remove unwanted spaces, d. Remove stopwords

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
stop_words = set(stopwords.words('english'))

In [0]:
for i in range(blogs_df_cropped.shape[0]):
  blogs_df_cropped['text'][i] = re.sub("[^a-zA-Z]", " ", blogs_df_cropped['text'][i])   # Removing unwanted characters
  blogs_df_cropped['text'][i] = blogs_df_cropped['text'][i].lower()   # Lowercase characters
  blogs_df_cropped['text'][i] =  blogs_df_cropped['text'][i].strip()   # Removing unwanted spaces

In [0]:
# Splitting the sentence into words to remove stopwords
blogs_df_cropped['text'] = blogs_df_cropped['text'].apply( lambda t : " ".join(word for word in t.split() if word not in stop_words)) 

In [13]:
blogs_df_cropped.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoo toolbar capture urls popups means...


# 3. As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence (7.5 points)
a. Label columns to merge: “gender”, “age”, “topic”, “sign”          
b. After completing the previous step, there should be only two columns in your dataframe i.e. “text” and “labels” as shown in the below image

In [14]:
blogs_df_merged = pd.DataFrame(columns = ["text", "labels"])
blogs_df_cropped['age'] = blogs_df_cropped['age'].astype(str)
blogs_df_merged["labels"] = blogs_df_cropped[['gender','age','topic','sign']].apply(lambda x: ','.join(x), axis = 1) 
blogs_df_merged["text"] = blogs_df_cropped["text"]
blogs_df_merged.head()

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"male,15,Student,Leo"
1,team members drewes van der laag urllink mail ...,"male,15,Student,Leo"
2,het kader van kernfusie op aarde maak je eigen...,"male,15,Student,Leo"
3,testing testing,"male,15,Student,Leo"
4,thanks yahoo toolbar capture urls popups means...,"male,33,InvestmentBanking,Aquarius"


# 4. Separate features and labels, and split the data into training and testing (5 points)

In [0]:
features = blogs_df_merged["text"]
labels = blogs_df_merged["labels"].str.lower()
X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size = 0.3, random_state = 47)

# 5. Vectorize the features (5 points)
a. Create a Bag of Words using count vectorizer: i. Use ngram_range=(1, 2) and ii. Vectorize training and testing features     
b. Print the term-document matrix

In [16]:
# Count Vectorizer

vectorizer_model = CountVectorizer(ngram_range = (1, 2), stop_words = "english")

train_data_features = vectorizer_model.fit_transform(X_train)
test_data_features = vectorizer_model.transform(X_test)

train_data_features = train_data_features.toarray()
test_data_features = test_data_features.toarray()

# Document term matrix
print(pd.DataFrame(train_data_features))

     0      1      2      3      4      ...  91064  91065  91066  91067  91068
0        0      0      0      0      0  ...      0      0      0      0      0
1        0      0      0      0      0  ...      0      0      0      0      0
2        0      0      0      0      0  ...      0      0      0      0      0
3        0      0      0      0      0  ...      0      0      0      0      0
4        0      0      0      0      0  ...      0      0      0      0      0
..     ...    ...    ...    ...    ...  ...    ...    ...    ...    ...    ...
695      0      0      0      0      0  ...      0      0      0      0      0
696      0      0      0      0      0  ...      0      0      0      0      0
697      0      0      0      0      0  ...      0      0      0      0      0
698      0      0      0      0      0  ...      0      0      0      0      0
699      0      0      0      0      0  ...      0      0      0      0      0

[700 rows x 91069 columns]


# 6. Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label. Check below image for reference (5 points)

In [17]:
vectorizer_model_2 = CountVectorizer(ngram_range = (1, 1), stop_words = "english")
labels_vector = vectorizer_model_2.fit_transform(labels)
vectorizer_model_2.vocabulary_

{'14': 0,
 '15': 1,
 '17': 2,
 '23': 3,
 '24': 4,
 '25': 5,
 '26': 6,
 '27': 7,
 '33': 8,
 '34': 9,
 '37': 10,
 '41': 11,
 '44': 12,
 '45': 13,
 'aquarius': 14,
 'aries': 15,
 'arts': 16,
 'banking': 17,
 'businessservices': 18,
 'cancer': 19,
 'capricorn': 20,
 'communications': 21,
 'education': 22,
 'engineering': 23,
 'female': 24,
 'gemini': 25,
 'indunk': 26,
 'investmentbanking': 27,
 'leo': 28,
 'libra': 29,
 'male': 30,
 'media': 31,
 'non': 32,
 'pisces': 33,
 'profit': 34,
 'recreation': 35,
 'sagittarius': 36,
 'science': 37,
 'scorpio': 38,
 'sports': 39,
 'student': 40,
 'taurus': 41,
 'virgo': 42}

# 7. Transform the labels - (7.5 points)
As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose, it is convenient to use MultiLabelBinarizer from sklearn
#### a. Convert your train and test labels using MultiLabelBinarizer

In [18]:
keys = []

for key in vectorizer_model_2.vocabulary_.keys():
  keys.append(key)

keys[0:10]

['male',
 '15',
 'student',
 'leo',
 '33',
 'investmentbanking',
 'aquarius',
 'female',
 '14',
 'indunk']

In [0]:
MLB_model = MultiLabelBinarizer(classes = keys) 

In [0]:
labels = [["".join(re.findall("\w",f)) for f in lst] for lst in [s.split(",") for s in labels]]

In [21]:
labels_model = MLB_model.fit(labels) # transforming entire set of lables
labels_model

MultiLabelBinarizer(classes=['male', '15', 'student', 'leo', '33',
                             'investmentbanking', 'aquarius', 'female', '14',
                             'indunk', 'aries', '25', 'capricorn', '17',
                             'gemini', '23', 'non', 'profit', 'cancer',
                             'banking', '37', 'sagittarius', '26', '24',
                             'scorpio', '27', 'education', '45', 'engineering',
                             'libra', ...],
                    sparse_output=False)

In [0]:
Y_train = [["".join(re.findall("\w",f)) for f in lst] for lst in [s.split(",") for s in Y_train]]

In [23]:
Y_train_2 = MLB_model.transform(Y_train)
Y_train_2.shape

(700, 43)

In [0]:
Y_test = [["".join(re.findall("\w",f)) for f in lst] for lst in [s.split(",") for s in Y_test]]

In [25]:
Y_test_2 = MLB_model.transform(Y_test)
Y_test_2[100]

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# 8. Choose a classifier - (5 points)
In this task, we suggest using the One-vs-Rest approach, which is implemented in
OneVsRestClassifier class. In this approach k classifiers (= number of tags) are trained. As a basic classifier, use LogisticRegression . It is one of the simplest methods, but often it performs good enough in text classification tasks. It might take some time because the number of classifiers to train is large.          
#### a. Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on every label             
#### b. As One-vs-Rest approach might not have been discussed in the sessions, we are providing you the code for that

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [0]:
clf = LogisticRegression(solver = 'lbfgs')
clf = OneVsRestClassifier(clf)

# 9. Fit the classifier, make predictions and get the accuracy (5 points)
a. Print the following: i. Accuracy score, ii. F1 score, iii. Average precision score, iv. Average recall score      
v. Tip: Make sure you are familiar with all of them. How would you expect the things to work for the multi-label scenario? Read about micro/macro/weighted averaging

In [29]:
clf.fit(train_data_features, Y_train_2)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [32]:
print("Train Accuracy:", clf.score(train_data_features, Y_train_2))

Train Accuracy: 0.9857142857142858


In [0]:
Y_pred = clf.predict(test_data_features)

In [35]:
print("Test Accuracy:" + str(accuracy_score(Y_test_2, Y_pred)))
print("F1: " + str(f1_score(Y_test_2, Y_pred, average='micro')))
print("F1_macro: " + str(f1_score(Y_test_2, Y_pred, average='macro')))
print("Precision: " + str(precision_score(Y_test_2, Y_pred, average='micro')))
print("Precision_macro: " + str(precision_score(Y_test_2, Y_pred, average='macro')))
print("Recall: " + str(recall_score(Y_test_2, Y_pred, average='micro')))
print("Recall_macro: " + str(recall_score(Y_test_2, Y_pred, average='macro')))

Test Accuracy:0.22333333333333333
F1: 0.6168937329700273
F1_macro: 0.27478691916278347
Precision: 0.8311306901615272
Precision_macro: 0.48662975401868175
Recall: 0.4904679376083189
Recall_macro: 0.217100800285069


# 10. Print true label and predicted label for any five examples (7.5 points)

In [0]:
Y_pred_inv = MLB_model.inverse_transform(Y_pred) 
Y_test_2_inv =  MLB_model.inverse_transform(Y_test_2)

In [42]:
print("Prediction of 40th person: \n", Y_pred_inv[40], "\nActual data of 40th person: \n", Y_test_2_inv[40])

Prediction of 40th person: 
 ('male', 'indunk') 
Actual data of 40th person: 
 ('female', 'indunk', '24', 'scorpio')


In [43]:
print("Prediction of 80th person: \n", Y_pred_inv[80], "\nActual data of 80th person: \n", Y_test_2_inv[80])

Prediction of 80th person: 
 ('aquarius', 'female') 
Actual data of 80th person: 
 ('male', '15', 'student', 'aquarius')


In [44]:
print("Prediction of 120th person: \n", Y_pred_inv[120], "\nActual data of 120th person: \n", Y_test_2_inv[120])

Prediction of 120th person: 
 ('female', 'indunk') 
Actual data of 120th person: 
 ('female', 'indunk', '24', 'scorpio')


In [45]:
print("Prediction of 160th person: \n", Y_pred_inv[160], "Actual data of 160th person: \n", Y_test_2_inv[160])

Prediction of 160th person: 
 ('male',) Actual data of 160th person: 
 ('male', '33', 'investmentbanking', 'aquarius')
