In [0]:
import pandas as pd
import numpy as np
# importing all the necessary packages

In [2]:
from google.colab import drive
drive.mount('/content/drive')
# mounting the drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load the dataset (5 points)

Tip: As the dataset is large, use fewer rows. Check what is working well on your machine and decide accordingly.

In [0]:
import os
# import os

In [0]:
os.chdir('/content/drive/My Drive/Statistical_NLP')
# change the directory

In [0]:

from zipfile import ZipFile
with ZipFile('blog-authorship-corpus.zip', 'r') as z:
  z.extractall()
  # extract the content from the zip file

In [0]:
df=pd.read_csv('blogtext.csv')
# read the blogs csv file

In [13]:
df.head()
# Use 'Head' function to show the first five rows

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [14]:
df.shape

(681284, 7)

In [0]:
df=df[0:20001]

In [16]:
df.topic.value_counts()

indUnk                     7789
Technology                 2989
Student                    2638
Fashion                    1622
Internet                    778
Education                   759
Communications-Media        414
Arts                        358
Engineering                 357
Marketing                   207
Non-Profit                  204
Government                  187
BusinessServices            184
Religion                    182
Consulting                  166
Sports-Recreation           120
Automotive                  111
Manufacturing                93
LawEnforcement-Security      90
Banking                      89
Science                      87
InvestmentBanking            71
Publishing                   70
Museums-Libraries            67
Law                          47
Agriculture                  46
Transportation               46
Architecture                 45
Advertising                  42
Biotech                      36
Accounting                   35
Construc

In [17]:
df.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [18]:
df.dtypes

id         int64
gender    object
age        int64
topic     object
sign      object
date      object
text      object
dtype: object

In [0]:
# datasize is huge. Let's keep only few topics which are our target classe 
# after that perform downsampling to balance the classes
df_student = df[ (df.topic == 'Student') ] 
df_tech =   df[ (df.topic == 'Technology') ]
df_arts     = df[(df.topic == 'Arts')]       
df_non_profit = df[(df.topic == 'Non-Profit')]
df_law = df[df.topic == 'Law']
df_gov = df[(df.topic == 'Government')]
df_rel = df[(df.topic == 'Religion')]
df_fashion = df[(df.topic == 'Fashion')]
#df_marketing = df[(df.topic == 'Marketing')]
#df_advertising = df[(df.topic == 'Advertising')]
#df_BusinessServices=df[(df.topic=='BusinessServices')]

In [0]:
# use resample method from scikit-learn
from sklearn.utils import resample

df_student = resample(df_student, 
                      replace=True,    # sample with replacement
                      n_samples=200,     # to match number of values in each class
                      random_state=123) # reproducible results

df_tech = resample(df_tech, 
                      replace=True,    # sample with replacement
                      n_samples=200,     # to match number of values in each class
                      random_state=123) # reproducible results

df_arts = resample(df_arts, 
                      replace=True,    # sample with replacement
                      n_samples=200,     # to match number of values in each class
                      random_state=123) # reproducible results

df_non_profit = resample(df_non_profit, 
                      replace=True,    # sample with replacement
                      n_samples=200,     # to match number of values in each class
                      random_state=123) # reproducible results

df_law = resample(df_law, 
                      replace=True,    # sample with replacement
                      n_samples=200,     # to match number of values in each class
                      random_state=123) # reproducible results

df_gov = resample(df_gov, 
                      replace=True,    # sample with replacement
                      n_samples=200,     # to match number of values in each class
                      random_state=123) # reproducible results

df_rel = resample(df_rel, 
                      replace=True,    # sample with replacement
                      n_samples=200,     # to match number of values in each class
                      random_state=123) # reproducible results
df_fashion = resample(df_fashion, 
                      replace=True,    # sample with replacement
                      n_samples=200,     # to match number of values in each class
                      random_state=123) # reproducible results


In [21]:
df = pd.concat([df_student, df_tech, df_arts, df_non_profit, df_law, df_gov, df_rel, df_fashion ])

# check for the class balance
df['topic'].value_counts()

Law           200
Student       200
Arts          200
Non-Profit    200
Government    200
Technology    200
Religion      200
Fashion       200
Name: topic, dtype: int64

In [22]:

df.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [23]:
# 1. Lower casing  - change all the words to lower case to avoid duplication. Because "Python" and "python" considered 2 words
df['text'] = df['text'].apply( lambda t : ' '.join( word.lower() for word in t.split()  ) )
df['text'].head(5)

11117    an interesting article i found this intriguing...
9875     what happened to us? by hoobastank album : the...
15898    big thank you to urllink rach for coming to my...
17797    today and tomorrow are the worst days of work....
10053    lol. first time taking a maths exams in such a...
Name: text, dtype: object

2. Preprocess rows of the “text” column (7.5 points)

a. Remove unwanted characters
b. Convert text to lowercase
c. Remove unwanted spaces
d. Remove stopwords

In [24]:
# 2. Remove punctuations
# the [^\w\s] means remove everything, keep only words(w) and spaces(s)
# this step should be done after feature extraction like hashtags, user tagged
df['text'] = df['text'].str.replace( '[^\w\s]' , '' )
df['text'].head(10)

11117    an interesting article i found this intriguing...
9875     what happened to us by hoobastank album  the r...
15898    big thank you to urllink rach for coming to my...
17797    today and tomorrow are the worst days of work ...
10053    lol first time taking a maths exams in such a ...
13763    welcome to the 2nd edition of the supposedly s...
15893    honourable combat the waiting is finally over ...
834      yeah been a few days since i updated lots of s...
17804    having a great holiday so far and finally gett...
222      urllink church bells rang of my failures rows ...
Name: text, dtype: object

In [25]:
# 3 . Remove stop words
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['text'] = df['text'].apply( lambda t : " ".join( word for word in t.split() 
                                                       if word not in stop ) )

In [27]:
# Remove white spaces
df['text'] = df['text'].str.strip()
df['text'].head(10)

11117    interesting article found intriguing urllink a...
9875     happened us hoobastank album reason thought go...
15898    big thank urllink rach coming rescue broke blo...
17797    today tomorrow worst days work days need tell ...
10053    lol first time taking maths exams slacking man...
13763    welcome 2nd edition supposedly sad account exa...
15893    honourable combat waiting finally months excus...
834      yeah days since updated lots stuff going thoug...
17804    great holiday far finally getting little time ...
222      urllink church bells rang failures rows empty ...
Name: text, dtype: object

In [28]:
# Remove Numberic
import re

def remove_num(t):
    removed_num_text = re.sub(r'\d+', '', t)
    return removed_num_text

df['text'] = df['text'].apply( lambda t : remove_num(t) )
df['text'].head(10)

11117    interesting article found intriguing urllink a...
9875     happened us hoobastank album reason thought go...
15898    big thank urllink rach coming rescue broke blo...
17797    today tomorrow worst days work days need tell ...
10053    lol first time taking maths exams slacking man...
13763    welcome nd edition supposedly sad account exam...
15893    honourable combat waiting finally months excus...
834      yeah days since updated lots stuff going thoug...
17804    great holiday far finally getting little time ...
222      urllink church bells rang failures rows empty ...
Name: text, dtype: object

As we want to make this into a multi-label classification problem, you are required to merge
all the label columns together, so that we have all the labels together for a particular sentence
(7.5 points)

In [0]:
df['labels'] ="["+ df.iloc[:,1].astype(str) +","+ df.iloc[:,2].astype(str) +","+ df.iloc[:,3].astype(str) +","+ df.iloc[:,4].astype(str)+"]"

In [30]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,labels
11117,2462841,male,17,Student,Cancer,"17,May,2004",interesting article found intriguing urllink a...,"[male,17,Student,Cancer]"
9875,3976976,female,17,Student,Taurus,"03,August,2004",happened us hoobastank album reason thought go...,"[female,17,Student,Taurus]"
15898,2836391,male,27,Student,Sagittarius,"13,May,2004",big thank urllink rach coming rescue broke blo...,"[male,27,Student,Sagittarius]"
17797,3858875,female,25,Student,Libra,"29,July,2004",today tomorrow worst days work days need tell ...,"[female,25,Student,Libra]"
10053,3756895,male,16,Student,Virgo,"29,June,2004",lol first time taking maths exams slacking man...,"[male,16,Student,Virgo]"


In [0]:
df.drop(['id','gender','age','topic','sign','date'],axis=1, inplace=True)

In [32]:
df.head()

Unnamed: 0,text,labels
11117,interesting article found intriguing urllink a...,"[male,17,Student,Cancer]"
9875,happened us hoobastank album reason thought go...,"[female,17,Student,Taurus]"
15898,big thank urllink rach coming rescue broke blo...,"[male,27,Student,Sagittarius]"
17797,today tomorrow worst days work days need tell ...,"[female,25,Student,Libra]"
10053,lol first time taking maths exams slacking man...,"[male,16,Student,Virgo]"


Separate features and labels, and split the data into training and testing (5 points)

In [0]:
X=df['text']

In [0]:
y=df['labels']

In [0]:
from sklearn.model_selection import train_test_split

X_train , X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

Vectorize the features (5 points)
a. Create a Bag of Words using count vectorizer
i. Use ngram_range=(1, 2)
ii. Vectorize training and testing features
b. Print the term-document matrix

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [37]:
cv = CountVectorizer(ngram_range=(1,2))
cv.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [0]:
X_train_data_cv=cv.transform(X_train)

In [39]:
X_train_data_cv.shape

(1120, 89493)

In [0]:
X_train_data_cv=X_train_data_cv.toarray()

In [0]:
#data_dtm = pd.DataFrame(X_train_data_cv.toarray(), columns=cv.get_feature_names())


In [0]:
X_test_data_cv=cv.transform(X_test)

In [50]:
X_test_data_cv.shape

(480, 89493)

In [0]:
X_test_data_cv=X_test_data_cv.toarray()

Create a dictionary to get the count of every label i.e. the key will be label name and value will
be the total count of the label. Check below image for reference (5 points)

In [112]:
vectorizer_labels = CountVectorizer(min_df = 1,ngram_range = (1,1),stop_words = "english")
labels_vector = vectorizer_labels.fit_transform(y)
vectorizer_labels.vocabulary_

{'13': 0,
 '14': 1,
 '15': 2,
 '16': 3,
 '17': 4,
 '23': 5,
 '24': 6,
 '25': 7,
 '26': 8,
 '27': 9,
 '33': 10,
 '34': 11,
 '35': 12,
 '36': 13,
 '38': 14,
 '42': 15,
 '43': 16,
 '45': 17,
 '46': 18,
 '47': 19,
 'aquarius': 20,
 'aries': 21,
 'arts': 22,
 'cancer': 23,
 'capricorn': 24,
 'fashion': 25,
 'female': 26,
 'gemini': 27,
 'government': 28,
 'law': 29,
 'leo': 30,
 'libra': 31,
 'male': 32,
 'non': 33,
 'pisces': 34,
 'profit': 35,
 'religion': 36,
 'sagittarius': 37,
 'scorpio': 38,
 'student': 39,
 'taurus': 40,
 'technology': 41,
 'virgo': 42}

In [113]:
label_classes = []  
for key in vectorizer_labels.vocabulary_.keys():
    label_classes.append(key)
    
print(sorted(label_classes))

['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33', '34', '35', '36', '38', '42', '43', '45', '46', '47', 'aquarius', 'aries', 'arts', 'cancer', 'capricorn', 'fashion', 'female', 'gemini', 'government', 'law', 'leo', 'libra', 'male', 'non', 'pisces', 'profit', 'religion', 'sagittarius', 'scorpio', 'student', 'taurus', 'technology', 'virgo']


Transform the labels - (7.5 points)

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer

In [0]:
mlb = MultiLabelBinarizer()

In [0]:
y_train_cv=mlb.fit_transform(y_train)

In [0]:
y_test_cv=mlb.transform(y_test)

Choose a classifier - (5 points)

In [0]:
from sklearn.multiclass import OneVsRestClassifier

In [0]:
from sklearn.linear_model import LogisticRegression

In [0]:
clf=LogisticRegression(solver='lbfgs')

In [0]:
clf=OneVsRestClassifier(clf)

Fit the classifier, make predictions and get the accuracy (5 points)
a. Print the following
i. Accuracy score
ii. F1 score
iii. Average precision score
iv. Average recall score

In [62]:
clf.fit(X_train_data_cv,y_train_cv)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [63]:
clf.score(X_test_data_cv,y_test_cv)

0.3770833333333333

In [0]:
from sklearn.metrics import accuracy_score

In [0]:
y_pred=clf.predict(X_test_data_cv)

In [68]:
accuracy_score(y_test_cv, y_pred)

0.3770833333333333

In [0]:
from sklearn.metrics import f1_score

In [72]:
f1_score(y_pred,y_test_cv,average='micro')

0.8676230023908392

In [0]:
from sklearn.metrics import average_precision_score

In [79]:
average_precision_score(y_score=y_pred, y_true=y_test_cv)

0.6279421684237881

In [0]:
from sklearn.metrics import recall_score

In [83]:
recall_score(y_test_cv,y_pred,average='micro')

0.8058672276764843

In [95]:
print(y_test_cv[0:5])

[[1 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 0 0 1 1 1 1 1
  1 0 1 1 1 0 0 0 0]
 [1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 1 1 1 1 1
  0 1 1 1 0 1 0 0 0]
 [1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 0 1 1 1 0 1 1 1 1 1
  0 0 1 1 0 1 0 0 0]
 [1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1
  0 0 1 1 0 0 0 0 1]
 [1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 0 1 1 1 0 1 1 1 1 1
  0 0 1 1 0 1 0 0 0]]


Print true label and predicted label for any five examples

In [0]:
y_test_pred_inv=mlb.inverse_transform(clf.predict(X_test_data_cv)) 

In [0]:
y_test_inv=mlb.inverse_transform(y_test_cv)

In [122]:
print(" predicted :",y_test_pred_inv[0:5])
print(" Actual :",y_test_inv[0:5])

 predicted : [(',', 'A', '[', ']', 'a', 'c', 'e', 'f', 'i', 'l', 'm', 'n', 'o', 'r', 's', 't'), (',', '2', '7', 'A', 'R', '[', ']', 'a', 'e', 'g', 'i', 'l', 'm', 'n', 'o', 'q', 'r', 's', 'u'), (',', '2', '7', 'R', 'T', '[', ']', 'a', 'e', 'f', 'g', 'i', 'l', 'm', 'n', 'o', 'r', 's', 'u'), (',', '[', ']', 'a', 'e', 'i', 'l', 'm', 'n', 'o', 'r', 't'), (',', '2', '7', 'R', 'T', '[', ']', 'a', 'e', 'f', 'g', 'i', 'l', 'm', 'n', 'o', 'r', 's', 'u')]
 Actual : [(',', '2', '3', 'A', 'C', '[', ']', 'a', 'c', 'e', 'f', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't'), (',', '2', '7', 'A', 'R', '[', ']', 'a', 'e', 'g', 'i', 'l', 'm', 'n', 'o', 'q', 'r', 's', 'u'), (',', '2', '7', 'R', 'T', '[', ']', 'a', 'e', 'f', 'g', 'i', 'l', 'm', 'n', 'o', 'r', 's', 'u'), (',', '3', '5', 'A', 'T', '[', ']', 'a', 'c', 'e', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'r', 's', 'y'), (',', '2', '7', 'R', 'T', '[', ']', 'a', 'e', 'f', 'g', 'i', 'l', 'm', 'n', 'o', 'r', 's', 'u')]
