In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, average_precision_score, accuracy_score, recall_score
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from pprint import pprint
import warnings
# NOTE(review): with no category given, this suppresses every warning for the rest of
# the session, so diagnostics raised by pandas or scikit-learn in later cells stay hidden.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Colab-only step: mount Google Drive at /content/drive so the dataset path
# under '/content/drive/My Drive/...' read in the loading cell can be resolved.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## 1. Load the dataset

In [0]:
# Load the blog corpus CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/great-learning/nlp/project/blogtext.csv'
)

In [74]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(681284, 7)

Selecting 2000 rows only, due to memory constraints.

In [0]:
# Keep only the first 2000 rows. The explicit .copy() makes the subset an independent
# frame, so the column assignments in the following cells modify it directly rather than
# a slice of the full dataset (the situation pandas flags with SettingWithCopyWarning).
df = df.head(2000).copy()

In [76]:
# Summarise the working frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
id        2000 non-null int64
gender    2000 non-null object
age       2000 non-null int64
topic     2000 non-null object
sign      2000 non-null object
date      2000 non-null object
text      2000 non-null object
dtypes: int64(2), object(5)
memory usage: 109.5+ KB


## 2. Preprocess rows of the “text” column

a. Remove unwanted characters

In [0]:
# Delete every character that is neither an ASCII letter nor a space.
df['text'] = df['text'].replace(
    to_replace='[^a-zA-Z ]',
    value='',
    regex=True,
)

b. Convert text to lowercase

In [0]:
# Lowercase every post so that tokens differing only in case are treated as the same word.
df['text'] = df['text'].str.lower()

c. Remove unwanted spaces

In [0]:
# Collapse each run of whitespace into a single space.
# The pattern is a raw string: in a normal literal '\s' is an invalid escape sequence,
# which Python reports as a DeprecationWarning (a SyntaxWarning from 3.12 onwards).
df['text'] = df['text'].replace(r'\s+', ' ', regex=True)

d. Remove stopwords

In [0]:
# English stopword list from the NLTK 'stopwords' corpus downloaded in the first cell.
stop = stopwords.words('english')

In [0]:
# Drop stopwords from every post and re-join the remaining tokens with single spaces.
# The stopword list is converted to a set once, so each membership test is a hash lookup
# instead of a linear scan of the list for every token; the resulting text is unchanged.
stop_words = set(stop)
df['text'] = df['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)

In [0]:
# Cast age to string so it can be combined with the string-valued gender, topic and
# sign columns when the label columns are merged into one list per row.
df['age'] = df['age'].astype(str)

In [83]:
# Preview the first five rows after text preprocessing.
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...


## 3. Merge all the label columns together

In [0]:
# Build one list per row holding that row's gender, age, topic and sign values.
df['labels'] = (
    df[['gender', 'age', 'topic', 'sign']]
    .to_numpy()
    .tolist()
)

In [0]:
# Keep only 'text' and 'labels'; the individual label columns are already merged into
# 'labels', and 'id' and 'date' are not used afterwards in this notebook.
# Reassigning the result (instead of inplace=True) is the recommended pandas idiom and
# avoids mutating a frame that may still be a slice of another one.
df = df.drop(columns=['id', 'gender', 'age', 'topic', 'sign', 'date'])

In [86]:
# Preview the first five rows of the reduced frame (text plus merged labels).
df.head()

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoos toolbar capture urls popupswhich...,"[male, 33, InvestmentBanking, Aquarius]"


## 4. Separate features and labels, and split the data into training and testing

In [0]:
# Features are the cleaned post text; targets are the per-row label lists.
X, y = df['text'], df['labels']

In [0]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

## 5. Vectorize the features

a. Create a Bag of Words using count vectorizer

In [0]:
# Bag-of-words counts over unigrams and bigrams.
vect = CountVectorizer(ngram_range=(1,2))
# The vocabulary is learned from the training text only ...
X_train_dtm = vect.fit_transform(X_train)
# ... and the test text is encoded with that same vocabulary (no refitting).
X_test_dtm = vect.transform(X_test)

b. Print the document-term matrix

In [90]:
# Show the sparse matrix summary: shape, dtype and number of stored entries.
X_train_dtm

<1500x153299 sparse matrix of type '<class 'numpy.int64'>'
	with 266939 stored elements in Compressed Sparse Row format>

In [91]:
# Print the stored (row, column) -> count entries of the training matrix.
print(X_train_dtm)

  (0, 29621)	2
  (0, 110268)	1
  (0, 4476)	1
  (0, 25427)	1
  (0, 36672)	1
  (0, 105708)	1
  (0, 62056)	1
  (0, 122801)	1
  (0, 77867)	1
  (0, 103417)	1
  (0, 115983)	1
  (0, 133572)	1
  (0, 44837)	1
  (0, 138006)	1
  (0, 52424)	1
  (0, 84072)	1
  (0, 148521)	1
  (0, 27477)	1
  (0, 148329)	1
  (0, 91772)	1
  (0, 52193)	2
  (0, 144568)	1
  (0, 146256)	3
  (0, 136298)	2
  (0, 50149)	1
  :	:
  (1499, 135968)	1
  (1499, 74575)	2
  (1499, 20100)	1
  (1499, 20107)	1
  (1499, 53937)	1
  (1499, 89011)	1
  (1499, 120520)	1
  (1499, 75331)	1
  (1499, 151326)	1
  (1499, 132254)	1
  (1499, 69608)	1
  (1499, 49804)	1
  (1499, 50393)	1
  (1499, 62601)	1
  (1499, 99967)	1
  (1499, 18277)	1
  (1499, 139253)	1
  (1499, 57143)	1
  (1499, 85666)	1
  (1499, 119895)	1
  (1499, 34772)	1
  (1499, 17648)	1
  (1499, 66852)	1
  (1499, 111879)	1
  (1499, 101604)	1


In [92]:
# Print the stored (row, column) -> count entries of the test matrix.
print(X_test_dtm)

  (0, 941)	1
  (0, 1096)	1
  (0, 4095)	1
  (0, 6073)	1
  (0, 7297)	1
  (0, 7339)	1
  (0, 7730)	1
  (0, 11925)	1
  (0, 11947)	1
  (0, 14982)	1
  (0, 19313)	1
  (0, 19332)	1
  (0, 23969)	1
  (0, 28421)	1
  (0, 29087)	1
  (0, 29122)	1
  (0, 29505)	2
  (0, 32267)	1
  (0, 32269)	1
  (0, 33388)	1
  (0, 33511)	1
  (0, 34185)	1
  (0, 34316)	1
  (0, 36205)	1
  (0, 37066)	1
  :	:
  (499, 112596)	1
  (499, 112705)	1
  (499, 113871)	1
  (499, 117784)	2
  (499, 120896)	1
  (499, 122735)	1
  (499, 126997)	1
  (499, 131335)	1
  (499, 131541)	1
  (499, 132661)	1
  (499, 132792)	1
  (499, 138875)	1
  (499, 142119)	1
  (499, 144177)	1
  (499, 144293)	1
  (499, 144812)	1
  (499, 144862)	1
  (499, 144912)	1
  (499, 145747)	1
  (499, 146256)	1
  (499, 146498)	1
  (499, 146756)	1
  (499, 148012)	1
  (499, 150260)	1
  (499, 150559)	1


## 6. Create a dictionary to get the count of every label

In [93]:
# Count how many rows carry each label value across the merged label lists.
# The Series is iterated directly: Series.iteritems(), used previously, was removed
# in pandas 2.0 and only the values (not the index) are needed here.
labels_dict = {}
for row_labels in df.labels:
  for item in row_labels:
    # dict.get supplies 0 the first time a label is seen.
    labels_dict[item] = labels_dict.get(item, 0) + 1

pprint(labels_dict)

{'14': 74,
 '15': 299,
 '16': 25,
 '17': 147,
 '23': 93,
 '24': 334,
 '25': 110,
 '26': 43,
 '27': 86,
 '33': 94,
 '34': 6,
 '35': 607,
 '37': 19,
 '39': 32,
 '41': 14,
 '44': 3,
 '45': 14,
 'Accounting': 2,
 'Aquarius': 286,
 'Aries': 699,
 'Arts': 2,
 'Banking': 16,
 'BusinessServices': 21,
 'Cancer': 76,
 'Capricorn': 77,
 'Communications-Media': 14,
 'Education': 118,
 'Engineering': 119,
 'Gemini': 21,
 'Internet': 20,
 'InvestmentBanking': 70,
 'Leo': 55,
 'Libra': 313,
 'Museums-Libraries': 2,
 'Non-Profit': 46,
 'Pisces': 2,
 'Sagittarius': 113,
 'Science': 33,
 'Scorpio': 243,
 'Sports-Recreation': 75,
 'Student': 403,
 'Taurus': 76,
 'Technology': 607,
 'Virgo': 39,
 'female': 728,
 'indUnk': 452,
 'male': 1272}


## 7. Transform the labels

In [94]:
# Learn the label vocabulary from the training labels.
# NOTE(review): a label that occurs only in y_test is unknown to this binarizer —
# confirm how transform() treats it, since warnings are silenced in this notebook.
mlb = MultiLabelBinarizer()
mlb.fit(y_train.to_list())

MultiLabelBinarizer(classes=None, sparse_output=False)

In [95]:
# Label values the binarizer learned during fit.
mlb.classes_

array(['14', '15', '16', '17', '23', '24', '25', '26', '27', '33', '34',
       '35', '37', '39', '41', '44', '45', 'Accounting', 'Aquarius',
       'Aries', 'Arts', 'Banking', 'BusinessServices', 'Cancer',
       'Capricorn', 'Communications-Media', 'Education', 'Engineering',
       'Gemini', 'Internet', 'InvestmentBanking', 'Leo', 'Libra',
       'Museums-Libraries', 'Non-Profit', 'Pisces', 'Sagittarius',
       'Science', 'Scorpio', 'Sports-Recreation', 'Student', 'Taurus',
       'Technology', 'Virgo', 'female', 'indUnk', 'male'], dtype=object)

a. Convert your train and test labels using MultiLabelBinarizer

In [0]:
# Encode the training label lists as a binary indicator matrix.
y_train_transformed = mlb.transform(y_train.to_list())

In [0]:
# Encode the test label lists with the binarizer fitted on the training labels.
y_test_transformed = mlb.transform(y_test.to_list())

## 8. Choose a classifier

In [0]:
# One-vs-rest wrapper around an lbfgs logistic regression base estimator.
clf = OneVsRestClassifier(LogisticRegression(solver='lbfgs'))

## 9. Fit the classifier, make predictions and get the accuracy 

In [99]:
# Train the one-vs-rest model on the bag-of-words training matrix and binarized labels.
clf.fit(X_train_dtm, y_train_transformed)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [0]:
# Predictions on the data the model was trained on (used for the train-set metrics).
y_train_pred = clf.predict(X_train_dtm)

In [0]:
# Predictions on the held-out test matrix (used for the test-set metrics).
y_test_pred = clf.predict(X_test_dtm)

### Accuracy Score

Train Set

In [102]:
# On multilabel indicator arrays accuracy_score reports subset accuracy:
# a row counts as correct only when every one of its labels matches.
accuracy_score(y_train_transformed, y_train_pred)

0.982

Test Set

In [103]:
# Subset (exact-match) accuracy on the held-out test rows.
accuracy_score(y_test_transformed, y_test_pred)

0.372

### F1 score

Train Set

In [104]:
# Training-set F1, averaged over labels with weights given by each label's support.
f1_score(y_train_transformed, y_train_pred, average='weighted')

0.990985457917627

Test Set

In [105]:
# Test-set F1, averaged over labels with weights given by each label's support.
f1_score(y_test_transformed, y_test_pred, average='weighted')

0.5988529324638497

### Average Precision Score

Train Set

In [106]:
# Average precision is computed from continuous decision scores rather than the
# hard 0/1 predictions, so fetch the per-label scores for the training matrix first.
y_score = clf.decision_function(X_train_dtm)
# Support-weighted mean of the per-label average precision on the training set.
average_precision_score(y_train_transformed, y_score, average='weighted')

0.99966955762978

Test Set

In [107]:
# Per-label decision scores for the test matrix.
# Note: this rebinds y_score, replacing the training-set scores from the previous cell.
y_score = clf.decision_function(X_test_dtm)
# Support-weighted mean of the per-label average precision on the test set.
average_precision_score(y_test_transformed, y_score, average='weighted')

0.7211126445963525

### Average Recall Score

Train Set

In [108]:
# Training-set recall, averaged over labels with weights given by each label's support.
recall_score(y_train_transformed,y_train_pred, average='weighted')

0.9875

Test Set

In [109]:
# Test-set recall, averaged over labels with weights given by each label's support.
recall_score(y_test_transformed,y_test_pred,average='weighted')

0.551

## 10. Print true label and predicted label for any five examples

In [110]:
# Decode the binary prediction matrix back into tuples of label names.
predicted_labels = mlb.inverse_transform(y_test_pred)

# Show the test rows at positions 1 through 5 next to what the model predicted.
for position in range(1, 6):
  print('\nExample ', position)
  actual = y_test.iloc[position]
  print('True label: ', actual)
  guessed = predicted_labels[position]
  print('Predicted label: ', guessed)


Example  1
True label:  ['male', '35', 'Technology', 'Aries']
Predicted label:  ('male',)

Example  2
True label:  ['female', '26', 'Science', 'Sagittarius']
Predicted label:  ('35', 'Aries', 'Technology', 'male')

Example  3
True label:  ['male', '39', 'Education', 'Virgo']
Predicted label:  ('female',)

Example  4
True label:  ['male', '17', 'Student', 'Virgo']
Predicted label:  ('24', '26', 'Libra', 'Student', 'female')

Example  5
True label:  ['male', '35', 'Technology', 'Aries']
Predicted label:  ('35', 'Aries', 'Technology', 'male')
