# Building classifier that predicts multiple features of the author of a given text.

#### Mounting the drive to work on Google Colab

In [80]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


- ### Let's import necessary packages for building model

In [0]:
import nltk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [157]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
os.chdir('/content/drive/My Drive/AIML_Projects/Statistical NLP')

In [159]:
import os
os.getcwd()

'/content/drive/My Drive/AIML_Projects/Statistical NLP'

In [160]:
os.listdir()                                                                    # Checking the contents in the project_path 

['blog-authorship-corpus.zip', 'blogtext.csv']

In [0]:
project_path = '/content/drive/My Drive/AIML_Projects/Statistical NLP/'

In [0]:
zip_path = project_path + 'blog-authorship-corpus.zip'                          # Specifing the zip_path 

In [0]:
from zipfile import ZipFile                                                     # Extracting the zip file 
with ZipFile(zip_path , 'r') as z:
  z.extractall()

In [0]:
blog = pd.read_csv(r'blogtext.csv')

In [164]:
blog.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [165]:
blog.shape

(681284, 7)

In [0]:
df = blog.copy()

In [167]:
df.shape

(681284, 7)

### Let's look for first few records in `'text'` column.

In [168]:
df.iloc[0,6]

'           Info has been found (+/- 100 pages, and 4.5 MB of .pdf files) Now i have to wait untill our team leader has processed it and learns html.         '

In [169]:
df.iloc[1,6]

'           These are the team members:   Drewes van der Laag           urlLink mail  Ruiyu Xie                     urlLink mail  Bryan Aaldering (me)          urlLink mail          '

- ### From the above two results we can observe that there is need to cleaning the text columns.
- ## We need to perform below mentioned text cleaning steps.
> - a. Remove unwanted characters
> - b. Convert text to lowercase
> - c. Remove unwanted spaces
> - d. Remove stopwords

### Information about a DataFrame including the index dtype and column dtypes, non-null values and memory usage.

In [170]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
id        681284 non-null int64
gender    681284 non-null object
age       681284 non-null int64
topic     681284 non-null object
sign      681284 non-null object
date      681284 non-null object
text      681284 non-null object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


### Descriptive statistics to summarize the central tendency, dispersion and shape of a dataset’s distribution, excluding NaN values. 
### Analyzing both numeric and object column.

In [171]:
df.describe(include = 'all')

Unnamed: 0,id,gender,age,topic,sign,date,text
count,681284.0,681284,681284.0,681284,681284,681284,681284
unique,,2,,40,12,2616,611652
top,,male,,indUnk,Cancer,"02,August,2004",urlLink
freq,,345193,,251015,65048,16544,445
mean,2397802.0,,23.932326,,,,
std,1247723.0,,7.786009,,,,
min,5114.0,,13.0,,,,
25%,1239610.0,,17.0,,,,
50%,2607577.0,,24.0,,,,
75%,3525660.0,,26.0,,,,


### Removing unwanted characters in text column using regular expressions.

In [0]:
import re

In [173]:
print('Before removing unwanted spaces\n', df.iloc[1,6])

Before removing unwanted spaces
            These are the team members:   Drewes van der Laag           urlLink mail  Ruiyu Xie                     urlLink mail  Bryan Aaldering (me)          urlLink mail          


In [0]:
df['text'] = df['text'].apply(lambda x: re.sub('[^A-Za-z0-9 ]', '', x)) #re.sub('[^A-Za-z0-9 ]+', '', blog.iloc[0,6])
df['text'] = df['text'].apply(lambda x: re.sub('urlLink', '', x)) #re.sub('[^A-Za-z0-9 ]+', '', blog.iloc[0,6])
df['text'] = df['text'].apply(lambda x: re.sub('mail', '', x)) #re.sub('[^A-Za-z0-9 ]+', '', blog.iloc[0,6])

In [178]:
print('After removing unwanted spaces\n', df.iloc[1,6])

After removing unwanted spaces
            These are the team members   Drewes van der Laag              Ruiyu Xie                        Bryan Aaldering me                     


### Removing unwanted leading and trailing spaces in text column using strip() method.

In [180]:
print('Before removing unwanted spaces\n', df.iloc[1,6])

Before removing unwanted spaces
            These are the team members   Drewes van der Laag              Ruiyu Xie                        Bryan Aaldering me                     


In [0]:
df['text'] = df['text'].apply(lambda x: x.strip())
df['text'] = df['text'].apply(lambda x: x.replace("  "," "))

In [0]:
df['text'] = df['text'].apply(lambda x: x.replace("       "," "))

In [184]:
print('After removing unwanted spaces\n', df.iloc[1,6])

After removing unwanted spaces
 These are the team members  Drewes van der Laag Ruiyu Xie      Bryan Aaldering me


### Converting text to lowercase using lower() method.

In [185]:
print('Before making all charachters to lower case\n', df.iloc[0,6])

Before making all charachters to lower case
 Info has been found 100 pages and 45 MB of pdf files Now i have to wait untill our team leader has processed it and learns html


In [0]:
df['text'] = df['text'].apply(lambda x: x.lower())

In [187]:
print('After making all charachters to lower case\n', df.iloc[0,6])

After making all charachters to lower case
 info has been found 100 pages and 45 mb of pdf files now i have to wait untill our team leader has processed it and learns html


### Removing stopwords from text using package "NLTK".

In [0]:
from nltk.corpus import stopwords

In [189]:
print('Before removing stopwords\n', df.iloc[0,6])

Before removing stopwords
 info has been found 100 pages and 45 mb of pdf files now i have to wait untill our team leader has processed it and learns html


In [190]:
print('Length of text before removing stopwords -', len(df.iloc[0,6]))

Length of text before removing stopwords - 127


In [0]:
from collections import Counter
stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords_dict]))

In [192]:
print('After removing stopwords\n', df.iloc[0,6])

After removing stopwords
 info found 100 pages 45 mb pdf files wait untill team leader processed learns html


In [193]:
print('Length of text after removing stopwords -', len(df.iloc[0,6]))

Length of text after removing stopwords - 82


## Let's take the 5% of dataset to build our model to classify multiple features of the author of a given text.

In [0]:
df1 = df.sample(frac=0.05,random_state=42)

### Merging all the label columns together, so that we have all the labels together for a particular sentence.
> - #### Label columns to merge: “gender”, “age”, “topic”, “sign”

In [0]:
#df['new'] = (blog[['gender', 'age', 'topic', 'sign']].iloc[0:6,:].apply(lambda x: ' '.join(str(x)),axis = 0)).tolist()

In [0]:
df1['labels'] = df1[['gender', 'age', 'topic', 'sign']].apply(lambda x: [','.join(x.astype(str))],axis=1)

In [200]:
df1.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,labels
240436,1058543,female,24,indUnk,Sagittarius,"02,August,2004",1 corinthians 511 written keep company anyone ...,"[female,24,indUnk,Sagittarius]"
54139,3440336,female,23,Arts,Virgo,"08,July,2004",moved jersey city nearly month ago idea would ...,"[female,23,Arts,Virgo]"
637911,3546243,female,24,indUnk,Taurus,"02,July,2004",hey metro need save money take hike comes heel...,"[female,24,indUnk,Taurus]"
61178,3367064,female,26,indUnk,Taurus,"29,July,2004",hilarious movie seen year comedy movie absolut...,"[female,26,indUnk,Taurus]"
314089,3590478,female,34,Technology,Capricorn,"20,April,2004",short attached bumper pickup truck cock ring d...,"[female,34,Technology,Capricorn]"


### As we have to focus only on text and labels column we will create seperate dataframe of `“text”` and `“labels”`.


In [0]:
blog_df = df1[['text','labels']]

In [204]:
blog_df.head()

Unnamed: 0,text,labels
240436,1 corinthians 511 written keep company anyone ...,"[female,24,indUnk,Sagittarius]"
54139,moved jersey city nearly month ago idea would ...,"[female,23,Arts,Virgo]"
637911,hey metro need save money take hike comes heel...,"[female,24,indUnk,Taurus]"
61178,hilarious movie seen year comedy movie absolut...,"[female,26,indUnk,Taurus]"
314089,short attached bumper pickup truck cock ring d...,"[female,34,Technology,Capricorn]"


### Separating features and labels, and spliting the data into training and testing.

In [0]:
features = blog_df['text']
labels = blog_df['labels']

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.30, random_state=5)

In [208]:
print(X_train.shape)

(23844,)


In [209]:
print(X_test.shape)

(10220,)


In [210]:
print(y_train.shape)

(23844,)


In [211]:
print(y_test.shape)

(10220,)


#### Import and instantiating CountVectorizer (with the default parameters) and ngram_range=(1, 2)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1, 2))

### Learning the 'vocabulary' of the `'text'` column.

In [213]:
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

#### Transforming training data(X_train) into a 'document-term matrix'.

In [0]:
X_train_dtm = vect.transform(X_train)

#### Transforming test data(X_test) into a 'document-term matrix'.

In [0]:
X_test_dtm = vect.transform(X_test)

In [216]:
type(X_train_dtm)

scipy.sparse.csr.csr_matrix

In [217]:
type(X_train_dtm)

scipy.sparse.csr.csr_matrix

In [218]:
print(X_train_dtm.shape)

(23844, 1727483)


In [219]:
print(X_test_dtm.shape)

(10220, 1727483)


### Displaying Document Term Matrix for X_train

In [220]:
print(X_train_dtm)

  (0, 92075)	1
  (0, 92144)	1
  (0, 116054)	1
  (0, 116799)	1
  (0, 179645)	1
  (0, 180172)	1
  (0, 332376)	1
  (0, 332510)	1
  (0, 433367)	1
  (0, 433894)	1
  (0, 462795)	1
  (0, 464109)	1
  (0, 609528)	1
  (0, 610660)	1
  (0, 618736)	3
  (0, 619691)	1
  (0, 619760)	1
  (0, 619966)	1
  (0, 620784)	1
  (0, 621409)	1
  (0, 681320)	2
  (0, 681582)	1
  (0, 681641)	1
  (0, 692153)	2
  (0, 692155)	1
  :	:
  (23843, 880164)	1
  (23843, 900462)	1
  (23843, 900463)	1
  (23843, 1026949)	1
  (23843, 1026950)	1
  (23843, 1085957)	1
  (23843, 1085958)	1
  (23843, 1086240)	1
  (23843, 1086241)	1
  (23843, 1131855)	1
  (23843, 1132007)	1
  (23843, 1241544)	1
  (23843, 1241545)	1
  (23843, 1263039)	1
  (23843, 1263040)	1
  (23843, 1297282)	1
  (23843, 1297283)	1
  (23843, 1330177)	1
  (23843, 1330178)	1
  (23843, 1422606)	1
  (23843, 1422607)	1
  (23843, 1618845)	1
  (23843, 1619200)	1
  (23843, 1724147)	1
  (23843, 1724148)	1


### Displaying Document Term Matrix for X_test

In [221]:
print(X_test_dtm)

  (0, 468353)	1
  (0, 468705)	1
  (0, 833759)	1
  (0, 834715)	1
  (0, 1085290)	1
  (0, 1232847)	1
  (0, 1295303)	1
  (0, 1640865)	1
  (0, 1642068)	1
  (0, 1712098)	1
  (1, 140433)	1
  (1, 142556)	1
  (1, 287352)	1
  (1, 356888)	1
  (1, 356923)	1
  (1, 371337)	1
  (1, 426512)	1
  (1, 459755)	1
  (1, 563467)	1
  (1, 563646)	1
  (1, 572872)	2
  (1, 573189)	1
  (1, 621864)	1
  (1, 624605)	1
  (1, 641047)	1
  :	:
  (10219, 1025601)	1
  (10219, 1027109)	2
  (10219, 1028956)	1
  (10219, 1074985)	1
  (10219, 1075056)	1
  (10219, 1102674)	1
  (10219, 1207731)	2
  (10219, 1254822)	1
  (10219, 1292909)	1
  (10219, 1338057)	1
  (10219, 1348539)	1
  (10219, 1352522)	1
  (10219, 1372332)	1
  (10219, 1372559)	1
  (10219, 1433465)	1
  (10219, 1444262)	1
  (10219, 1444421)	1
  (10219, 1462859)	1
  (10219, 1517051)	1
  (10219, 1517465)	1
  (10219, 1569969)	1
  (10219, 1634560)	1
  (10219, 1640865)	1
  (10219, 1682222)	1
  (10219, 1712098)	1


### Creating a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label.

#### Creating empty dict

In [0]:
myDict = dict()

In [223]:
for i, j in enumerate(labels):
    my_list = j[0].split(',')
  
    for item in my_list:
        if (item in myDict): 
            myDict[item] += 1
        else: 
             myDict[item] = 1     
for key, value in myDict.items(): 
      print ("% s : % d"%(key, value))

female :  16903
24 :  3947
indUnk :  12649
Sagittarius :  2470
23 :  3711
Arts :  1555
Virgo :  3100
Taurus :  3212
26 :  2674
34 :  1086
Technology :  1993
Capricorn :  2496
male :  17161
27 :  2254
25 :  3420
Leo :  2616
40 :  249
Pisces :  2683
14 :  1433
Education :  1451
Libra :  3110
Gemini :  2566
17 :  4121
16 :  3625
Aquarius :  2513
Student :  7890
Aries :  3212
37 :  457
LawEnforcement-Security :  95
Scorpio :  2859
Law :  457
Manufacturing :  127
Cancer :  3227
Marketing :  265
42 :  126
35 :  841
Non-Profit :  697
39 :  280
Communications-Media :  1001
Engineering :  607
15 :  2082
33 :  902
13 :  671
Internet :  838
Publishing :  375
48 :  156
Science :  367
Religion :  246
Museums-Libraries :  123
Biotech :  102
43 :  231
HumanResources :  149
44 :  97
Sports-Recreation :  170
Banking :  192
41 :  178
36 :  667
Tourism :  90
Advertising :  217
BusinessServices :  239
Accounting :  184
Military :  141
Chemicals :  213
47 :  100
Agriculture :  57
38 :  382
InvestmentBankin

In [224]:
print(myDict)

{'female': 16903, '24': 3947, 'indUnk': 12649, 'Sagittarius': 2470, '23': 3711, 'Arts': 1555, 'Virgo': 3100, 'Taurus': 3212, '26': 2674, '34': 1086, 'Technology': 1993, 'Capricorn': 2496, 'male': 17161, '27': 2254, '25': 3420, 'Leo': 2616, '40': 249, 'Pisces': 2683, '14': 1433, 'Education': 1451, 'Libra': 3110, 'Gemini': 2566, '17': 4121, '16': 3625, 'Aquarius': 2513, 'Student': 7890, 'Aries': 3212, '37': 457, 'LawEnforcement-Security': 95, 'Scorpio': 2859, 'Law': 457, 'Manufacturing': 127, 'Cancer': 3227, 'Marketing': 265, '42': 126, '35': 841, 'Non-Profit': 697, '39': 280, 'Communications-Media': 1001, 'Engineering': 607, '15': 2082, '33': 902, '13': 671, 'Internet': 838, 'Publishing': 375, '48': 156, 'Science': 367, 'Religion': 246, 'Museums-Libraries': 123, 'Biotech': 102, '43': 231, 'HumanResources': 149, '44': 97, 'Sports-Recreation': 170, 'Banking': 192, '41': 178, '36': 667, 'Tourism': 90, 'Advertising': 217, 'BusinessServices': 239, 'Accounting': 184, 'Military': 141, 'Chemica

### Transforming the labels

In [0]:
list_class = [] 
for key in myDict.keys(): 
    list_class.append(key) 
list_class_array=np.array(list_class)

In [226]:
list_class_array

array(['female', '24', 'indUnk', 'Sagittarius', '23', 'Arts', 'Virgo',
       'Taurus', '26', '34', 'Technology', 'Capricorn', 'male', '27',
       '25', 'Leo', '40', 'Pisces', '14', 'Education', 'Libra', 'Gemini',
       '17', '16', 'Aquarius', 'Student', 'Aries', '37',
       'LawEnforcement-Security', 'Scorpio', 'Law', 'Manufacturing',
       'Cancer', 'Marketing', '42', '35', 'Non-Profit', '39',
       'Communications-Media', 'Engineering', '15', '33', '13',
       'Internet', 'Publishing', '48', 'Science', 'Religion',
       'Museums-Libraries', 'Biotech', '43', 'HumanResources', '44',
       'Sports-Recreation', 'Banking', '41', '36', 'Tourism',
       'Advertising', 'BusinessServices', 'Accounting', 'Military',
       'Chemicals', '47', 'Agriculture', '38', 'InvestmentBanking',
       'Transportation', 'Government', 'Fashion', '46', '45',
       'Architecture', 'Consulting', 'Telecommunications', 'RealEstate',
       'Maritime', 'Environment', 'Construction', 'Automotive'],
    

In [0]:
# transform to dictionary as Acceptable format of MultiLabelBinarizer
y_train_pass = [set(i[0].split(',')) for i in y_train]
y_test_pass = [set(i[0].split(',')) for i in y_test]

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer

In [0]:
mlb = MultiLabelBinarizer()

In [230]:
mlb.fit(y_train_pass)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [231]:
mlb.transform(y_train_pass)

array([[0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0]])

In [232]:
mlb.transform(y_test_pass)

array([[0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [233]:
len(mlb.transform(y_test_pass))

10220

In [234]:
# retriving the lables 
mlb.classes_

array(['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', 'Accounting', 'Advertising', 'Agriculture',
       'Aquarius', 'Architecture', 'Aries', 'Arts', 'Automotive',
       'Banking', 'Biotech', 'BusinessServices', 'Cancer', 'Capricorn',
       'Chemicals', 'Communications-Media', 'Construction', 'Consulting',
       'Education', 'Engineering', 'Environment', 'Fashion', 'Gemini',
       'Government', 'HumanResources', 'Internet', 'InvestmentBanking',
       'Law', 'LawEnforcement-Security', 'Leo', 'Libra', 'Manufacturing',
       'Maritime', 'Marketing', 'Military', 'Museums-Libraries',
       'Non-Profit', 'Pisces', 'Publishing', 'RealEstate', 'Religion',
       'Sagittarius', 'Science', 'Scorpio', 'Sports-Recreation',
       'Student', 'Taurus', 'Technology', 'Telecommunications', 'Tourism',
       'Transportation', 'Virgo', 'female', 'indUnk', 'male'],
      dtype

In [0]:
y_trn_mlb = mlb.transform(y_train_pass)

In [0]:
y_test_mlb =mlb.transform(y_test_pass)

## Choose a classifier

In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [0]:
clf = LogisticRegression(solver = 'lbfgs',n_jobs=-1,max_iter=2000,verbose=2)
clf = OneVsRestClassifier(clf,n_jobs=-1)

In [245]:
clf.fit(X_train_dtm,y_trn_mlb)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=2000,
                                                 multi_class='auto', n_jobs=-1,
                                                 penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=2, warm_start=False),
                    n_jobs=-1)

In [0]:
pred_class = clf.predict (X_test_dtm)                                           # Predicting on test data

In [248]:
from sklearn import metrics                                                     
metrics.accuracy_score(y_test_mlb, pred_class)

0.006653620352250489

In [249]:
print(metrics.classification_report(y_test_mlb, pred_class))

              precision    recall  f1-score   support

           0       0.43      0.05      0.09       200
           1       0.33      0.05      0.08       418
           2       0.25      0.03      0.06       617
           3       0.41      0.09      0.14      1056
           4       0.39      0.09      0.15      1237
           5       0.25      0.02      0.04      1095
           6       0.26      0.04      0.06      1197
           7       0.21      0.02      0.04      1029
           8       0.20      0.02      0.03       806
           9       0.30      0.03      0.05       693
          10       0.38      0.02      0.03       277
          11       0.51      0.06      0.11       344
          12       0.56      0.04      0.07       249
          13       0.17      0.01      0.01       175
          14       0.71      0.07      0.13       145
          15       0.30      0.03      0.05       116
          16       0.00      0.00      0.00        78
          17       0.85    

## Tried with solver `'saga'` which is preffred for large dataset.

In [0]:
df2 = df.sample(frac=0.1,random_state=42)

In [0]:
df2['labels'] = df2[['gender', 'age', 'topic', 'sign']].apply(lambda x: [','.join(x.astype(str))],axis=1)

In [256]:
df2.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,labels
240436,1058543,female,24,indUnk,Sagittarius,"02,August,2004",1 corinthians 511 written keep company anyone ...,"[female,24,indUnk,Sagittarius]"
54139,3440336,female,23,Arts,Virgo,"08,July,2004",moved jersey city nearly month ago idea would ...,"[female,23,Arts,Virgo]"
637911,3546243,female,24,indUnk,Taurus,"02,July,2004",hey metro need save money take hike comes heel...,"[female,24,indUnk,Taurus]"
61178,3367064,female,26,indUnk,Taurus,"29,July,2004",hilarious movie seen year comedy movie absolut...,"[female,26,indUnk,Taurus]"
314089,3590478,female,34,Technology,Capricorn,"20,April,2004",short attached bumper pickup truck cock ring d...,"[female,34,Technology,Capricorn]"


### As we have to focus only on text and labels column we will create seperate dataframe of `“text”` and `“labels”`.


In [0]:
blog_df2 = df2[['text','labels']]

### Separating features and labels, and spliting the data into training and testing.

In [0]:
features = blog_df2['text']
labels = blog_df2['labels']

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(features, labels, test_size=0.30, random_state=5)

In [269]:
print(X_train1.shape)

(47689,)


In [270]:
print(X_test1.shape)

(20439,)


In [271]:
print(y_train1.shape)

(47689,)


In [272]:
print(y_test1.shape)

(20439,)


#### Import and instantiating CountVectorizer (with the default parameters) and ngram_range=(1, 2)

In [0]:
vect2 = CountVectorizer(ngram_range=(1, 2))

### Learning the 'vocabulary' of the `'text'` column.

In [275]:
vect2.fit(X_train1)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

#### Transforming training data(X_train) into a 'document-term matrix'.

In [0]:
X_train_dtm1 = vect2.transform(X_train1)

#### Transforming test data(X_test) into a 'document-term matrix'.

In [0]:
X_test_dtm1 = vect2.transform(X_test1)

In [0]:
# transform to dictionary as Acceptable format of MultiLabelBinarizer
y_train_pass1 = [set(i[0].split(',')) for i in y_train1]
y_test_pass1 = [set(i[0].split(',')) for i in y_test1]

In [0]:
mlb = MultiLabelBinarizer()

In [281]:
mlb.fit(y_train_pass1)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [282]:
mlb.transform(y_train_pass1)

array([[0, 1, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [283]:
mlb.transform(y_test_pass1)

array([[1, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [1, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 1]])

In [0]:
y_trn_mlb1 = mlb.transform(y_train_pass1)

In [0]:
y_test_mlb1 =mlb.transform(y_test_pass1)

In [0]:
clf = LogisticRegression(solver = 'saga',n_jobs=-1,verbose=1)
clf = OneVsRestClassifier(clf,n_jobs=-1)

In [289]:
clf.fit(X_train_dtm1, y_trn_mlb1)                                                 # Fitting the classifier 

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto', n_jobs=-1,
                                                 penalty='l2',
                                                 random_state=None,
                                                 solver='saga', tol=0.0001,
                                                 verbose=1, warm_start=False),
                    n_jobs=-1)

In [0]:
pred_class1 = clf.predict (X_test_dtm1)                                         # Predicting on test data

In [0]:
from sklearn import metrics                                                     
metrics.accuracy_score(y_test_mlb1, pred_class1)

0.01042125348598268

In [0]:
print(metrics.classification_report(y_test_mlb1, pred_class1))

              precision    recall  f1-score   support

           0       0.33      0.03      0.06       383
           1       0.46      0.08      0.14       840
           2       0.35      0.07      0.11      1223
           3       0.44      0.11      0.17      2213
           4       0.45      0.12      0.19      2424
           5       0.33      0.04      0.07      2201
           6       0.35      0.05      0.09      2339
           7       0.36      0.04      0.08      2032
           8       0.27      0.03      0.05      1624
           9       0.23      0.02      0.04      1385
          10       0.15      0.01      0.02       576
          11       0.62      0.09      0.16       662
          12       0.30      0.03      0.05       525
          13       0.33      0.03      0.06       407
          14       0.39      0.05      0.09       270
          15       0.47      0.04      0.07       221
          16       0.00      0.00      0.00       169
          17       0.85    

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [0]:
print("F1: " , (f1_score(y_test_mlb1, pred_class, average='micro')))
print("F1_macro: " , (f1_score(y_test_mlb1, pred_class1, average='macro')))
print("Recall micro: " , recall_score(y_test_mlb1, pred_class1, average='micro'))
print("F1_micro: " , (f1_score(y_test_mlb1, pred_class1, average='micro')))
print("Recall macro: " , recall_score(y_test_mlb1, pred_class1, average='macro'))
print("Average Precision: " ,(average_precision_score(y_test_mlb1,pred_class1, average='micro')))
print("Accuracy:" , (accuracy_score(y_test_mlb1, pred_class1))) 

F1:  0.5051713435819717
F1_macro:  0.30538949765367895
Recall micro:  0.3916625
F1_micro:  0.5051713435819717
Recall macro:  0.2120484063104803
Average Precision:  0.3090148252939908
Accuracy: 0.1293


## Print true label and predicted label for any five examples

In [0]:
y_test_pred_inversed = mlb.inverse_transform(pred_class1)
y_test_inversed = mlb.inverse_transform(y_test_mlb1)
for i in range(15,20):
    print( 'True labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        
        ','.join(y_test_inversed[i]),
        ','.join(y_test_pred_inversed[i])
    ))

True labels:	36,Pisces,Technology,male
Predicted labels:	Pisces,male


True labels:	25,Cancer,Non-Profit,male
Predicted labels:	male


True labels:	35,Aries,Technology,male
Predicted labels:	male


True labels:	25,Gemini,indUnk,male
Predicted labels:	male


True labels:	17,Virgo,indUnk,male
Predicted labels:	17,Virgo,indUnk,male




# This is worked on small portion of dataset so accuracy is not great. Working on the model but it's taking long time to run so couldn't able to update.

****
****

### Trying the Logistic regression model with solver 'lbfgs' where it can handle L2 or no penalty on 60000 records among whole dataset

In [0]:
X_train_dtm[0:200000].shape

(200000, 18907795)

In [0]:
y_trn_mlb.shape

(476898, 80)

In [0]:
clf_1 = LogisticRegression(solver = 'lbfgs')
clf_1 = OneVsRestClassifier(clf_1)

In [0]:
clf_1.fit(X_train_dtm[0:60000], y_trn_mlb[0:60000])                                                 # Fitting the classifier 

In [0]:
y_pred_class2 = clf_1.predict (X_test_dtm)                                         # Predicting on test data

In [0]:
from sklearn import metrics                                                     
metrics.accuracy_score(y_test_mlb, y_pred_class2)

### Not able to run as model taking long time to run for large data set with solver = '`lbfgs`'