In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [72]:
from zipfile import ZipFile
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [39]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

####Reading Zip file & Storing into Panda

In [3]:
zip_file = ZipFile('/content/drive/MyDrive/AI_ML_CLass/Data/blog-authorship-corpus/blogtext.csv.zip')

In [4]:
zip_file.infolist()

[<ZipInfo filename='blogtext.csv' compress_type=deflate file_size=800419647 compress_size=303646582>]

In [5]:
auth_blog_pd =  pd.read_csv(zip_file.open("blogtext.csv"))

#### Looking at data insight :

In [6]:
# shape
auth_blog_pd.shape

(681284, 7)

In [7]:
auth_blog_pd.head(10)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
5,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",I had an interesting conversation...
6,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Somehow Coca-Cola has a way of su...
7,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004","If anything, Korea is a country o..."
8,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Take a read of this news article ...
9,3581210,male,33,InvestmentBanking,Aquarius,"09,June,2004",I surf the English news sites a l...


#### Preparing Small Set Data from Large corpus :

In [47]:
#dataset is hugh taking 20000 record 
# select 20000 rows randomly 
auth_blog_new = auth_blog_pd.sample(n=20000)

In [9]:
auth_blog_new.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
540307,3581641,male,14,Student,Pisces,"08,June,2004",urlLink I found this little bir...
70073,1826722,male,35,Military,Capricorn,"21,July,2004","Saw a lawyer today, and&nbsp;I..."
629318,1697956,male,23,Chemicals,Libra,"18,August,2003",Greetings from Wellington! Thats a...
338153,3313546,female,23,Student,Cancer,"31,July,2004","Well, I have been there done that a few..."
241592,529513,male,33,Internet,Taurus,"13,November,2002",A hen is only an eggs way of makin...


In [48]:
#drop colum id and date
auth_blog_new.drop(labels=['id','date'],inplace=True,axis=1)

In [11]:
auth_blog_new.head(2)

Unnamed: 0,gender,age,topic,sign,text
540307,male,14,Student,Pisces,urlLink I found this little bir...
70073,male,35,Military,Capricorn,"Saw a lawyer today, and&nbsp;I..."


In [49]:
auth_blog_new.reset_index()

Unnamed: 0,index,gender,age,topic,sign,text
0,338750,female,23,indUnk,Scorpio,alone. sad. jaded. three words i ...
1,15482,female,14,indUnk,Aries,i dont no y i put that as the title i j...
2,424170,male,25,Technology,Aries,urlLink Pretty garden!
3,325140,male,27,Student,Aries,How simple to prepare a japan meal? wel...
4,307089,female,33,Communications-Media,Scorpio,1. You're walking down a path. What do...
...,...,...,...,...,...,...
19995,342773,female,23,Arts,Scorpio,sorry that my posts are so short bu...
19996,233380,female,26,Student,Scorpio,Hi! I'm in!!! I hope Jhayne gets to blo...
19997,540526,male,25,Telecommunications,Libra,The Mets season opener was today an...
19998,132188,female,23,indUnk,Libra,Where the hell did urlLink this festiv...


In [13]:
auth_blog_new.shape

(20000, 5)

#### EDA of data:

In [50]:
auth_blog_new.isnull().sum()

gender    0
age       0
topic     0
sign      0
text      0
dtype: int64

In [51]:
auth_blog_new.groupby('topic').text.count()

topic
Accounting                  118
Advertising                 125
Agriculture                  38
Architecture                 49
Arts                       1004
Automotive                   38
Banking                     114
Biotech                      59
BusinessServices            122
Chemicals                    99
Communications-Media        592
Construction                 36
Consulting                  170
Education                   896
Engineering                 352
Environment                  21
Fashion                     139
Government                  211
HumanResources               81
Internet                    438
InvestmentBanking            29
Law                         273
LawEnforcement-Security      52
Manufacturing                81
Maritime                     10
Marketing                   164
Military                     75
Museums-Libraries            89
Non-Profit                  416
Publishing                  238
RealEstate                   88
Re

In [52]:
#define age group
# 13-17 group defined as 10
# 23-27 group defined as 20
# 33-47 group defined as 30
ageGroup=[]
for x in auth_blog_new.age :
  if(x>=13 and x<=17) :
    ageGroup.append(10)
  elif(x>=23 and x<=27) :
    ageGroup.append(20)
  elif(x>=33 and x<=48) :
    ageGroup.append(30)

In [53]:
# adding with new column
auth_blog_new['ageGroup']=ageGroup

In [54]:
# age group distribution
auth_blog_new.ageGroup.value_counts()

20    9559
10    6834
30    3607
Name: ageGroup, dtype: int64

In [55]:
#gender distribution
auth_blog_new.gender.value_counts()

male      10042
female     9958
Name: gender, dtype: int64

In [56]:
#zodic distribution
auth_blog_new.sign.value_counts()

Cancer         1947
Taurus         1869
Libra          1859
Aries          1853
Virgo          1774
Scorpio        1711
Leo            1597
Pisces         1533
Capricorn      1504
Gemini         1489
Sagittarius    1436
Aquarius       1428
Name: sign, dtype: int64

#### Preprocessing Text

In [57]:
import re

In [58]:
## cleaning text & removing unwanted text
auth_blog_new['text']=auth_blog_new.text.apply(lambda x: re.sub(r'[^A-Za-z]+',' ',x))
auth_blog_new['text']=auth_blog_new.text.apply(lambda x:x.lower())
auth_blog_new['text']=auth_blog_new.text.apply(lambda x : x.strip())

In [60]:
## removing stop word from text
stop_words = set(stopwords.words('english'))
auth_blog_new['text']=auth_blog_new.text.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [61]:
# stemming text
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
auth_blog_new['text']=auth_blog_new.text.apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

In [62]:
auth_blog_new.text.head(5)

338750    alon sad jade three word dread though appli to...
15482     dont put titl couldnt think anyth mate r read ...
424170                                urllink pretti garden
325140    simpl prepar japan meal well u see miso soup i...
307089    walk path see around deep dark forest tree thi...
Name: text, dtype: object

#### preparing train and test data

In [63]:
# merging all predictive parameter into one column
auth_blog_new['labels']=auth_blog_new.apply(lambda col: [col['gender'],str(col['ageGroup']),col['topic'],col['sign']], axis=1)

In [64]:
# deleting all other column 
auth_blog_new.drop(['gender','age','ageGroup','topic','sign'],axis=1,inplace=True)

In [65]:
auth_blog_new.head(2)

Unnamed: 0,text,labels
338750,alon sad jade three word dread though appli to...,"[female, 20, indUnk, Scorpio]"
15482,dont put titl couldnt think anyth mate r read ...,"[female, 10, indUnk, Aries]"


In [69]:
# coverting label into numeric 
multiBinry=MultiLabelBinarizer()

In [70]:
multiLabels=multiBinry.fit_transform(auth_blog_new.labels)

In [71]:
feautres=auth_blog_new.text

In [74]:
X_train,X_test,Y_train,Y_test=train_test_split(feautres,multiLabels,shuffle=True,train_size=0.75,random_state=22)

#### Define function for Vectorization

In [43]:
def create_vector(df,vecName) :
  # tf = Convert a collection of raw documents to a matrix of TF-IDF features.
  # cv = Convert a collection of text documents to a matrix of token counts
  if 'cv'== vecName :
    vt = CountVectorizer()
  elif 'tf' == vecName :
    vt = TfidfVectorizer()
  return vt

##### 1. using TfidfVectorizer

In [76]:
tf = TfidfVectorizer()

In [77]:
X_train_tf=tf.fit_transform(X_train)
X_test_tf=tf.transform(X_test)

In [79]:
print(X_train_tf)

  (0, 54928)	0.0836543690058465
  (0, 50723)	0.11894114720873826
  (0, 28086)	0.05001247434050537
  (0, 14046)	0.175077772757996
  (0, 30867)	0.18577087850745333
  (0, 36353)	0.07372486500710254
  (0, 35221)	0.12889272156467865
  (0, 27214)	0.12066563653868026
  (0, 5419)	0.05021477164321032
  (0, 50011)	0.043912116843759987
  (0, 22271)	0.053305598534177955
  (0, 17893)	0.06710335471394352
  (0, 49965)	0.08191224598243428
  (0, 47394)	0.12260621889015244
  (0, 8777)	0.12889272156467865
  (0, 16193)	0.05995735070126725
  (0, 19840)	0.09366472001662862
  (0, 54927)	0.057291954951239746
  (0, 19477)	0.07381346195282561
  (0, 18762)	0.03166759349977437
  (0, 17455)	0.07333232699477149
  (0, 27648)	0.07892401419816315
  (0, 39828)	0.048097246734360555
  (0, 15599)	0.05439644234722354
  (0, 10365)	0.08315950497830807
  :	:
  (14999, 36579)	0.03699707142140124
  (14999, 53072)	0.047439341581653145
  (14999, 53298)	0.05653446759789411
  (14999, 11468)	0.03372718800143621
  (14999, 13552)	0.14

In [82]:
tf.get_feature_names()[:5]

['aa', 'aaa', 'aaaa', 'aaaaa', 'aaaaaa']

## Build with model

In [98]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score,f1_score,recall_score
from sklearn.ensemble import RandomForestClassifier

In [99]:
mnb = MultinomialNB()
lr = LogisticRegression()
rn = RandomForestClassifier(n_estimators=100)

In [96]:
def print_score(y_pred,y_actual,clf):
    print("Clf: ", clf.__class__.__name__)
    print('accuracy score: ', accuracy_score(y_actual, y_pred))
    print('F1 score: ', f1_score(y_actual, y_pred,average='micro'))
    print('Average recall score: ', recall_score(y_actual, y_pred,average='micro'))
    print("---")

In [None]:
for classifier in [lr, mnb,rn] :
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train_tf, Y_train)
    y_pred = clf.predict(X_test_tf)
    #Y_pred_inversed = multibinry.inverse_transform(y_pred)
    #y_test_inversed = multibinry.inverse_transform(Y_test)
    print_score(y_pred,Y_test,classifier)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Clf:  LogisticRegression
F1 score:  0.0012
F1 score:  0.40709855272226053
Average recall score:  0.29535
---
Clf:  MultinomialNB
F1 score:  0.0004
F1 score:  0.33845697988814405
Average recall score:  0.22845
---
