In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
# library for NLP tasks, 
from textblob import TextBlob
import nltk
nltk.download('wordnet')
from textblob import Word 
# bag of words and tf-idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
df = pd.read_csv(r'/content/drive/My Drive/AIML/Project_Blogs_Corpus/blogtext.csv')

In [4]:
df.shape

(681284, 7)

In [5]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [6]:
df.topic.value_counts()

indUnk                     251015
Student                    153903
Technology                  42055
Arts                        32449
Education                   29633
Communications-Media        20140
Internet                    16006
Non-Profit                  14700
Engineering                 11653
Law                          9040
Publishing                   7753
Science                      7269
Government                   6907
Consulting                   5862
Religion                     5235
Fashion                      4851
Marketing                    4769
Advertising                  4676
BusinessServices             4500
Banking                      4049
Chemicals                    3928
Telecommunications           3891
Accounting                   3832
Military                     3128
Museums-Libraries            3096
Sports-Recreation            3038
HumanResources               3010
RealEstate                   2870
Transportation               2326
Manufacturing 

In [0]:
##
# 1. Dataset is huge to process
# 2. Topic counts are inconsistent 

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
id        681284 non-null int64
gender    681284 non-null object
age       681284 non-null int64
topic     681284 non-null object
sign      681284 non-null object
date      681284 non-null object
text      681284 non-null object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [0]:
df.topic = df.topic.astype(str)
df.age = df.age.astype(str)

In [0]:
# To downsize the dataset, let's keep only few topics which are our target class

df_student = df[ (df.topic == 'Student') ] 
df_tech =   df[ (df.topic == 'Technology') ]
df_arts     = df[(df.topic == 'Arts')]       
df_non_profit = df[(df.topic == 'Non-Profit')]
df_law = df[df.topic == 'Law']
df_gov = df[(df.topic == 'Government')]
df_rel = df[(df.topic == 'Religion')]
df_fashion = df[(df.topic == 'Fashion')]

In [0]:
# use resample method from scikit-learn
from sklearn.utils import resample

df_student = resample(df_student, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

df_tech = resample(df_tech, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

df_arts = resample(df_arts, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

df_non_profit = resample(df_non_profit, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

df_law = resample(df_law, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

df_gov = resample(df_gov, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

df_rel = resample(df_rel, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

In [0]:
# Combine all the class with equal number of values
df = pd.concat([df_student, df_tech, df_arts, df_non_profit, df_law, df_gov, df_rel, df_fashion ])

In [13]:
# check for the class balance
df['topic'].value_counts()

Government    5000
Law           5000
Non-Profit    5000
Religion      5000
Student       5000
Technology    5000
Arts          5000
Fashion       4851
Name: topic, dtype: int64

In [14]:
df.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
72398,479019,male,24,Student,Gemini,"02,July,2004",Ive been watching a lot of movies...
132682,3622174,male,16,Student,Leo,"25,June,2004","Zuma, by Pop Cap games, is one o..."
82810,1478632,male,17,Student,Leo,"30,May,2003","[Step Up Day? WTF?] Yeah, the eighth ..."
543401,2973911,male,17,Student,Sagittarius,"01,September,2003","Ok, so i haven't posted for some ti..."
650855,891544,male,27,Student,Libra,"28,July,2004",hi gang... the hunt may have to be pos...


In [15]:
#  Lower casing  - change all the words to lower case to avoid duplication
df['text'] = df['text'].apply( lambda t : ' '.join( word.lower() for word in t.split()  ) )
df['text'].head(5)

72398     ive been watching a lot of movies lately. in t...
132682    zuma, by pop cap games, is one of the most add...
82810     [step up day? wtf?] yeah, the eighth graders c...
543401    ok, so i haven't posted for some time... sorry...
650855    hi gang... the hunt may have to be postponed. ...
Name: text, dtype: object

In [0]:
# Frequent word removal from the text; text which are not stopwords
# first we will take whole  data and split into words and then calculate their frequency
#  join words only with strings so, there needs to be some string to join other string
all_words = ' '.join( df['text'] ).split()
freqeuncy = pd.Series(all_words).value_counts()[:20]

In [17]:
freqeuncy

the     317793
i       238493
to      225248
and     200159
a       166972
of      146790
in      103145
that     95718
is       82828
my       76902
it       74151
for      69806
was      63073
you      62007
on       54903
have     49323
with     49220
but      47704
this     46675
be       44062
dtype: int64

In [18]:
# remove frequenct words
df['text'] = df['text'].apply( lambda t : ' '.join( word for word in t.split() 
                                                      if word not in freqeuncy) )
df['text'].head(10)

72398     ive been watching lot movies lately. month may...
132682    zuma, by pop cap games, one most addicting web...
82810     [step up day? wtf?] yeah, eighth graders come ...
543401    ok, so haven't posted some time... sorry `bout...
650855    hi gang... hunt may postponed. there possibili...
584378    wow. haven't been here few weeks. thanks comme...
229176    shots and... shots welll... today an interesti...
679363    when she loved me by sarah mclachlan when some...
317983    urllink so think found last can tab earth; fou...
541092    bam! whole nother week has gone by. 'nother' r...
Name: text, dtype: object

In [19]:
# Remove Rare words, which will not contribute to our model
all_words = ' '.join(df['text'] ).split()
rarely = pd.Series(all_words).value_counts()[-160000:]
rarely.sort_values

<bound method Series.sort_values of lolprincess:&nbsp;austin    1
entertaining:               1
concered...oy               1
room....so                  1
been;                       1
                           ..
pilipinas.'                 1
350z--note                  1
chemotherapy.               1
maharashtrian               1
'alright...'                1
Length: 160000, dtype: int64>

In [20]:
# remove rare words
# remove frequenct words
df['text'] = df['text'].apply( lambda t : ' '.join( word for word in t.split() 
                                                      if word not in rarely) )
df['text'].head(10)

72398     ive been watching lot movies lately. month may...
132682    zuma, by pop cap games, one most addicting web...
82810     up day? yeah, eighth graders come up school mo...
543401    ok, so haven't posted some time... sorry that....
650855    hi hunt may postponed. there possibility i'll ...
584378    wow. haven't been here few weeks. thanks comme...
229176    shots and... shots today an interesting day. n...
679363    when she loved me by sarah mclachlan when some...
317983    urllink so think found last can tab earth; fou...
541092    bam! whole nother week has gone by. real word?...
Name: text, dtype: object

In [21]:
#  Remove whitespaces
df['text'] = df['text'].str.strip()
df['text'].head(10)

72398     ive been watching lot movies lately. month may...
132682    zuma, by pop cap games, one most addicting web...
82810     up day? yeah, eighth graders come up school mo...
543401    ok, so haven't posted some time... sorry that....
650855    hi hunt may postponed. there possibility i'll ...
584378    wow. haven't been here few weeks. thanks comme...
229176    shots and... shots today an interesting day. n...
679363    when she loved me by sarah mclachlan when some...
317983    urllink so think found last can tab earth; fou...
541092    bam! whole nother week has gone by. real word?...
Name: text, dtype: object

In [22]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Remove stopwords

from nltk.corpus import stopwords
stop = stopwords.words('english')

df['text'] = df['text'].apply( lambda t : " ".join( word for word in t.split() 
                                                       if word not in stop ) )

In [24]:
# Remove unwanted characters
# the [^\w\s] means remove everything, keep only words(w) and spaces(s)
df['text'] = df['text'].str.replace( '[^\w\s]' , '' )
df['text'].head(10)

72398     ive watching lot movies lately month may ive w...
132682    zuma pop cap games one addicting web games eve...
82810     day yeah eighth graders come school monday woo...
543401    ok posted time sorry that anyways ayan nagpara...
650855    hi hunt may postponed possibility ill going to...
584378    wow weeks thanks comments guys  apologize lack...
229176    shots and shots today interesting day ordinary...
679363    loved sarah mclachlan somebody loved everythin...
317983    urllink think found last tab earth found old b...
541092    bam whole nother week gone by real word anyway...
Name: text, dtype: object

In [25]:
# Remove Numberic
import re

def remove_num(t):
    removed_num_text = re.sub(r'\d+', '', t)
    return removed_num_text

df['text'] = df['text'].apply( lambda t : remove_num(t) )
df['text'].head(10)

72398     ive watching lot movies lately month may ive w...
132682    zuma pop cap games one addicting web games eve...
82810     day yeah eighth graders come school monday woo...
543401    ok posted time sorry that anyways ayan nagpara...
650855    hi hunt may postponed possibility ill going to...
584378    wow weeks thanks comments guys  apologize lack...
229176    shots and shots today interesting day ordinary...
679363    loved sarah mclachlan somebody loved everythin...
317983    urllink think found last tab earth found old b...
541092    bam whole nother week gone by real word anyway...
Name: text, dtype: object

In [26]:
#  Lemmatization ; its preferred over stemming because if finds the root word
df['text'] = df['text'].apply( lambda t : " ".join( [Word(word).lemmatize() for word in t.split()  ]) )
df['text'].head(5)

72398     ive watching lot movie lately month may ive wa...
132682    zuma pop cap game one addicting web game ever ...
82810     day yeah eighth grader come school monday wooh...
543401    ok posted time sorry that anyways ayan nagpara...
650855    hi hunt may postponed possibility ill going to...
Name: text, dtype: object

In [0]:
import numpy as np
labels_array = df[['gender', 'age', 'topic', 'sign']].to_numpy()

In [28]:
labels_array

array([['male', '24', 'Student', 'Gemini'],
       ['male', '16', 'Student', 'Leo'],
       ['male', '17', 'Student', 'Leo'],
       ...,
       ['female', '23', 'Fashion', 'Libra'],
       ['female', '23', 'Fashion', 'Libra'],
       ['female', '23', 'Fashion', 'Libra']], dtype=object)

In [0]:
df['labels'] = labels_array.tolist()

In [30]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,labels
72398,479019,male,24,Student,Gemini,"02,July,2004",ive watching lot movie lately month may ive wa...,"[male, 24, Student, Gemini]"
132682,3622174,male,16,Student,Leo,"25,June,2004",zuma pop cap game one addicting web game ever ...,"[male, 16, Student, Leo]"
82810,1478632,male,17,Student,Leo,"30,May,2003",day yeah eighth grader come school monday wooh...,"[male, 17, Student, Leo]"
543401,2973911,male,17,Student,Sagittarius,"01,September,2003",ok posted time sorry that anyways ayan nagpara...,"[male, 17, Student, Sagittarius]"
650855,891544,male,27,Student,Libra,"28,July,2004",hi hunt may postponed possibility ill going to...,"[male, 27, Student, Libra]"


In [0]:
#Merge the columns
#df['labels'] = df[['gender', 'age', 'topic', 'sign']].apply(lambda x: ','.join(x), axis=1)

In [0]:
df.drop(['id', 'gender', 'age', 'topic', 'sign', 'date'], axis = 1,  inplace = True)

In [32]:
df.head()

Unnamed: 0,text,labels
72398,ive watching lot movie lately month may ive wa...,"[male, 24, Student, Gemini]"
132682,zuma pop cap game one addicting web game ever ...,"[male, 16, Student, Leo]"
82810,day yeah eighth grader come school monday wooh...,"[male, 17, Student, Leo]"
543401,ok posted time sorry that anyways ayan nagpara...,"[male, 17, Student, Sagittarius]"
650855,hi hunt may postponed possibility ill going to...,"[male, 27, Student, Libra]"


In [0]:
from sklearn.model_selection import train_test_split

In [0]:
y = df.labels
X = df.drop('labels', axis=1)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)


In [36]:
print("\nX_train:\n")
print(X_train.head())
print(X_train.shape)

print("\nX_test:\n")
print(X_test.head())
print(X_test.shape)

print("\ny_train:\n")
print(y_train.head())
print(y_train.shape)

print("\ny_test:\n")
print(y_test.head())
print(y_test.shape)


X_train:

                                                     text
200918  good job morningi couldve totally blown deanes...
503483  taking thing theyre worth were trapped togethe...
68803   ive meaning while but like everything life rea...
316294  anyone else ever watched urllink philadelphia ...
648966  three day nv cum many present yen bought pigqi...
(31880, 1)

X_test:

                                                     text
7762    im alive getting as kicked workout daily basis...
18743   urllink kerry campaign blast cheney criticism ...
414997  difference nbsp were different heart hand smil...
383379  slowly breaking day light morning got ready sc...
653784  urllink excellent people urllink white rose se...
(7971, 1)

y_train:

200918    [female, 24, Non-Profit, Gemini]
503483          [male, 16, Student, Libra]
68803      [female, 45, Technology, Virgo]
316294         [male, 24, Government, Leo]
648966          [female, 14, Student, Leo]
Name: labels, dtype: object
(31880,)

In [0]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1, 2))

In [0]:
train_text_list = X_train["text"].tolist()

In [0]:
test_text_list = X_test["text"].tolist()

In [40]:
# learn the 'vocabulary' of the training data
vect.fit(train_text_list)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [41]:
# transform training data into a 'document-term matrix'
train_dtm = vect.transform(train_text_list)
train_dtm

<31880x1674170 sparse matrix of type '<class 'numpy.int64'>'
	with 5476770 stored elements in Compressed Sparse Row format>

In [0]:
# convert sparse matrix to a dense matrix
#train_dtm.toarray()

#### RAM Crashed ####

In [0]:
# examine the vocabulary and document-term matrix together
#pd.DataFrame(train_dtm.toarray(), columns=vect.get_feature_names())

### RAM Crashed ####

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn import metrics

from sklearn.linear_model import LogisticRegression

In [45]:
lb = preprocessing.MultiLabelBinarizer()
Y = lb.fit_transform(y_train)
Y_test = lb.fit_transform(y_test)

classifier = Pipeline([
('vectorizer', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', OneVsRestClassifier(LogisticRegression(solver = 'lbfgs')))])

classifier.fit(train_text_list, Y)
predicted = classifier.predict(test_text_list)

metrics.accuracy_score(Y_test, predicted)



Accuracy Score:  0.03324551499184544


In [56]:
# print the confusion matrix
metrics.multilabel_confusion_matrix(Y_test, predicted)

array([[[7924,    0],
        [  47,    0]],

       [[7813,    0],
        [ 158,    0]],

       [[7632,    5],
        [ 322,   12]],

       [[7389,    4],
        [ 541,   37]],

       [[6998,   25],
        [ 805,  143]],

       [[7323,    0],
        [ 647,    1]],

       [[6967,    0],
        [ 968,   36]],

       [[7278,    1],
        [ 688,    4]],

       [[6950,    9],
        [ 924,   88]],

       [[7250,    3],
        [ 701,   17]],

       [[7786,    0],
        [ 185,    0]],

       [[7851,    0],
        [ 116,    4]],

       [[7672,    0],
        [ 296,    3]],

       [[7353,    3],
        [ 430,  185]],

       [[7809,    0],
        [ 159,    3]],

       [[7930,    0],
        [  41,    0]],

       [[7895,    0],
        [  76,    0]],

       [[7931,    0],
        [  37,    3]],

       [[7923,    0],
        [  48,    0]],

       [[7942,    0],
        [  29,    0]],

       [[7905,    0],
        [  66,    0]],

       [[7965,    0],
        [   

In [59]:
# print the f1 score
metrics.f1_score(Y_test, predicted, average=None)

  'precision', 'predicted', average, warn_for)


array([0.        , 0.        , 0.06837607, 0.11954766, 0.2562724 ,
       0.00308166, 0.06923077, 0.01147776, 0.15870153, 0.04607046,
       0.        , 0.06451613, 0.01986755, 0.4607721 , 0.03636364,
       0.        , 0.        , 0.13953488, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.04761905,
       0.        , 0.09230769, 0.21831637, 0.03198495, 0.08739076,
       0.02298851, 0.38436482, 0.11258278, 0.15354331, 0.19672131,
       0.00452489, 0.00568182, 0.06185567, 0.23361823, 0.41121495,
       0.        , 0.01838235, 0.10820896, 0.14073072, 0.07590133,
       0.04329004, 0.63112192, 0.79143755])

In [54]:
# print precision and recall
metrics.precision_recall_fscore_support(Y_test, predicted, average='weighted')

  'precision', 'predicted', average, warn_for)


(0.8093672933540468, 0.23905407100740184, 0.2769718444315596, None)