# Loading Libraries

In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Looading the data set 

In [161]:
corpus = pd.read_csv('blogtext.csv')

### Checking the size and shape of the dataset

In [162]:
corpus.shape

(681284, 7)

In [163]:
corpus.sample(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
225655,841857,female,17,indUnk,Virgo,"11,March,2004","oh biddles, what a weird day. all i can..."
245155,4138869,male,17,Technology,Cancer,"04,August,2004","I'm the only one left , the capta..."
291029,3580741,male,25,Internet,Libra,"11,June,2004",cloudy friday. tonight i have to p...
31249,3581446,female,17,Student,Pisces,"08,July,2004",Theo : How are you today? Seule :...
345888,2283987,female,23,Communications-Media,Capricorn,"03,May,2004","On Monday, Curtis Leskanic is sent home..."


# Reducing the size of the dataset for the purpose of this project

In [164]:
corpus = corpus.sample(3000)

In [165]:
corpus.shape

(3000, 7)

In [166]:
corpus.topic.value_counts()

indUnk                     1134
Student                     670
Technology                  179
Arts                        142
Education                   133
Communications-Media         87
Internet                     76
Non-Profit                   66
Engineering                  40
Law                          38
Government                   37
Publishing                   34
Science                      31
BusinessServices             24
Religion                     24
Marketing                    21
Museums-Libraries            21
Accounting                   20
Banking                      18
Transportation               16
Chemicals                    16
Sports-Recreation            16
Telecommunications           15
Consulting                   15
Military                     13
Biotech                      12
Fashion                      12
RealEstate                   12
Advertising                  11
Automotive                   10
HumanResources               10
Architec

# Proprocessing text

### 1. To Lowercase

In [167]:
#Lower casing
corpus['text'] = corpus['text'].apply( lambda t : ' '.join( word.lower() for word in t.split()  ) )
corpus['text'].head(5)

444652    a girly-girl. you dont have a lot of self-este...
419368    on the first day god created the dog. god said...
453355                              hi i'm sick! thats all!
617155    the concert,&nbsp;i rode to.&nbsp; lately, tho...
53233     right. for those of you who are wondering, i s...
Name: text, dtype: object

### 2. Removing Punctuations

In [168]:
## 2. Remove punctuations
corpus['text'] = corpus['text'].str.replace( '[^\w\s]' , '' )
corpus['text'].head(10)

444652    a girlygirl you dont have a lot of selfesteem ...
419368    on the first day god created the dog god said ...
453355                                 hi im sick thats all
617155    the concertnbspi rode tonbsp lately though ive...
53233     right for those of you who are wondering i sti...
475477    hideeho bloggeroonies i same to be borrowing a...
356960    so i just got back from the gym today was a sp...
210304    sometimes a tv show comes along that is so ref...
20748     anything new i have nothing to say today is go...
93207     if the page looks messed up thats cause it is ...
Name: text, dtype: object

### 3. Removing Stop Words (English)

In [169]:

stop = stopwords.words('english')

corpus['text'] = corpus['text'].apply( lambda t : " ".join( word for word in t.split() 
                                                       if word not in stop ) )

In [170]:
corpus['text'].head(10)

444652    girlygirl dont lot selfesteem people always br...
419368    first day god created dog god said sit day doo...
453355                                     hi im sick thats
617155    concertnbspi rode tonbsp lately though ive fix...
53233     right wondering still havent heard mit yet rep...
475477    hideeho bloggeroonies borrowing page ned fland...
356960    got back gym today specially long inning mark ...
210304    sometimes tv show comes along refreshingly dif...
20748     anything new nothing say today going anyother ...
93207     page looks messed thats cause messed messed bi...
Name: text, dtype: object

### 4. Removing frequently appearing words. Here top 40 frequent words are removed

In [171]:
# 4. Frequent word removal from the text

words = ' '.join(corpus['text'] ).split()
freqeuncy = pd.Series(words).value_counts()[:40]

corpus['text'] = corpus['text'].apply( lambda t : ' '.join( word for word in t.split() 
                                                      if word not in freqeuncy) )
corpus['text'].head(10)

444652    girlygirl lot selfesteem always bringing sad a...
419368    first god created dog god sit door house bark ...
453355                                        hi sick thats
617155    concertnbspi rode tonbsp lately though fixated...
53233     wondering havent heard mit yet repeat familiar...
475477    hideeho bloggeroonies borrowing page ned fland...
356960    gym specially long inning mark week healthy li...
210304    sometimes tv show comes along refreshingly dif...
20748     anything nothing anyother boring challenging t...
93207     page looks messed thats cause messed messed bi...
Name: text, dtype: object

### 5. Removing rarely used words. Here bottom 1800 frequenr words are removed

In [172]:
# 5. Remove Rare words
words = ' '.join(corpus['text'] ).split()
rare_words = pd.Series(words).value_counts()[-1800:]
rare_words.sort_values

<bound method Series.sort_values of sportscenter     1
lotoops          1
wedsun           1
399              1
kibblesinbits    1
                ..
tadi             1
usreeeeli        1
adin             1
charred          1
unduly           1
Length: 1800, dtype: int64>

In [173]:
corpus['text'] = corpus['text'].apply( lambda t : ' '.join( word for word in t.split() 
                                                      if word not in rare_words) )
corpus['text'].head(10)

444652    girlygirl lot selfesteem always bringing sad a...
419368    first god created dog god sit door house bark ...
453355                                        hi sick thats
617155    concertnbspi rode tonbsp lately though fixated...
53233     wondering havent heard mit yet repeat familiar...
475477    hideeho bloggeroonies borrowing page ned fland...
356960    gym specially long inning mark week healthy li...
210304    sometimes tv show comes along refreshingly dif...
20748     anything nothing anyother boring challenging t...
93207     page looks messed thats cause messed messed bi...
Name: text, dtype: object

### 6. Removing white spaces

In [174]:
# 6. Remove extra spaces
corpus['text'] = corpus['text'].str.strip()
corpus['text'].head(10)

444652    girlygirl lot selfesteem always bringing sad a...
419368    first god created dog god sit door house bark ...
453355                                        hi sick thats
617155    concertnbspi rode tonbsp lately though fixated...
53233     wondering havent heard mit yet repeat familiar...
475477    hideeho bloggeroonies borrowing page ned fland...
356960    gym specially long inning mark week healthy li...
210304    sometimes tv show comes along refreshingly dif...
20748     anything nothing anyother boring challenging t...
93207     page looks messed thats cause messed messed bi...
Name: text, dtype: object

In [175]:
corpus.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
444652,3894887,female,15,indUnk,Gemini,"11,July,2004",girlygirl lot selfesteem always bringing sad a...
419368,206770,male,24,Engineering,Scorpio,"05,July,2004",first god created dog god sit door house bark ...
453355,1999563,female,14,Student,Virgo,"02,July,2004",hi sick thats
617155,3917802,male,25,indUnk,Aries,"20,July,2004",concertnbspi rode tonbsp lately though fixated...
53233,2040049,female,17,indUnk,Scorpio,"14,December,2003",wondering havent heard mit yet repeat familiar...


### Checking the summary of the document

In [176]:

corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 444652 to 599784
Data columns (total 7 columns):
id        3000 non-null int64
gender    3000 non-null object
age       3000 non-null int64
topic     3000 non-null object
sign      3000 non-null object
date      3000 non-null object
text      3000 non-null object
dtypes: int64(2), object(5)
memory usage: 187.5+ KB


In [177]:
sub_corpus = corpus.drop(['id','text','date'], axis = 1)


# Merging labels as mentioned in teh quetion for multiclass classification

In [178]:
corpus.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
444652,3894887,female,15,indUnk,Gemini,"11,July,2004",girlygirl lot selfesteem always bringing sad a...
419368,206770,male,24,Engineering,Scorpio,"05,July,2004",first god created dog god sit door house bark ...
453355,1999563,female,14,Student,Virgo,"02,July,2004",hi sick thats
617155,3917802,male,25,indUnk,Aries,"20,July,2004",concertnbspi rode tonbsp lately though fixated...
53233,2040049,female,17,indUnk,Scorpio,"14,December,2003",wondering havent heard mit yet repeat familiar...


In [179]:
corpus['labels']=sub_corpus.values.tolist()

In [180]:
corpus.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,labels
444652,3894887,female,15,indUnk,Gemini,"11,July,2004",girlygirl lot selfesteem always bringing sad a...,"[female, 15, indUnk, Gemini]"
419368,206770,male,24,Engineering,Scorpio,"05,July,2004",first god created dog god sit door house bark ...,"[male, 24, Engineering, Scorpio]"
453355,1999563,female,14,Student,Virgo,"02,July,2004",hi sick thats,"[female, 14, Student, Virgo]"
617155,3917802,male,25,indUnk,Aries,"20,July,2004",concertnbspi rode tonbsp lately though fixated...,"[male, 25, indUnk, Aries]"
53233,2040049,female,17,indUnk,Scorpio,"14,December,2003",wondering havent heard mit yet repeat familiar...,"[female, 17, indUnk, Scorpio]"


In [181]:
sub_corpus.head()

Unnamed: 0,gender,age,topic,sign
444652,female,15,indUnk,Gemini
419368,male,24,Engineering,Scorpio
453355,female,14,Student,Virgo
617155,male,25,indUnk,Aries
53233,female,17,indUnk,Scorpio


In [182]:
corpus.drop(['id','gender','topic','sign','date'], axis = 1, inplace = True)

In [183]:
corpus.head()

Unnamed: 0,age,text,labels
444652,15,girlygirl lot selfesteem always bringing sad a...,"[female, 15, indUnk, Gemini]"
419368,24,first god created dog god sit door house bark ...,"[male, 24, Engineering, Scorpio]"
453355,14,hi sick thats,"[female, 14, Student, Virgo]"
617155,25,concertnbspi rode tonbsp lately though fixated...,"[male, 25, indUnk, Aries]"
53233,17,wondering havent heard mit yet repeat familiar...,"[female, 17, indUnk, Scorpio]"


In [184]:
corpus.drop('age', axis = 1, inplace = True)

In [185]:
corpus.head()

Unnamed: 0,text,labels
444652,girlygirl lot selfesteem always bringing sad a...,"[female, 15, indUnk, Gemini]"
419368,first god created dog god sit door house bark ...,"[male, 24, Engineering, Scorpio]"
453355,hi sick thats,"[female, 14, Student, Virgo]"
617155,concertnbspi rode tonbsp lately though fixated...,"[male, 25, indUnk, Aries]"
53233,wondering havent heard mit yet repeat familiar...,"[female, 17, indUnk, Scorpio]"


In [186]:
np.unique(corpus.labels).shape

(1212,)

# Separetating to features and labels 

In [187]:
features = corpus['text']
labels = corpus['labels']

# Separating into training and test datasets

In [188]:

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25
                                                   )

In [189]:
X_train.shape

(2250,)

In [190]:
X_test.shape

(750,)

# Creating a bag of words using Count Vectorizer

### ngram_range is used as (1,2)

In [191]:
vect = CountVectorizer(ngram_range=(1, 2))

### Instantiating the vectorizer

In [None]:
vect.fit(features)

### Vectorizing the train and test datasets 

In [193]:
X_train_dtm = vect.transform(X_train)
X_train_dtm


<2250x289085 sparse matrix of type '<class 'numpy.int64'>'
	with 366273 stored elements in Compressed Sparse Row format>

In [194]:
X_test_dtm = vect.transform(X_test)
X_test_dtm

<750x289085 sparse matrix of type '<class 'numpy.int64'>'
	with 126579 stored elements in Compressed Sparse Row format>

### Document Term Matrices

In [195]:
X_train_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [196]:
X_test_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Creating a dictionary to show count of each class

In [197]:
sub_corpus.head()

Unnamed: 0,gender,age,topic,sign
444652,female,15,indUnk,Gemini
419368,male,24,Engineering,Scorpio
453355,female,14,Student,Virgo
617155,male,25,indUnk,Aries
53233,female,17,indUnk,Scorpio


In [198]:
dummy = sub_corpus.age.value_counts()




In [199]:
sample_age = dict(zip(dummy.index, dummy.values))

In [200]:
sample_age

{24: 355,
 17: 339,
 16: 312,
 23: 312,
 25: 277,
 26: 253,
 27: 209,
 15: 188,
 14: 123,
 34: 105,
 35: 77,
 33: 77,
 13: 73,
 36: 53,
 37: 42,
 38: 41,
 45: 31,
 39: 24,
 40: 20,
 43: 18,
 42: 15,
 46: 14,
 48: 12,
 44: 12,
 41: 11,
 47: 7}

In [201]:
dummy = sub_corpus.gender.value_counts()
sample_gender = dict(zip(dummy.index, dummy.values))
sample_gender

{'female': 1500, 'male': 1500}

In [202]:
dummy = sub_corpus.topic.value_counts()
sample_topic = dict(zip(dummy.index, dummy.values))
sample_topic

{'indUnk': 1134,
 'Student': 670,
 'Technology': 179,
 'Arts': 142,
 'Education': 133,
 'Communications-Media': 87,
 'Internet': 76,
 'Non-Profit': 66,
 'Engineering': 40,
 'Law': 38,
 'Government': 37,
 'Publishing': 34,
 'Science': 31,
 'BusinessServices': 24,
 'Religion': 24,
 'Marketing': 21,
 'Museums-Libraries': 21,
 'Accounting': 20,
 'Banking': 18,
 'Transportation': 16,
 'Chemicals': 16,
 'Sports-Recreation': 16,
 'Telecommunications': 15,
 'Consulting': 15,
 'Military': 13,
 'Biotech': 12,
 'Fashion': 12,
 'RealEstate': 12,
 'Advertising': 11,
 'Automotive': 10,
 'HumanResources': 10,
 'Architecture': 9,
 'Tourism': 8,
 'LawEnforcement-Security': 8,
 'InvestmentBanking': 5,
 'Agriculture': 5,
 'Construction': 5,
 'Manufacturing': 4,
 'Maritime': 2,
 'Environment': 1}

In [203]:
dummy = sub_corpus.sign.value_counts()
sample_sign = dict(zip(dummy.index, dummy.values))
sample_sign

{'Taurus': 295,
 'Aries': 291,
 'Cancer': 274,
 'Libra': 264,
 'Virgo': 261,
 'Leo': 255,
 'Scorpio': 251,
 'Pisces': 230,
 'Aquarius': 228,
 'Gemini': 221,
 'Capricorn': 215,
 'Sagittarius': 215}

# Below is the final dictionary wth class frequencies

In [204]:
sample_final = dict(sample_gender)
sample_final.update(sample_age)
sample_final.update(sample_sign)
sample_final.update(sample_topic)
sample_final

{'female': 1500,
 'male': 1500,
 24: 355,
 17: 339,
 16: 312,
 23: 312,
 25: 277,
 26: 253,
 27: 209,
 15: 188,
 14: 123,
 34: 105,
 35: 77,
 33: 77,
 13: 73,
 36: 53,
 37: 42,
 38: 41,
 45: 31,
 39: 24,
 40: 20,
 43: 18,
 42: 15,
 46: 14,
 48: 12,
 44: 12,
 41: 11,
 47: 7,
 'Taurus': 295,
 'Aries': 291,
 'Cancer': 274,
 'Libra': 264,
 'Virgo': 261,
 'Leo': 255,
 'Scorpio': 251,
 'Pisces': 230,
 'Aquarius': 228,
 'Gemini': 221,
 'Capricorn': 215,
 'Sagittarius': 215,
 'indUnk': 1134,
 'Student': 670,
 'Technology': 179,
 'Arts': 142,
 'Education': 133,
 'Communications-Media': 87,
 'Internet': 76,
 'Non-Profit': 66,
 'Engineering': 40,
 'Law': 38,
 'Government': 37,
 'Publishing': 34,
 'Science': 31,
 'BusinessServices': 24,
 'Religion': 24,
 'Marketing': 21,
 'Museums-Libraries': 21,
 'Accounting': 20,
 'Banking': 18,
 'Transportation': 16,
 'Chemicals': 16,
 'Sports-Recreation': 16,
 'Telecommunications': 15,
 'Consulting': 15,
 'Military': 13,
 'Biotech': 12,
 'Fashion': 12,
 'RealE

# Using Multilabel binarizer we are bnow converting the classes to binary form 

In [205]:
mlb = MultiLabelBinarizer()

In [206]:
y_train.shape

(2250,)

In [207]:
y_train = list(y_train)
y_test  =list(y_test)
y_train[0]

['male', 23, 'Arts', 'Sagittarius']

In [208]:
for i in np.arange(0,2250):
    y_train[i] = tuple(y_train[i])
     



In [209]:
y_train = [[str(j) for j in i] for i in y_train]

In [210]:
y_test = [[str(j) for j in i] for i in y_test]

In [211]:
y_train_transformed = mlb.fit_transform(y_train)
y_test_transformed = mlb.transform(y_test)
y_train_transformed

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 1, 1]])

In [212]:
y_train_transformed.shape

(2250, 80)

In [213]:
y_test_transformed.shape

(750, 80)

In [214]:
X_test_dtm.shape

(750, 289085)

In [215]:
X_train.shape

(2250,)

### The classes decoded are below

In [216]:
mlb.classes_

array(['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', 'Accounting', 'Advertising', 'Agriculture',
       'Aquarius', 'Architecture', 'Aries', 'Arts', 'Automotive',
       'Banking', 'Biotech', 'BusinessServices', 'Cancer', 'Capricorn',
       'Chemicals', 'Communications-Media', 'Construction', 'Consulting',
       'Education', 'Engineering', 'Environment', 'Fashion', 'Gemini',
       'Government', 'HumanResources', 'Internet', 'InvestmentBanking',
       'Law', 'LawEnforcement-Security', 'Leo', 'Libra', 'Manufacturing',
       'Maritime', 'Marketing', 'Military', 'Museums-Libraries',
       'Non-Profit', 'Pisces', 'Publishing', 'RealEstate', 'Religion',
       'Sagittarius', 'Science', 'Scorpio', 'Sports-Recreation',
       'Student', 'Taurus', 'Technology', 'Telecommunications', 'Tourism',
       'Transportation', 'Virgo', 'female', 'indUnk', 'male'],
      dtype

# Using Logistic Regression classifier with One vs Rest Wrapped around it to classify

In [217]:
OVR = OneVsRestClassifier(LogisticRegression()).fit(X_train_dtm.toarray(),y_train_transformed)

# Train Accuracy Score

In [218]:
OVR.score(X_train_dtm.toarray(), y_train_transformed)

0.9244444444444444

# Test Accuracy Score

In [None]:
OVR.score(X_test_dtm.toarray(), y_test_transformed)

##### As the number of data chosen is a very small subset of the full set, the accuracy of test set is very poor

In [220]:
from sklearn.metrics import classification_report
result = OVR.predict(X_test_dtm.toarray())
result

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

# F1, Precision and Recall scores for each class and Average values of these metrics are below

In [221]:
target_names = mlb.classes_
print(classification_report(y_test_transformed, result, target_names=target_names))

                         precision    recall  f1-score   support

                     13       0.00      0.00      0.00        23
                     14       0.00      0.00      0.00        27
                     15       0.00      0.00      0.00        42
                     16       0.60      0.03      0.06        92
                     17       0.29      0.02      0.04        84
                     23       0.00      0.00      0.00        70
                     24       0.00      0.00      0.00        81
                     25       0.00      0.00      0.00        81
                     26       0.00      0.00      0.00        56
                     27       0.00      0.00      0.00        52
                     33       0.00      0.00      0.00        21
                     34       0.00      0.00      0.00        30
                     35       0.00      0.00      0.00        21
                     36       0.00      0.00      0.00        12
                     37 

# PREDICTIONS

In [231]:
test = vect.transform([list(features)[0]])
prediction = OVR.predict(test)
print("Text", [list(features)[34]])
print("True", list(labels)[34])
print("Prediction", mlb.inverse_transform(prediction))
print("------------------------\n")
test = vect.transform([list(features)[100]])
prediction = OVR.predict(test)
print("Text", [list(features)[100]])
print("True", list(labels)[100])
print("Prediction", mlb.inverse_transform(prediction))
print("------------------------\n")
test = vect.transform([list(features)[123]])
prediction = OVR.predict(test)
print("Text", [list(features)[123]])
print("True", list(labels)[123])
print("Prediction", mlb.inverse_transform(prediction))
print("------------------------\n")
test = vect.transform([list(X_train)[2]])
prediction = OVR.predict(test)
print("Text", [list(X_train)[2]])
print("True", y_train[2])
print("Prediction", mlb.inverse_transform(prediction))
print("------------------------\n")
test = vect.transform([list(X_test)[21]])
prediction = OVR.predict(test)
print("Text", [list(X_test)[21]])
print("True", y_test[21])
print("Prediction", mlb.inverse_transform(prediction))
print("------------------------\n")

Text ['another class begins joy seriously getting burned find attention dwindling nbsp charles starting class tonight also asking questions protocol awww hes cute hes nervous read paper 4 times sure okay directed blog show examples nbsp walking noticed leaves ground closer started jumping realized arent leaves grasshoppers grasshoppa disgusting blanket grasshoppers step took jumped thing wore pants instead shorts']
True ['female', 25, 'indUnk', 'Capricorn']
Prediction [('female', 'indUnk')]
------------------------

Text ['finished putting together logo moms wardrobe consultant business wasnt woke home handrolled cigarette took forever kept threatening fall apart carib leave poison least neighbors come smoke grocery shopping finally procured amys lentil soup sale mrs bairds bread large brick cheese excited possibilities although havent chance delve yet dollar pocket guys doesnt apostrophe parents house helped dad fax project little mom mac stuff logo steps send announcement letter happ

#### Prediction is again very poor