# Statiscal NLP Project - Blog Authorship Corpus

In [0]:
#Import Libraries
import numpy as np
import pandas as pd
import re
import nltk 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Load the dataset

In [0]:
#After mounting the drive from the files section, read the dataset
df = pd.read_csv("/content/drive/My Drive/blogtext.csv")

In [6]:
df.shape

(681284, 7)

In [7]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [8]:
df["gender"].unique()

array(['male', 'female'], dtype=object)

In [0]:
df["age"].nunique()

26

In [0]:
df["age"].unique()

array([15, 33, 14, 25, 17, 23, 37, 26, 24, 27, 45, 34, 41, 44, 16, 39, 35,
       36, 46, 42, 13, 38, 43, 40, 47, 48])

In [0]:
df["topic"].nunique()

40

In [0]:
df["topic"].unique()

array(['Student', 'InvestmentBanking', 'indUnk', 'Non-Profit', 'Banking',
       'Education', 'Engineering', 'Science', 'Communications-Media',
       'BusinessServices', 'Sports-Recreation', 'Arts', 'Internet',
       'Museums-Libraries', 'Accounting', 'Technology', 'Law',
       'Consulting', 'Automotive', 'Religion', 'Fashion', 'Publishing',
       'Marketing', 'LawEnforcement-Security', 'HumanResources',
       'Telecommunications', 'Military', 'Government', 'Transportation',
       'Architecture', 'Advertising', 'Agriculture', 'Biotech',
       'RealEstate', 'Manufacturing', 'Construction', 'Chemicals',
       'Maritime', 'Tourism', 'Environment'], dtype=object)

In [0]:
df["topic"].value_counts()

indUnk                     251015
Student                    153903
Technology                  42055
Arts                        32449
Education                   29633
Communications-Media        20140
Internet                    16006
Non-Profit                  14700
Engineering                 11653
Law                          9040
Publishing                   7753
Science                      7269
Government                   6907
Consulting                   5862
Religion                     5235
Fashion                      4851
Marketing                    4769
Advertising                  4676
BusinessServices             4500
Banking                      4049
Chemicals                    3928
Telecommunications           3891
Accounting                   3832
Military                     3128
Museums-Libraries            3096
Sports-Recreation            3038
HumanResources               3010
RealEstate                   2870
Transportation               2326
Manufacturing 

In [0]:
df["sign"].nunique()

12

In [9]:
df["sign"].unique()

array(['Leo', 'Aquarius', 'Aries', 'Capricorn', 'Gemini', 'Cancer',
       'Sagittarius', 'Scorpio', 'Libra', 'Virgo', 'Taurus', 'Pisces'],
      dtype=object)

In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
id        681284 non-null int64
gender    681284 non-null object
age       681284 non-null int64
topic     681284 non-null object
sign      681284 non-null object
date      681284 non-null object
text      681284 non-null object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [0]:
#Observation : No null values in the dataset

In [0]:
#selecting fewer rows for further processing without crashing the system
dframe = df.sample(n=1000, random_state=7)

In [0]:
#We can also select fewer records based on column 'topic' - We can select fewer 'topic' records.
#checked column "topic". it seems value "IndUnk" means unknown values in the column which has count of 251015. So , dropping all the rows having this value , 
#so as to reduce the size of the dataframe for us to process further withour any crash.
#dframe = df[df.topic != "indUnk"]

In [70]:
dframe.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
42965,3304516,male,27,Technology,Pisces,"24,June,2004",urllink do you see the man in the picture o...
218209,303162,female,38,indUnk,Virgo,"30,May,2004",yeah cause its obviously a urllink family show
138338,2806788,female,26,Marketing,Taurus,"27,January,2004",damned tarnished halo showing again
643895,3682951,female,24,indUnk,Taurus,"18,June,2004",i suggest that the underground press could per...
452608,3368641,male,39,Education,Pisces,"18,May,2004",sometimes when i am quiet and in tune i feel ...


In [69]:
dframe.shape

(1000, 7)

## Preprocess rows of the “text” column :
a. Remove unwanted characters  
b. Convert text to lowercase  
c. Remove unwanted spaces  
d. Remove stopwords  

In [0]:
#Converting Text to Lowercase
dframe['text'] = dframe['text'].map(lambda s: s.lower())

In [0]:
#Removing unwanted characters and selecting a-z 
#dframe['text'] = dframe['text'].map(lambda s: re.sub('[^0-9a-z _]','',s))
dframe['text'] = dframe['text'].map(lambda s: re.sub('[^a-z ]','',s))

In [22]:
dframe.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
42965,3304516,male,27,Technology,Pisces,"24,June,2004",urllink do you see the ma...
218209,303162,female,38,indUnk,Virgo,"30,May,2004",yeah cause its obviously a urllink f...
138338,2806788,female,26,Marketing,Taurus,"27,January,2004",damned tarnished halo showing again...
643895,3682951,female,24,indUnk,Taurus,"18,June,2004",i suggest that the underground press co...
452608,3368641,male,39,Education,Pisces,"18,May,2004",sometimes when i am quiet and in ...


In [0]:
#remove unwanted spaces
dframe['text'] = dframe['text'].map(lambda s: s.strip())

In [0]:
#We would remove stopwords in CountVectorizer step 
#set(stopwords.words('english'))  #No stopwords lib in Nltk package

### As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence
a. Label columns to merge: “gender”, “age”, “topic”, “sign”  
b. After completing the previous step, there should be only two columns in your data
frame i.e. “text” and “labels” as shown in the below image

In [0]:
df2 = pd.DataFrame(dframe.text)

In [32]:
df2.head()

Unnamed: 0,text
42965,urllink do you see the man in the picture o...
218209,yeah cause its obviously a urllink family show
138338,damned tarnished halo showing again
643895,i suggest that the underground press could per...
452608,sometimes when i am quiet and in tune i feel ...


In [0]:
df2["labels"] = dframe.apply(lambda x: list([x['gender'],
                                        x['age'],
                                        x['topic'],
                                         x["sign"]]),axis=1)

In [35]:
df2.head()

Unnamed: 0,text,labels
42965,urllink do you see the man in the picture o...,"[male, 27, Technology, Pisces]"
218209,yeah cause its obviously a urllink family show,"[female, 38, indUnk, Virgo]"
138338,damned tarnished halo showing again,"[female, 26, Marketing, Taurus]"
643895,i suggest that the underground press could per...,"[female, 24, indUnk, Taurus]"
452608,sometimes when i am quiet and in tune i feel ...,"[male, 39, Education, Pisces]"


### Separate features and labels, and split the data into training and testing

In [0]:
#Feature and Label seperation
X = df2["text"] 
y = df2["labels"]

In [71]:
X.shape

(1000,)

In [72]:
y.shape

(1000,)

In [0]:
#Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [11]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(700,)
(300,)
(700,)
(300,)


In [12]:
X_train.head()

96335     it appears that when you change the settings i...
421131    somtimes this house feels like a prison that i...
579234    had a good hr min session in the pool today ba...
212936    the funniest thing happened today  i overslept...
633899    a mother is she who can take the place of all ...
Name: text, dtype: object

In [16]:
y_train.head()

96335            [female, 45, Education, Gemini]
421131              [female, 15, indUnk, Gemini]
579234       [male, 33, Technology, Sagittarius]
212936             [female, 24, Student, Cancer]
633899    [female, 24, HumanResources, Aquarius]
Name: labels, dtype: object

In [17]:
X_test.head()

550553                             urllink this  feels good
171944    listening to beautiful you are  urllink snoop ...
354256    and another thing charle has decided that mayb...
87645     so im listening to the woes of having a large ...
203417    another project that i have on my needles righ...
Name: text, dtype: object

In [18]:
y_test.head()

550553          [male, 27, Law, Aries]
171944      [male, 16, Student, Virgo]
354256    [female, 16, Student, Aries]
87645     [female, 23, Education, Leo]
203417     [female, 42, indUnk, Libra]
Name: labels, dtype: object

## Vectorize the features
a. Create a Bag of Words using count vectorizer      
    i. Use ngram_range=(1, 2)      
    ii. Vectorize training and testing features      
b. Print the term-document matrix    

In [0]:
#Here we are removing stopwords with the help of CountVectorizer
vect = CountVectorizer(stop_words='english',ngram_range=(1,2))

In [20]:
#vect.fit(train) learns the vocabulary of the training data
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [21]:
# examine the fitted vocabulary
vect.get_feature_names()

['aaa',
 'aaa discounts',
 'aaaaaaanyway',
 'aaaaaaanyway probably',
 'aaaaaah',
 'aaaaaah said',
 'aaaaaahhhh',
 'aaaarg',
 'aaaarg vowing',
 'aaahhhh',
 'aaahhhh kayaking',
 'aaalllll',
 'aaalllll day',
 'aampm',
 'aampm college',
 'aampm helping',
 'aampm machines',
 'aaron',
 'aaron rob',
 'abandoned',
 'abandoned gas',
 'abandoned sleeve',
 'abarbablog',
 'abarbablog members',
 'abaya',
 'abaya like',
 'abaya set',
 'abaya shayla',
 'abbreviations',
 'abbreviations usb',
 'abc',
 'abc really',
 'abcs',
 'abcs type',
 'aber',
 'aber woman',
 'abercrombie',
 'abercrombie chick',
 'aberystwyth',
 'aberystwyth family',
 'abilities',
 'abilities energy',
 'ability',
 'ability appreciate',
 'ability control',
 'ability deliver',
 'ability discerned',
 'ability evolve',
 'ability forecast',
 'ability govern',
 'ability language',
 'ability learners',
 'ability multiple',
 'ability people',
 'ability project',
 'ability roam',
 'ability scrambled',
 'ability trust',
 'ability unscramble',

In [0]:
#vect.transform(train) uses the fitted vocabulary to build a document-term matrix from the training data
train_dtm = vect.transform(X_train)

In [15]:
# check the type of the document-term matrix
type(train_dtm)

scipy.sparse.csr.csr_matrix

In [59]:
train_dtm.shape

(700, 74792)

In [23]:
# examine the sparse matrix contents by printing the DTM for Train dataset
print(train_dtm)

  (0, 2593)	1
  (0, 2594)	1
  (0, 5854)	1
  (0, 5874)	1
  (0, 6183)	1
  (0, 6235)	1
  (0, 9312)	1
  (0, 9340)	1
  (0, 9365)	2
  (0, 9369)	1
  (0, 9374)	1
  (0, 17693)	1
  (0, 17694)	1
  (0, 37504)	1
  (0, 37514)	1
  (0, 43658)	1
  (0, 43678)	1
  (0, 48358)	1
  (0, 49329)	1
  (0, 49353)	1
  (0, 57261)	1
  (0, 57262)	1
  (0, 66873)	1
  (0, 66929)	1
  (1, 1001)	1
  :	:
  (699, 72073)	1
  (699, 72082)	1
  (699, 72083)	1
  (699, 72358)	1
  (699, 72359)	1
  (699, 72662)	1
  (699, 72663)	1
  (699, 73313)	1
  (699, 73314)	1
  (699, 73624)	1
  (699, 73632)	1
  (699, 73805)	1
  (699, 73806)	1
  (699, 73817)	2
  (699, 73818)	1
  (699, 73819)	1
  (699, 73820)	2
  (699, 73821)	1
  (699, 73822)	1
  (699, 74111)	1
  (699, 74144)	1
  (699, 74275)	1
  (699, 74276)	1
  (699, 74424)	1
  (699, 74425)	1


In [0]:
#vect.transform(test) uses the fitted vocabulary to build a document-term matrix from the testing data and ignores tokens it hasn't seen before
test_dtm = vect.transform(X_test)

In [25]:
test_dtm.shape

(300, 74792)

In [26]:
#Print Document Term Matrix for test
print(test_dtm)

  (0, 21412)	1
  (0, 21414)	1
  (0, 25214)	1
  (0, 68909)	1
  (1, 4820)	1
  (1, 21218)	1
  (1, 37434)	1
  (1, 68909)	1
  (2, 2396)	1
  (2, 2488)	1
  (2, 3235)	1
  (2, 5966)	1
  (2, 8107)	1
  (2, 10573)	1
  (2, 10585)	1
  (2, 11483)	1
  (2, 11586)	1
  (2, 13638)	1
  (2, 14269)	1
  (2, 14778)	1
  (2, 15849)	1
  (2, 16280)	1
  (2, 16387)	1
  (2, 16465)	1
  (2, 17110)	8
  :	:
  (299, 7929)	1
  (299, 10391)	1
  (299, 13108)	1
  (299, 15459)	1
  (299, 16886)	2
  (299, 17659)	1
  (299, 19829)	1
  (299, 21051)	1
  (299, 22568)	1
  (299, 23489)	1
  (299, 23501)	1
  (299, 26079)	1
  (299, 27058)	1
  (299, 28468)	1
  (299, 28491)	1
  (299, 32668)	1
  (299, 45312)	1
  (299, 48189)	1
  (299, 51052)	1
  (299, 52367)	1
  (299, 52578)	1
  (299, 55966)	1
  (299, 60122)	1
  (299, 66346)	1
  (299, 66873)	1


### Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label. Check below image for reference

In [0]:
x=[]
for i in range(len(df2)) : 
  x = x + df2['labels'].iloc[i]

In [29]:
x

['male',
 27,
 'Technology',
 'Pisces',
 'female',
 38,
 'indUnk',
 'Virgo',
 'female',
 26,
 'Marketing',
 'Taurus',
 'female',
 24,
 'indUnk',
 'Taurus',
 'male',
 39,
 'Education',
 'Pisces',
 'female',
 24,
 'Accounting',
 'Aquarius',
 'male',
 16,
 'Communications-Media',
 'Scorpio',
 'male',
 33,
 'indUnk',
 'Taurus',
 'male',
 17,
 'indUnk',
 'Sagittarius',
 'female',
 14,
 'HumanResources',
 'Scorpio',
 'male',
 16,
 'Student',
 'Capricorn',
 'male',
 25,
 'Internet',
 'Sagittarius',
 'male',
 36,
 'Fashion',
 'Gemini',
 'male',
 25,
 'indUnk',
 'Leo',
 'female',
 34,
 'indUnk',
 'Aries',
 'male',
 14,
 'Student',
 'Leo',
 'male',
 48,
 'Government',
 'Leo',
 'female',
 15,
 'Student',
 'Capricorn',
 'female',
 25,
 'Advertising',
 'Scorpio',
 'male',
 26,
 'Technology',
 'Taurus',
 'female',
 15,
 'indUnk',
 'Capricorn',
 'male',
 33,
 'Banking',
 'Sagittarius',
 'male',
 34,
 'Education',
 'Aquarius',
 'female',
 23,
 'indUnk',
 'Scorpio',
 'male',
 15,
 'Student',
 'Sagittar

In [0]:
def CountFrequency(my_list): 
  
    # Creating an empty dictionary  
    freq = {} 
    for item in my_list: 
        if (item in freq): 
            freq[item] += 1
        else: 
            freq[item] = 1
  
    print(freq)

In [31]:
CountFrequency(x)

{'male': 511, 27: 69, 'Technology': 55, 'Pisces': 81, 'female': 489, 38: 9, 'indUnk': 376, 'Virgo': 85, 26: 67, 'Marketing': 7, 'Taurus': 107, 24: 118, 39: 10, 'Education': 55, 'Accounting': 6, 'Aquarius': 91, 16: 93, 'Communications-Media': 29, 'Scorpio': 78, 33: 30, 17: 114, 'Sagittarius': 60, 14: 56, 'HumanResources': 8, 'Student': 230, 'Capricorn': 85, 25: 105, 'Internet': 19, 36: 24, 'Fashion': 10, 'Gemini': 84, 'Leo': 69, 34: 31, 'Aries': 97, 48: 6, 'Government': 8, 15: 72, 'Advertising': 10, 'Banking': 9, 23: 99, 'Arts': 46, 'Libra': 79, 'Cancer': 84, 35: 24, 'Military': 6, 44: 2, 13: 22, 'Consulting': 15, 42: 4, 'Agriculture': 2, 'Engineering': 11, 'RealEstate': 2, 'Chemicals': 7, 'BusinessServices': 7, 'Non-Profit': 15, 37: 12, 45: 6, 43: 9, 'Religion': 6, 'Science': 10, 'InvestmentBanking': 1, 41: 8, 'Law': 10, 'Architecture': 1, 'Biotech': 6, 46: 5, 'Publishing': 9, 47: 3, 'Transportation': 6, 40: 2, 'Manufacturing': 4, 'Sports-Recreation': 5, 'LawEnforcement-Security': 1, '

## Transform the labels -
As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose, it is convenient to use MultiLabelBinarizer from sklearn  

a. Convert your train and test labels using MultiLabelBinarizer

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer

In [0]:
mlb = MultiLabelBinarizer()

In [34]:
y_train.shape

(700,)

In [0]:
y_train = list(y_train)

In [0]:
y_test = list(y_test)

In [37]:
y_train[0]

['female', 45, 'Education', 'Gemini']

In [0]:
for i in np.arange(0,700):
  y_train[i] = tuple(y_train[i])

In [39]:
y_train[0]

('female', 45, 'Education', 'Gemini')

In [0]:
for i in np.arange(0,300):
  y_test[i] = tuple(y_test[i])

In [0]:
y_train = [[str(j) for j in i] for i in y_train]

In [42]:
y_train[0]

['female', '45', 'Education', 'Gemini']

In [0]:
y_test = [[str(j) for j in i] for i in y_test]

In [0]:
#Fit -Transform MLB on train labels
y_train_transformed = mlb.fit_transform(y_train)

In [45]:
y_train_transformed

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 1]])

In [46]:
 y_train_transformed.shape

(700, 76)

In [47]:
#The classes are decoded as below :
mlb.classes_

array(['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', 'Accounting', 'Advertising', 'Agriculture',
       'Aquarius', 'Architecture', 'Aries', 'Arts', 'Banking', 'Biotech',
       'BusinessServices', 'Cancer', 'Capricorn', 'Chemicals',
       'Communications-Media', 'Consulting', 'Education', 'Engineering',
       'Environment', 'Fashion', 'Gemini', 'Government', 'HumanResources',
       'Internet', 'InvestmentBanking', 'Law', 'Leo', 'Libra',
       'Manufacturing', 'Marketing', 'Military', 'Museums-Libraries',
       'Non-Profit', 'Pisces', 'Publishing', 'RealEstate', 'Religion',
       'Sagittarius', 'Science', 'Scorpio', 'Sports-Recreation',
       'Student', 'Taurus', 'Technology', 'Telecommunications', 'Tourism',
       'Transportation', 'Virgo', 'female', 'indUnk', 'male'],
      dtype=object)

In [0]:
#MLB transform on test labels
y_test_transformed = mlb.transform(y_test)

In [49]:
y_test_transformed

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

##8. Choose a classifier -
In this task, we suggest using the One-vs-Rest approach, which is implemented in
OneVsRestClassifier class. In this approach k classifiers (= number of tags) are trained. As a basic classifier, use LogisticRegression . It is one of the simplest methods, but often it performs good enough in text classification tasks. It might take some time because the number of classifiers to train is large.  

a. Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on every label  
b. As One-vs-Rest approach might not have been discussed in the sessions, we are
providing you the code for that


In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [0]:
clf = LogisticRegression(solver='lbfgs')

In [0]:
clf = OneVsRestClassifier(clf)

## 9. Fit the classifier, make predictions and get the accuracy.  

a. Print the following

> 
i. Accuracy score  
ii. F1 score    
iii. Average precision score  
iv. Average recall score  
v. Tip: Make sure you are familiar with all of them. How would you expect the
things to work for the multi-label scenario? Read about micro/macro/weighted
averaging




In [53]:
#Fit the classifier
clf.fit(train_dtm.toarray(), y_train_transformed)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [0]:
#Make Predictions
y_pred_train = clf.predict(train_dtm.toarray())

In [64]:
#Accuracy
print(metrics.accuracy_score(y_train_transformed, y_pred_train))  #Training Accuracy

0.9114285714285715


In [32]:
#For multilabel classification- we have to use the average parameter which determines the type of averaging performed on the data.
#Here we would use 'micro' averaging - it calculate metrics globally by counting the total true positives, false negatives and false positives.

#F1 score 
metrics.f1_score(y_train_transformed,y_pred_train, average='micro')

0.9751371115173675

In [33]:
#Average precision score
metrics.average_precision_score(y_train_transformed,y_pred_train,average='micro')

0.9539297752808988

In [34]:
#Average Recall score
metrics.recall_score(y_train_transformed,y_pred_train,average='micro')

0.9525

In [0]:
#Making predictions on test
y_test_pred = clf.predict(test_dtm.toarray())

In [66]:
#Test Accuracy
print(metrics.accuracy_score(y_test_transformed,y_test_pred))  

0.0


In [42]:
#Test F1 score 
metrics.f1_score(y_test_transformed,y_test_pred, average='micro')

0.2196825396825397

In [43]:
#Average precision score
metrics.average_precision_score(y_test_transformed,y_test_pred,average='micro')

0.11138732631803099

In [44]:
#Average Recall score
metrics.recall_score(y_test_transformed,y_test_pred,average='micro')

0.14428690575479566

In [0]:
#Conclusion : Training metrics are very high and testing metrics are very low as we have taken very few records to train our dataset. 
#Accuracy can be improved by running the model on large dataset. 

### 10. Print true label and predicted label for any five examples

In [0]:
#Get prediction function to predict and using 'inverse_transform' to convert binary labels into original text labels 
def get_pred(q):
    q_pred = clf.predict(q)
    return mlb.inverse_transform(q_pred)

In [66]:
#Printing Labels for any five samples for Train dataset

for i in range(5): 
  k = random.randint(1,700)
  print("Blog: ", X_train.iloc[k], "\nPredicted Label: ", get_pred(train_dtm[k].toarray())), print("Actual Label: ",y_train[k], "\n")


Blog:  urllink    why arent they smiling  why so seriousnbsp urllink 
Predicted Label:  [('Pisces', 'female')]
Actual Label:  ['female', '35', 'Transportation', 'Pisces'] 

Blog:  well for those who are going down to san antonio sometime here is my review on the staybridge suite near the airport i give it  stars it was nice to have  rooms  beds  tvs and a kitchen the reason though for taking away  stars was that it was near traffic so it was hard to leave also the comunity wasnt all spectacular and while the hotel was nice it did show some age so for a low price hotel for a prolonged stay that should suffice your needs 
Predicted Label:  [('13', 'Capricorn', 'Technology', 'male')]
Actual Label:  ['male', '13', 'Technology', 'Capricorn'] 

Blog:  tonight was great the carpirate ship was awesome its too bad i didnt get to go to the party afterwards but its all good theres another one tomorrow night that i wanted to go to more so there  erika is my best friend ever she lives in sandusky a

In [68]:
#Printing Labels for any five samples for Test dataset

for i in range(5): 
  k = random.randint(1,300)
  print("Blog: ", X_test.iloc[k], "\nPredicted Label: ", get_pred(test_dtm[k].toarray())), print("Actual Label: ",y_test[k], "\n")


Blog:  more  another  inches im still recovering from the  inches last week sheesh let a girl catch her breath already 
Predicted Label:  [('female',)]
Actual Label:  ['female', '39', 'indUnk', 'Sagittarius'] 

Blog:  i decided that i didnt like the redraft of the first chapter and have started to redraft it again im glad i decided to because when i sat down to it last night i realised i had so much more to add so i have deleted it from fanfiction and i hope to have the new one posted for the weekend  other than that i have letters to catch up on and im away to do that just now 
Predicted Label:  [('male',)]
Actual Label:  ['female', '24', 'indUnk', 'Virgo'] 

Blog:  were backand we had lots of fun too in bewteen stepping in gum mixing up the words soxs and soaked and avoiding giant evil bug things called sequetasmost likely spelled wrongthat have no point in living but other than that we home  ps rave candy is now the offically best sugar source ever  dilussional child 
Predicted Labe