### 1. Blog Data Loading and exploration

Blog data is available as a zip file of CSV file. In the code below, we are copying the from Google drive.

In [0]:
from google.colab import drive
drive.mount('/gdrive')

In [0]:
import pandas as pd
import numpy as np

In [0]:
# read file into pandas using a relative path. Please change the path as needed
# blog_df = pd.read_table('/gdrive/My Drive/collab/data/blogtext.csv.zip', compression='zip', skiprows=0, nrows=1000) # This works if it is non mac compressed. Since mac compression creates MACOSX files as well.
# blog_df = pd.read_table('/gdrive/My Drive/collab/data/blogtext.csv', header=None, names=['id', 'gender','age','topic','sign','date','text'])
blog_df = pd.read_csv('/gdrive/My Drive/collab/data/blogtext.csv', skiprows=0, nrows=10000) #Load only nrows of data instead of complete file

In [0]:
#Total number of SMS
blog_df.shape

In [0]:
#Check the contents of dataframe
blog_df.sample(n=5)

In [0]:
blog_df['labels'] = '['+blog_df['gender']+','+blog_df['age'].astype(str)+','+blog_df['topic']+','+blog_df['sign']+']'

In [0]:
blog_df.groupby('labels').count()

In [313]:
# Create a new dataframe and remove all other columns.
blog_multilabel_df = blog_df.copy(deep=True)
blog_multilabel_df = blog_multilabel_df.drop(columns=['id','gender','age','topic','sign','date'], axis=1)
blog_multilabel_df.sample(n=5)

Unnamed: 0,text,labels
7887,"Alright, so I've missed the FF... ...","[male,36,Fashion,Aries]"
7660,Things that 'went down' for HAL & ...,"[male,36,Fashion,Aries]"
5994,This from our Olympic hopeful - tot...,"[female,27,indUnk,Taurus]"
1486,so what did everyone think of the s...,"[male,35,Technology,Aries]"
6425,Who needs to watch the VMA's when y...,"[male,36,Fashion,Aries]"


In [314]:
blog_multilabel_df.shape

(10000, 2)

In [0]:
#clean the text data. Remove unwanted characters, convert to lowercase, remove unwanted spaces and remove stopwords
# blog_multilabel_df['text'] = blog_multilabel_df['text'].str.replace("'","")
# blog_multilabel_df['text'] = blog_multilabel_df['text'].str.replace(".","")
# blog_multilabel_df['text'] = blog_multilabel_df['text'].str.replace(",","")
# blog_multilabel_df['text'] = blog_multilabel_df['text'].str.replace("  "," ")
# blog_multilabel_df['text'] = blog_multilabel_df['text'].str.replace("?"," ")
# blog_multilabel_df['text'] = blog_multilabel_df['text'].str.replace(":"," ")
# blog_multilabel_df['text'] = blog_multilabel_df['text'].str.replace("!"," ")
# blog_multilabel_df['text'] = blog_multilabel_df['text'].str.replace("#"," ")
# blog_multilabel_df['text'] = blog_multilabel_df['text'].str.replace("("," ")
# blog_multilabel_df['text'] = blog_multilabel_df['text'].str.replace(")"," ")
# blog_multilabel_df['text'] = blog_multilabel_df['text'].str.lower()
# blog_multilabel_df.sample(n=5)

In [316]:
import unicodedata
import re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
# !pip install -q wordcloud
# import wordcloud

# import nltk
# nltk.download('stopwords')
# nltk.corpus.stopwords.words('english')
# stop = stopwords.words('english')

# blog_multilabel_df['text'] = blog_multilabel_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
# blog_multilabel_df.sample(n=5)

In [0]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [0]:
def remove_special_characters(text, remove_digits=False):
    #Using regex
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [0]:
def lemmatize_text(text):

    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()]) 

In [0]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

#simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

In [0]:
def normalize_corpus(corpus, html_stripping=True, accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [0]:
blog_multilabel_df['text'] = normalize_corpus(blog_multilabel_df['text'], html_stripping=False, text_lemmatization=False)

In [324]:
blog_multilabel_df.sample(n=5)

Unnamed: 0,text,labels
7124,my movie comments and right on panda keeping ...,"[male,36,Fashion,Aries]"
398,happy birthday thats what i should have said ...,"[female,24,indUnk,Scorpio]"
3207,i propose a film entitled the mueslix evacuat...,"[male,35,Technology,Aries]"
6210,the lost art of kissing by william d hicks ki...,"[female,17,indUnk,Cancer]"
9211,urllink your hazel green eyes the best poetry...,"[female,24,indUnk,Sagittarius]"


Create Dictionary to get count of all the labels

In [0]:
label_array = blog_multilabel_df['labels'].to_numpy()

In [326]:
map = {}
count = 0
for label in label_array:
  s = label[1:len(label)-1]
  a = s.split(',')
  count = count+ 1
  for token in a:
    if(token not in map.keys()):
      map[token] = 1
    else:
      map[token]= map[token] + 1 
print(count)

10000


In [327]:
for token, count in map.items(): 
    print(token, ":", count) 

male : 5916
15 : 602
Student : 1137
Leo : 301
33 : 136
InvestmentBanking : 70
Aquarius : 571
female : 4084
14 : 212
indUnk : 3287
Aries : 4198
25 : 386
Capricorn : 215
17 : 1185
Gemini : 150
23 : 253
Non-Profit : 71
Cancer : 504
Banking : 16
37 : 33
Sagittarius : 1097
26 : 234
24 : 655
Scorpio : 971
27 : 1054
Education : 270
45 : 16
Engineering : 127
Libra : 491
Science : 63
34 : 553
41 : 20
Communications-Media : 99
BusinessServices : 91
Sports-Recreation : 80
Virgo : 236
Taurus : 812
Arts : 45
Pisces : 454
44 : 3
16 : 440
Internet : 118
Museums-Libraries : 17
Accounting : 4
39 : 79
35 : 2315
Technology : 2654
36 : 1708
Law : 11
46 : 7
Consulting : 21
Automotive : 14
42 : 14
Religion : 9
13 : 42
Fashion : 1622
38 : 46
43 : 6
Publishing : 4
40 : 1
Marketing : 156
LawEnforcement-Security : 10
HumanResources : 2
Telecommunications : 2


### 2. Create Training & Test Dataset

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# split X and y into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(blog_multilabel_df.text, blog_multilabel_df.labels, test_size=0.20, random_state=2)

In [0]:
#Traing data
print(x_train.shape)
print(y_train.shape)

In [0]:
#Test Data
print(x_test.shape)
print(y_test.shape)

In [332]:
print(x_train.head())
print(x_test.head())
print(y_train.head())
print(y_test.head())

8856     can someone tell me how we got to be like thi...
1545     now i am here and i expect my life to change ...
8136     urllink look at the sun reflecting off the wa...
7768     like a good neighbor id like to go into detai...
7782     dufs looking mighty svelte or hes got a large...
Name: text, dtype: object
7878     hi folks the resident atlantan decided to act...
3224     cant say that i miss life in urllink cubevill...
1919     group i apologize for using the blog in this ...
4432     huge success the wine club event was wonderfu...
4835     well im not usually one to follow the sheep b...
Name: text, dtype: object
8856            [female,17,Student,Sagittarius]
1545                 [male,35,Technology,Aries]
8136    [female,25,Communications-Media,Pisces]
7768                    [male,36,Fashion,Aries]
7782                    [male,36,Fashion,Aries]
Name: labels, dtype: object
7878           [male,36,Fashion,Aries]
3224        [male,35,Technology,Aries]
1919        [male,35,T

### 3. Tokenization & Vectorization

Using **CountVectorizer**, to get numeric features.

In [0]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer(ngram_range=(1,2))

In [0]:
#Feed blog data to CountVectorizer
cvect.fit(x_train)

#Check the vocablury size
len(cvect.vocabulary_)

In [0]:
#What is there in the vocabulary
# cvect.vocabulary_

Build Document-term Matrix (DTM)

In [0]:
#Convert Training blog messages into Count Vectors
X_train_ct = cvect.transform(x_train)
X_test_ct = cvect.transform(x_test)

In [0]:
#Size of Document Term Matrix
print(X_train_ct.shape)
print(X_test_ct.shape)

In [0]:
#Let's check the first record
print(X_train_ct[0])

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer

In [340]:
print(y_train[0])
y_test

[male,15,Student,Leo]


7878                  [male,36,Fashion,Aries]
3224               [male,35,Technology,Aries]
1919               [male,35,Technology,Aries]
4432           [female,34,indUnk,Sagittarius]
4835               [female,42,Consulting,Leo]
                        ...                  
9073           [female,24,indUnk,Sagittarius]
8584    [male,25,Communications-Media,Pisces]
5390               [female,17,indUnk,Scorpio]
5092               [female,17,indUnk,Scorpio]
2964               [male,35,Technology,Aries]
Name: labels, Length: 2000, dtype: object

In [341]:
mlb = MultiLabelBinarizer()
y_train_mlb = mlb.fit_transform(y_train)
y_test_mlb = mlb.transform(y_test)
# list(mlb.classes_)

  .format(sorted(unknown, key=str)))


In [342]:
print(y_train_mlb[100])
print(y_test_mlb[100])
print(y_train_mlb.shape)
print(y_test_mlb.shape)

[1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0
 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 0]
[1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 1
 0 1 0 1 1 1 0 0 0 1 0 0 0 0 0 0]
(8000, 53)
(2000, 53)


In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression(solver='lbfgs',max_iter=1000)
clf = OneVsRestClassifier(lg)

In [344]:
clf.fit(X_train_ct, y_train_mlb)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [345]:
predicted = clf.predict(X_test_ct)
print(predicted)
# summarize the fit of the model
model_score = clf.score(X_test_ct, y_test_mlb)

[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 1]
 ...
 [1 0 0 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 [1 0 0 ... 0 0 1]]


In [346]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test_mlb, predicted)

0.2305

In [347]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
print('Accuracy: ', accuracy_score(y_test_mlb, predicted))
print('Precison: ', precision_score(y_test_mlb, predicted,average='micro'))
print('Recall: ', recall_score(y_test_mlb, predicted,average='micro'))
print('F1: ', f1_score(y_test_mlb, predicted,average='micro'))

Accuracy:  0.2305
Precison:  0.909649770947244
Recall:  0.8078639298650848
F1:  0.8557407587616254


Print True labels and predicted label for any 5 examples

In [348]:
y_test_mlb[0]
y = mlb.inverse_transform(y_test_mlb)
len(y[0])

17

In [349]:
for i in range(0,4):
  print("True Label : " ,y_test_mlb[i])
  print("Predicted Label: " ,predicted[i])

True Label :  [1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0
 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 0]
Predicted Label:  [1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0
 0 1 0 1 1 1 1 0 0 1 1 0 0 0 0 0]
True Label :  [1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 1 0 1 0 1
 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 1]
Predicted Label:  [1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0
 0 1 0 1 1 1 1 0 0 1 1 0 0 0 0 0]
True Label :  [1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 1 0 1 0 1
 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 1]
Predicted Label:  [1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 1 0 1 0 1
 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 1]
True Label :  [1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 1 1 1
 0 1 1 1 1 1 0 0 0 1 1 1 1 0 0 0]
Predicted Label:  [1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 1 1 1
 0 1 1 1 1 1 0 0 0 1 1 1 1 0 0 0]


In [350]:
y = mlb.inverse_transform(y_test_mlb)
y[0]

(',',
 '3',
 '6',
 'A',
 'F',
 '[',
 ']',
 'a',
 'e',
 'h',
 'i',
 'l',
 'm',
 'n',
 'o',
 'r',
 's')

In [351]:
y_pred = mlb.inverse_transform(predicted)
y_pred[0]

(',', '3', '6', '[', ']', 'a', 'e', 'i', 'l', 'm', 'n', 'o', 'r', 's')