In [0]:
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report,f1_score, accuracy_score, recall_score, precision_score
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

##Loading the Dateset##

In [0]:
os.chdir("/content/drive/My Drive/Colab Notebooks/Python/Lab/Lab 8/")

In [3]:
df = pd.read_csv("blogtext.csv")
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [4]:
df.shape

(681284, 7)

##Data Cleaning##

Due to computational ease we are taking first 10000 rows for model preprocessing.

In [5]:
df = df[:3000]
print(df.shape)
df["text"].loc[0]

(3000, 7)


'           Info has been found (+/- 100 pages, and 4.5 MB of .pdf files) Now i have to wait untill our team leader has processed it and learns html.         '

In [0]:
df.dropna(inplace=True)

In [7]:
df.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [0]:
df["text"] = df["text"].str.replace('[^A-Za-z]',' ')
df["text"] = df["text"].str.lower()
df["text"] = df["text"].str.strip()
df["text"] = df["text"].str.split()

In [9]:
df["text"].loc[0]

['info',
 'has',
 'been',
 'found',
 'pages',
 'and',
 'mb',
 'of',
 'pdf',
 'files',
 'now',
 'i',
 'have',
 'to',
 'wait',
 'untill',
 'our',
 'team',
 'leader',
 'has',
 'processed',
 'it',
 'and',
 'learns',
 'html']

##Data Preprocessing##

###Removing Stopwords##

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
stop = stopwords.words('english')
def removestopwords(y):   
 stopwordremoved = [w for w in y if w not in stop]
 return(" ".join(stopwordremoved))

In [12]:
text_col_size = df["text"].size
print("text column size :", text_col_size)

cleaned_text = []

# Loop over each text
for i in range( 0, text_col_size):
    cleaned_text.append(removestopwords(df["text"][i]))

text column size : 3000


In [13]:
cleaned_text[4]

'thanks yahoo toolbar capture urls popups means show cool links korean pop k pop audio video without need relate instructions like go site click pop audio button choose without ado link hour k pop urllink audio urllink video streaming enjoy'

In [14]:
#Replace text column with cleaner_corpus_df_sample_text 
df["text"] = cleaned_text
df["text"][4]

'thanks yahoo toolbar capture urls popups means show cool links korean pop k pop audio video without need relate instructions like go site click pop audio button choose without ado link hour k pop urllink audio urllink video streaming enjoy'

###Lemmatization###

In [15]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    lemm = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
    return(" ".join(lemm)) 

df["text"] = df.text.apply(lemmatize_text)

In [17]:
df["text"][4]

'thanks yahoo toolbar capture url popups mean show cool link korean pop k pop audio video without need relate instruction like go site click pop audio button choose without ado link hour k pop urllink audio urllink video streaming enjoy'

##Creating input for model##

In [0]:
df = df[['gender', 'age', 'topic', 'sign', 'text']]

In [0]:
df['labels']= df['gender'].astype(str) +","+ df['age'].astype(str) +","+df['topic'].astype(str) +","+ df['sign'].astype(str)
df.drop(columns=['gender', 'age', 'topic', 'sign'], inplace=True)

In [20]:
df.head()

Unnamed: 0,text,labels
0,info found page mb pdf file wait untill team l...,"male,15,Student,Leo"
1,team member drewes van der laag urllink mail r...,"male,15,Student,Leo"
2,het kader van kernfusie op aarde maak je eigen...,"male,15,Student,Leo"
3,testing testing,"male,15,Student,Leo"
4,thanks yahoo toolbar capture url popups mean s...,"male,33,InvestmentBanking,Aquarius"


##Seperation of Data for Training and Testing##

In [0]:
X = df["text"]
df['labels'] = df['labels'].str.lower()
y = df["labels"]

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

##Creating "Bad of Words"##

In [0]:
vect = CountVectorizer(ngram_range=(1, 2), max_df=3, stop_words='english')
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [24]:
print(X_train_dtm)

  (0, 26211)	1
  (0, 12167)	1
  (0, 121938)	1
  (0, 12047)	1
  (0, 21452)	1
  (0, 83513)	1
  (0, 81347)	1
  (1, 34993)	1
  (1, 84819)	1
  (1, 52268)	1
  (1, 34994)	1
  (1, 40460)	1
  (2, 126352)	1
  (2, 15932)	1
  (2, 130411)	1
  (2, 92809)	1
  (2, 6509)	3
  (2, 100158)	1
  (2, 116865)	1
  (2, 122152)	1
  (2, 139864)	1
  (2, 126353)	1
  (2, 15937)	1
  (2, 130413)	1
  (2, 92810)	1
  :	:
  (2249, 3031)	1
  (2249, 59927)	1
  (2249, 89937)	1
  (2249, 133654)	1
  (2249, 60375)	1
  (2249, 133603)	1
  (2249, 38862)	1
  (2249, 115984)	1
  (2249, 9901)	1
  (2249, 54846)	1
  (2249, 129812)	1
  (2249, 98205)	1
  (2249, 118147)	1
  (2249, 20350)	1
  (2249, 4654)	1
  (2249, 79388)	1
  (2249, 13480)	1
  (2249, 60827)	1
  (2249, 89933)	1
  (2249, 130420)	1
  (2249, 3032)	1
  (2249, 84237)	1
  (2249, 43329)	1
  (2249, 60372)	1
  (2249, 59928)	1


In [25]:
print(X_test_dtm)

  (0, 3926)	1
  (0, 13049)	1
  (0, 29949)	1
  (0, 29951)	1
  (0, 66270)	1
  (0, 72506)	1
  (0, 80056)	1
  (0, 90071)	1
  (0, 90455)	1
  (0, 98871)	1
  (0, 102996)	1
  (0, 105908)	1
  (0, 118955)	1
  (0, 125347)	1
  (1, 3020)	3
  (1, 3021)	1
  (1, 3248)	1
  (1, 11564)	1
  (1, 19174)	1
  (1, 19630)	1
  (1, 22981)	1
  (1, 25145)	1
  (1, 32021)	1
  (1, 41694)	1
  (1, 44737)	1
  :	:
  (747, 138959)	1
  (747, 140632)	1
  (747, 140704)	1
  (747, 140807)	1
  (748, 1492)	1
  (748, 24842)	1
  (748, 27665)	1
  (748, 31411)	1
  (748, 40404)	1
  (748, 47548)	1
  (748, 59639)	1
  (748, 64375)	1
  (748, 67079)	1
  (748, 75226)	1
  (748, 78387)	1
  (748, 98129)	1
  (748, 105399)	1
  (748, 107368)	1
  (748, 110735)	1
  (748, 116425)	1
  (748, 121322)	1
  (748, 125617)	1
  (748, 126172)	1
  (748, 133955)	1
  (748, 140229)	1


In [26]:
type(X_train_dtm)

scipy.sparse.csr.csr_matrix

##Creating a dictionary for label count##

In [27]:
vectorizer_labels = CountVectorizer(min_df = 1,ngram_range = (1,1),stop_words = "english")
labels_vector = vectorizer_labels.fit_transform(y)
vectorizer_labels.vocabulary_

{'14': 0,
 '15': 1,
 '16': 2,
 '17': 3,
 '23': 4,
 '24': 5,
 '25': 6,
 '26': 7,
 '27': 8,
 '33': 9,
 '34': 10,
 '35': 11,
 '37': 12,
 '39': 13,
 '41': 14,
 '44': 15,
 '45': 16,
 'accounting': 17,
 'aquarius': 18,
 'aries': 19,
 'arts': 20,
 'banking': 21,
 'businessservices': 22,
 'cancer': 23,
 'capricorn': 24,
 'communications': 25,
 'education': 26,
 'engineering': 27,
 'female': 28,
 'gemini': 29,
 'indunk': 30,
 'internet': 31,
 'investmentbanking': 32,
 'leo': 33,
 'libra': 34,
 'libraries': 35,
 'male': 36,
 'media': 37,
 'museums': 38,
 'non': 39,
 'pisces': 40,
 'profit': 41,
 'recreation': 42,
 'sagittarius': 43,
 'science': 44,
 'scorpio': 45,
 'sports': 46,
 'student': 47,
 'taurus': 48,
 'technology': 49,
 'virgo': 50}

##Transforming the labels##

In [28]:
label_classes = []  
for key in vectorizer_labels.vocabulary_.keys():
    label_classes.append(key)
    
print(sorted(label_classes))

['14', '15', '16', '17', '23', '24', '25', '26', '27', '33', '34', '35', '37', '39', '41', '44', '45', 'accounting', 'aquarius', 'aries', 'arts', 'banking', 'businessservices', 'cancer', 'capricorn', 'communications', 'education', 'engineering', 'female', 'gemini', 'indunk', 'internet', 'investmentbanking', 'leo', 'libra', 'libraries', 'male', 'media', 'museums', 'non', 'pisces', 'profit', 'recreation', 'sagittarius', 'science', 'scorpio', 'sports', 'student', 'taurus', 'technology', 'virgo']


In [0]:
mlb = MultiLabelBinarizer(classes = label_classes) #Tranforming the labels using MultiLabelBinarizer

In [30]:
y = [["".join(re.findall("\w",f)) for f in lst] for lst in [s.split(",") for s in y]]
y[30]

['male', '33', 'investmentbanking', 'aquarius']

In [31]:
y_new = mlb.fit(y) # transforming entire set of lables
y_new

MultiLabelBinarizer(classes=['male', '15', 'student', 'leo', '33',
                             'investmentbanking', 'aquarius', 'female', '14',
                             'indunk', 'aries', '25', 'capricorn', '17',
                             'gemini', '23', 'non', 'profit', 'cancer',
                             'banking', '37', 'sagittarius', '26', '24',
                             'scorpio', '27', 'education', '45', 'engineering',
                             'libra', ...],
                    sparse_output=False)

In [0]:
y_train = [["".join(re.findall("\w",f)) for f in lst] for lst in [s.split(",") for s in y_train]]
y_train_new = mlb.transform(y_train)
y_test = [["".join(re.findall("\w",f)) for f in lst] for lst in [s.split(",") for s in y_test]]
y_test_new = mlb.transform(y_test)

In [33]:
y_train_new[30]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

In [34]:
y_train[30]

['male', '17', 'sportsrecreation', 'capricorn']

In [35]:
mlb.classes_

array(['male', '15', 'student', 'leo', '33', 'investmentbanking',
       'aquarius', 'female', '14', 'indunk', 'aries', '25', 'capricorn',
       '17', 'gemini', '23', 'non', 'profit', 'cancer', 'banking', '37',
       'sagittarius', '26', '24', 'scorpio', '27', 'education', '45',
       'engineering', 'libra', 'science', '34', '41', 'communications',
       'media', 'businessservices', 'sports', 'recreation', 'virgo',
       'taurus', 'arts', 'pisces', '44', '16', 'internet', 'museums',
       'libraries', 'accounting', '39', '35', 'technology'], dtype=object)

##Choose a classifier##

In [0]:
clf = LogisticRegression(solver = 'lbfgs',max_iter = 3000) 
clf = OneVsRestClassifier(clf)

##Fit the classifier, make predictions and get the accuracy##

In [37]:
clf.fit(X_train_dtm, y_train_new)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=3000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [38]:
print("Train Accuracy:",clf.score(X_train_dtm,y_train_new)*100)

Train Accuracy: 97.06666666666666


In [0]:
y_pred = clf.predict(X_test_dtm)

In [40]:
print("Test Accuracy:" + str(accuracy_score(y_test_new, y_pred)))
print("F1: " + str(f1_score(y_test_new, y_pred, average='micro')))
print("F1_macro: " + str(f1_score(y_test_new, y_pred, average='macro')))
print("Precision: " + str(precision_score(y_test_new, y_pred, average='micro')))
print("Precision_macro: " + str(precision_score(y_test_new, y_pred, average='macro')))
print("Recall: " + str(recall_score(y_test_new, y_pred, average='micro')))
print("Recall_macro: " + str(recall_score(y_test_new, y_pred, average='macro')))

Test Accuracy:0.5253333333333333
F1: 0.6117241379310345
F1_macro: 0.06581883354139424
Precision: 0.6248679112363509
Precision_macro: 0.15664508793887516
Recall: 0.5991219182708545
Recall_macro: 0.08132388166476308


##Print true label and predicted label##

In [0]:
y_pred_inv = mlb.inverse_transform(y_pred)   # inverse transforming predited label data
y_test_new_inv =  mlb.inverse_transform(y_test_new) # inverse transforming original test label data

In [42]:
print("Example 1 - predicted :",y_pred_inv[0])
print("Example 1 - Actual :",y_test_new_inv[0])
print("Example 1 - Actual_before mlb transformation :",y_test[0])

Example 1 - predicted : ('male', 'aries', '35', 'technology')
Example 1 - Actual : ('male', 'aries', '35', 'technology')
Example 1 - Actual_before mlb transformation : ['male', '35', 'technology', 'aries']


In [43]:
print("Example 2 - predicted :",y_pred_inv[5])
print("Example 2 - Actual :",y_test_new_inv[5])
print("Example 2 - Actual_before mlb transformation :",y_test[5])

Example 2 - predicted : ('male', 'aries', '35', 'technology')
Example 2 - Actual : ('male', 'aries', '35', 'technology')
Example 2 - Actual_before mlb transformation : ['male', '35', 'technology', 'aries']


In [44]:
print("Example 3 - predicted :",y_pred_inv[10])
print("Example 3 - Actual :",y_test_new_inv[10])
print("Example 3 - Actual_before mlb transformation :",y_test[10])

Example 3 - predicted : ('male', 'aries', '35', 'technology')
Example 3 - Actual : ('male', '15', 'student', 'aquarius')
Example 3 - Actual_before mlb transformation : ['male', '15', 'student', 'aquarius']


In [45]:
print("Example 4 - predicted :",y_pred_inv[15])
print("Example 4 - Actual :",y_test_new_inv[15])
print("Example 4 - Actual_before mlb transformation :",y_test[15])

Example 4 - predicted : ('male', 'aries', '35', 'technology')
Example 4 - Actual : ('male', 'aries', '35', 'technology')
Example 4 - Actual_before mlb transformation : ['male', '35', 'technology', 'aries']


In [46]:
print("Example 5 - predicted :",y_pred_inv[20])
print("Example 5 - Actual :",y_test_new_inv[20])
print("Example 5 - Actual_before mlb transformation :",y_test[20])

Example 5 - predicted : ('male', 'aries', '35', 'technology')
Example 5 - Actual : ('student', 'female', 'aries', '17')
Example 5 - Actual_before mlb transformation : ['female', '17', 'student', 'aries']


In [47]:
print("Example 6 - predicted :",y_pred_inv[25])
print("Example 6 - Actual :",y_test_new_inv[25])
print("Example 6 - Actual_before mlb transformation :",y_test[25])

Example 6 - predicted : ('male', 'aries', '35', 'technology')
Example 6 - Actual : ('male', 'leo', '26')
Example 6 - Actual_before mlb transformation : ['male', '26', 'museumslibraries', 'leo']


In [48]:
print("Example 7 - predicted :",y_pred_inv[30])
print("Example 7 - Actual :",y_test_new_inv[30])
print("Example 7 - Actual_before mlb transformation :",y_test[30])

Example 7 - predicted : ('male', 'aries', '35', 'technology')
Example 7 - Actual : ('male', 'aries', '35', 'technology')
Example 7 - Actual_before mlb transformation : ['male', '35', 'technology', 'aries']


In [49]:
print("Example 7 - predicted :",y_pred_inv[35])
print("Example 7 - Actual :",y_test_new_inv[35])
print("Example 7 - Actual_before mlb transformation :",y_test[35])

Example 7 - predicted : ('male', 'aries', '35', 'technology')
Example 7 - Actual : ('male', 'aries', '35', 'technology')
Example 7 - Actual_before mlb transformation : ['male', '35', 'technology', 'aries']


In [50]:
print("Example 8 - predicted :",y_pred_inv[40])
print("Example 8 - Actual :",y_test_new_inv[40])
print("Example 8 - Actual_before mlb transformation :",y_test[40])

Example 8 - predicted : ('male', 'aries', '35', 'technology')
Example 8 - Actual : ('male', 'student', '17', 'sagittarius')
Example 8 - Actual_before mlb transformation : ['male', '17', 'student', 'sagittarius']


In [51]:
print("Example 9 - predicted :",y_pred_inv[45])
print("Example 9 - Actual :",y_test_new_inv[45])
print("Example 9 - Actual_before mlb transformation :",y_test[45])

Example 9 - predicted : ('male', 'aries', '35', 'technology')
Example 9 - Actual : ('student', 'female', '25', 'taurus')
Example 9 - Actual_before mlb transformation : ['female', '25', 'student', 'taurus']


In [52]:
print("Example 10 - predicted :",y_pred_inv[50])
print("Example 10 - Actual :",y_test_new_inv[50])
print("Example 10 - Actual_before mlb transformation :",y_test[50])

Example 10 - predicted : ('male', 'aries', '35', 'technology')
Example 10 - Actual : ('male', '15', 'libra', 'science')
Example 10 - Actual_before mlb transformation : ['male', '15', 'science', 'libra']
