# Project Statistical NLP



---



**Load the dataset**

In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd
import io
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns;

import warnings
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


In [0]:
btext_df = pd.read_csv('/content/drive/My Drive/ABK_AIML/NLP/blogtext.csv', delimiter=',')

In [3]:
btext_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [4]:
btext_df.shape

(681284, 7)

In [0]:
btext_dfsample = btext_df.sample(n=10000, random_state=1)

In [6]:
btext_dfsample.shape

(10000, 7)

In [7]:
btext_df['gender'].value_counts()

male      345193
female    336091
Name: gender, dtype: int64

In [8]:
btext_dfsample['gender'].value_counts()

male      5108
female    4892
Name: gender, dtype: int64

In [0]:
btext_dfsample = btext_df.head(10000)

**Drop ID and Date**

In [0]:
btext_dfsample.drop(columns=['id','date'], axis=1, inplace=True)

In [11]:
btext_dfsample.head()

Unnamed: 0,gender,age,topic,sign,text
0,male,15,Student,Leo,"Info has been found (+/- 100 pages,..."
1,male,15,Student,Leo,These are the team members: Drewe...
2,male,15,Student,Leo,In het kader van kernfusie op aarde...
3,male,15,Student,Leo,testing!!! testing!!!
4,male,33,InvestmentBanking,Aquarius,Thanks to Yahoo!'s Toolbar I can ...


**Preprocess rows of the dataset** 
*   **Remove unwanted characters**

*   **Convert text to lowercase** 
*   **Remove unwanted spaces** 


*   **Remove stopwords**


In [0]:
from bs4 import BeautifulSoup
import re
import unicodedata
from nltk.stem import WordNetLemmatizer

In [13]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

In [0]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [0]:
def remove_special_characters(text, remove_digits=False):
    #Using regex
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [0]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])    

In [0]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [0]:
def normalize_corpus(corpus, html_stripping=True, accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [0]:
btext_dfsample['text'] = normalize_corpus(btext_dfsample['text'])
btext_dfsample['topic'] = normalize_corpus(btext_dfsample['topic'])
btext_dfsample['gender'] = normalize_corpus(btext_dfsample['gender'])
btext_dfsample['sign'] = normalize_corpus(btext_dfsample['sign'])

In [21]:
btext_dfsample.head()

Unnamed: 0,gender,age,topic,sign,text
0,male,15,student,leo,info ha been found pages and mb of pdf files n...
1,male,15,student,leo,these are the team members drewes van der laag...
2,male,15,student,leo,in het kader van kernfusie op aarde maak je ei...
3,male,15,student,leo,testing testing
4,male,33,investmentbanking,aquarius,thanks to yahoo s toolbar i can now capture th...


In [22]:
btext_dfsample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   gender  10000 non-null  object
 1   age     10000 non-null  int64 
 2   topic   10000 non-null  object
 3   sign    10000 non-null  object
 4   text    10000 non-null  object
dtypes: int64(1), object(4)
memory usage: 390.8+ KB


In [23]:
btext_dfsample['gender'].value_counts()

male      5916
female    4084
Name: gender, dtype: int64

In [24]:
btext_dfsample['age'].unique()

array([15, 33, 14, 25, 17, 23, 37, 26, 24, 27, 45, 34, 41, 44, 16, 39, 35,
       36, 46, 42, 13, 38, 43, 40])

In [25]:
btext_dfsample['topic'].unique()

array(['student', 'investmentbanking', 'indunk', 'nonprofit', 'banking',
       'education', 'engineering', 'science', 'communicationsmedia',
       'businessservices', 'sportsrecreation', 'art', 'internet',
       'museumslibraries', 'accounting', 'technology', 'law',
       'consulting', 'automotive', 'religion', 'fashion', 'publishing',
       'marketing', 'lawenforcementsecurity', 'humanresources',
       'telecommunication'], dtype=object)

In [26]:
btext_dfsample['sign'].unique()

array(['leo', 'aquarius', 'aries', 'capricorn', 'gemini', 'cancer',
       'sagittarius', 'scorpio', 'libra', 'virgo', 'taurus', 'pisces'],
      dtype=object)

In [0]:
btext_dfsample = btext_dfsample.astype({"age": str})

In [28]:
btext_dfsample.dtypes

gender    object
age       object
topic     object
sign      object
text      object
dtype: object

**A Dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label.**

In [0]:
count_labels=dict(btext_dfsample['gender'].value_counts())
count_labels.update(dict(btext_dfsample['age'].value_counts()))
count_labels.update(dict(btext_dfsample['topic'].value_counts()))
count_labels.update(dict(btext_dfsample['sign'].value_counts()))

In [30]:
len(count_labels)

64

In [31]:
count_labels

{'13': 42,
 '14': 212,
 '15': 602,
 '16': 440,
 '17': 1185,
 '23': 253,
 '24': 655,
 '25': 386,
 '26': 234,
 '27': 1054,
 '33': 136,
 '34': 553,
 '35': 2315,
 '36': 1708,
 '37': 33,
 '38': 46,
 '39': 79,
 '40': 1,
 '41': 20,
 '42': 14,
 '43': 6,
 '44': 3,
 '45': 16,
 '46': 7,
 'accounting': 4,
 'aquarius': 571,
 'aries': 4198,
 'art': 45,
 'automotive': 14,
 'banking': 16,
 'businessservices': 91,
 'cancer': 504,
 'capricorn': 215,
 'communicationsmedia': 99,
 'consulting': 21,
 'education': 270,
 'engineering': 127,
 'fashion': 1622,
 'female': 4084,
 'gemini': 150,
 'humanresources': 2,
 'indunk': 3287,
 'internet': 118,
 'investmentbanking': 70,
 'law': 11,
 'lawenforcementsecurity': 10,
 'leo': 301,
 'libra': 491,
 'male': 5916,
 'marketing': 156,
 'museumslibraries': 17,
 'nonprofit': 71,
 'pisces': 454,
 'publishing': 4,
 'religion': 9,
 'sagittarius': 1097,
 'science': 63,
 'scorpio': 971,
 'sportsrecreation': 80,
 'student': 1137,
 'taurus': 812,
 'technology': 2654,
 'telecomm

**Label columns to merge: “gender”, “age”, “topic”, “sign”**

In [0]:
# btext_dfsample['labels'] = btext_dfsample['gender'].str.cat(btext_dfsample['age'], sep=", ").str.cat(btext_dfsample['topic'], sep=", ").str.cat(btext_dfsample['sign'], sep=", ")

btext_dfsample['labels'] = btext_dfsample[['gender', 'age', 'topic', 'sign']].values.tolist()

In [0]:
btext_dfsample.drop(columns=['gender','age','topic','sign'], inplace=True)

In [34]:
btext_dfsample.head()

Unnamed: 0,text,labels
0,info ha been found pages and mb of pdf files n...,"[male, 15, student, leo]"
1,these are the team members drewes van der laag...,"[male, 15, student, leo]"
2,in het kader van kernfusie op aarde maak je ei...,"[male, 15, student, leo]"
3,testing testing,"[male, 15, student, leo]"
4,thanks to yahoo s toolbar i can now capture th...,"[male, 33, investmentbanking, aquarius]"


**Separating features and labels, and splitting the data into training and testing**

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X = btext_dfsample['text']
Y = btext_dfsample['labels']

In [0]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=1)

In [38]:
print(x_train.shape)
print(y_train.shape)

(7000,)
(7000,)


In [39]:
print(x_test.shape)
print(y_test.shape)

(3000,)
(3000,)


In [40]:
y_train.head()

2228    [male, 35, technology, aries]
5910     [female, 27, indunk, taurus]
1950    [male, 35, technology, aries]
2119    [male, 35, technology, aries]
5947     [female, 27, indunk, taurus]
Name: labels, dtype: object

In [41]:
y_test.head()

9953    [female, 16, indunk, capricorn]
3850        [male, 14, student, pisces]
4962      [female, 17, indunk, scorpio]
3886       [female, 36, indunk, pisces]
5437      [female, 17, indunk, scorpio]
Name: labels, dtype: object

**Vectorizing the features  
Creating a Bag of Words using count vectorizer, Using ngram_range=(1, 2)** 

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer(ngram_range=(1,2))

In [43]:
#Feed train data to CountVectorizer
cvect.fit(x_train)

#Check the vocablury size
len(cvect.vocabulary_)

480455

In [0]:
X_train_ct = cvect.transform(x_train)

In [45]:
#Size of Document Term Matrix
X_train_ct.shape

(7000, 480455)

In [46]:
print(X_train_ct[0])

  (0, 9313)	1
  (0, 10239)	1
  (0, 16322)	2
  (0, 19652)	2
  (0, 28338)	1
  (0, 29819)	1
  (0, 37065)	1
  (0, 37167)	1
  (0, 93554)	1
  (0, 93647)	1
  (0, 110014)	1
  (0, 110209)	1
  (0, 188678)	4
  (0, 189111)	4
  (0, 225015)	1
  (0, 225019)	1
  (0, 226448)	4
  (0, 226470)	4
  (0, 229160)	1
  (0, 229227)	1
  (0, 237814)	1
  (0, 237831)	1
  (0, 246235)	1
  (0, 246934)	1
  (0, 333913)	1
  (0, 333916)	1
  (0, 390234)	1
  (0, 390243)	1
  (0, 397237)	1
  (0, 405446)	1
  (0, 411777)	1
  (0, 412262)	1
  (0, 417504)	1
  (0, 419825)	4
  (0, 420710)	1
  (0, 421985)	1
  (0, 422032)	1
  (0, 422091)	1
  (0, 455060)	1
  (0, 455336)	1
  (0, 457434)	1
  (0, 457467)	1
  (0, 475917)	2
  (0, 476272)	1
  (0, 477253)	1


**Vectorizing training and testing features**

In [0]:
X_test_ct = cvect.transform(x_test)

In [48]:
X_test_ct.shape

(3000, 480455)

In [0]:
cv_matrix = X_test_ct.toarray()

In [0]:
vocab = cvect.get_feature_names()

**Printing the term-document matrix**

In [51]:
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,__,__ ___,__ all,__ and,__ being,__ bit,__ chance,__ congrats,__ english,__ geo,__ goodluck,__ he,__ ill,__ keke,__ or,__ shame,__ so,__ that,__ the,__ until,__ which,__ yes,__ you,___,___ ___,___ and,___ but,___ chinese,___ currently,___ had,___ haha,___ happy,___ im,___ ive,___ like,___ my,___ of,___ ok,___ played,___ slip,...,zuluchim from,zuluchim heyness,zuluchim oh,zuluchim uh,zuluchim very,zumanity,zumanity at,zuo,zuo le,zur,zur gumitlikeit,zurich,zurich switzerland,zy,zywiec,zywiec this,zza,zza dongeast,zzz,zzzs,zzzs before,zzzs well,zzzz,zzzz first,zzzz glad,zzzz slept,zzzzs,zzzzs when,zzzzz,zzzzz drop,zzzzzs,zzzzzs at,zzzzzz,zzzzzz had,zzzzzzz,zzzzzzzzzz,zzzzzzzzzz didnt,zzzzzzzzzzzz,zzzzzzzzzzzzz,zzzzzzzzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


 **Transforming the labels**
 **Converting train and test labels using MultiLabelBinarizer**

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [0]:
y_train_ = mlb.fit_transform(y_train)
y_test_ = mlb.transform(y_test)

In [54]:
mlb.classes_

array(['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', 'accounting', 'aquarius', 'aries', 'art', 'automotive',
       'banking', 'businessservices', 'cancer', 'capricorn',
       'communicationsmedia', 'consulting', 'education', 'engineering',
       'fashion', 'female', 'gemini', 'humanresources', 'indunk',
       'internet', 'investmentbanking', 'law', 'lawenforcementsecurity',
       'leo', 'libra', 'male', 'marketing', 'museumslibraries',
       'nonprofit', 'pisces', 'publishing', 'religion', 'sagittarius',
       'science', 'scorpio', 'sportsrecreation', 'student', 'taurus',
       'technology', 'virgo'], dtype=object)

In [55]:
y_train_.shape

(7000, 63)

In [56]:
y_test_.shape

(3000, 63)

**Using LogisticRegression classifier, wrapping it up in OneVsRestClassifier to train it on every label**

In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [0]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [0]:
clf = LogisticRegression(solver='lbfgs')
clf = OneVsRestClassifier(clf)

In [60]:
clf.fit(X_train_ct, y_train_)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [0]:
y_predict = clf.predict(X_test_ct)

In [62]:
metrics.accuracy_score(y_test_, y_predict)

0.2926666666666667

In [0]:
lgr_recall = metrics.recall_score(y_test_, y_predict, pos_label='positive', average='micro')
lgr_precision = metrics.precision_score(y_test_, y_predict, pos_label='positive', average='micro')
lgr_f1score = metrics.f1_score(y_test_, y_predict, pos_label='positive', average='micro')

In [64]:
print(color.BOLD + "F1-score: "+ color.END +"{}".format(lgr_f1score*100))
print(color.BOLD + "Precision Score: "+ color.END +"{}".format(lgr_precision*100))
print(color.BOLD + "Recall Score: "+ color.END +"{}".format(lgr_recall*100))

[1mF1-score: [0m63.57524903086512
[1mPrecision Score: [0m77.29387901205106
[1mRecall Score: [0m53.992332055342565


**Examples of True label and Predicted label** 


In [0]:
y_act = pd.DataFrame(mlb.inverse_transform(y_test_))

In [0]:
y_pred = pd.DataFrame(mlb.inverse_transform(y_predict))

In [67]:
y_act.head(5)

Unnamed: 0,0,1,2,3
0,16,capricorn,female,indunk
1,14,male,pisces,student
2,17,female,indunk,scorpio
3,36,female,indunk,pisces
4,17,female,indunk,scorpio


In [68]:
y_pred.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,35,aries,male,technology,,,,,,,,
1,aries,male,,,,,,,,,,
2,35,aries,male,technology,,,,,,,,
3,36,female,indunk,,,,,,,,,
4,female,,,,,,,,,,,
