# Project Statistical NLP



---



**Load the dataset**

In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd
import io
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns;

import warnings
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


In [0]:
btext_df = pd.read_csv('/content/drive/My Drive/ABK_AIML/NLP/blogtext.csv', delimiter=',')

In [3]:
btext_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [4]:
btext_df.shape

(681284, 7)

In [0]:
btext_dfsample = btext_df.sample(n=10000, random_state=1)

In [6]:
btext_dfsample.shape

(10000, 7)

In [7]:
btext_df['gender'].value_counts()

male      345193
female    336091
Name: gender, dtype: int64

In [8]:
btext_dfsample['gender'].value_counts()

male      5108
female    4892
Name: gender, dtype: int64

**Drop ID and Date**

In [0]:
btext_dfsample.drop(columns=['id','date'], axis=1, inplace=True)

In [10]:
btext_dfsample.head()

Unnamed: 0,gender,age,topic,sign,text
25639,male,33,indUnk,Pisces,Let's say you have friends that hav...
216060,male,15,Technology,Aries,Was officially the COOLEST FUCKING ...
633204,male,17,Student,Gemini,"Apparently, a few people consider..."
582291,male,27,indUnk,Aries,His nose is too big for his face. Eyes...
366878,female,27,indUnk,Gemini,urlLink urlLink 16-feb-04


**Preprocess rows of the dataset** 
*   **Remove unwanted characters**

*   **Convert text to lowercase** 
*   **Remove unwanted spaces** 


*   **Remove stopwords**


In [0]:
from bs4 import BeautifulSoup
import re
import unicodedata
from nltk.stem import WordNetLemmatizer

In [12]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

In [0]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [0]:
def remove_special_characters(text, remove_digits=False):
    #Using regex
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [0]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])    

In [0]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [0]:
def normalize_corpus(corpus, html_stripping=True, accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [0]:
btext_dfsample['text'] = normalize_corpus(btext_dfsample['text'])
btext_dfsample['topic'] = normalize_corpus(btext_dfsample['topic'])
btext_dfsample['gender'] = normalize_corpus(btext_dfsample['gender'])
btext_dfsample['sign'] = normalize_corpus(btext_dfsample['sign'])

In [20]:
btext_dfsample.head()

Unnamed: 0,gender,age,topic,sign,text
25639,male,33,indunk,pisces,lets say you have friend that have stood by yo...
216060,male,15,technology,aries,wa officially the coolest fucking day ever bla...
633204,male,17,student,gemini,apparently a few people considered that cory w...
582291,male,27,indunk,aries,his nose is too big for his face eyes soft lit...
366878,female,27,indunk,gemini,urllink urllink feb


In [21]:
btext_dfsample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 25639 to 127105
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   gender  10000 non-null  object
 1   age     10000 non-null  int64 
 2   topic   10000 non-null  object
 3   sign    10000 non-null  object
 4   text    10000 non-null  object
dtypes: int64(1), object(4)
memory usage: 468.8+ KB


In [22]:
btext_dfsample['gender'].value_counts()

male      5108
female    4892
Name: gender, dtype: int64

In [23]:
btext_dfsample['age'].unique()

array([33, 15, 17, 27, 16, 25, 14, 34, 46, 41, 23, 39, 26, 24, 45, 13, 43,
       42, 47, 40, 36, 35, 37, 44, 38, 48])

In [24]:
btext_dfsample['topic'].unique()

array(['indunk', 'technology', 'student', 'nonprofit', 'consulting',
       'art', 'education', 'chemical', 'law', 'fashion',
       'communicationsmedia', 'engineering', 'internet', 'religion',
       'science', 'sportsrecreation', 'accounting', 'agriculture',
       'banking', 'publishing', 'lawenforcementsecurity',
       'humanresources', 'construction', 'advertising', 'transportation',
       'telecommunication', 'biotech', 'investmentbanking',
       'businessservices', 'automotive', 'marketing', 'government',
       'museumslibraries', 'realestate', 'tourism', 'military',
       'manufacturing', 'architecture', 'environment', 'maritime'],
      dtype=object)

In [25]:
btext_dfsample['sign'].unique()

array(['pisces', 'aries', 'gemini', 'aquarius', 'cancer', 'capricorn',
       'scorpio', 'leo', 'libra', 'virgo', 'sagittarius', 'taurus'],
      dtype=object)

In [0]:
btext_dfsample = btext_dfsample.astype({"age": str})

In [27]:
btext_dfsample.dtypes

gender    object
age       object
topic     object
sign      object
text      object
dtype: object

**A Dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label.**

In [0]:
count_labels=dict(btext_dfsample['gender'].value_counts())
count_labels.update(dict(btext_dfsample['age'].value_counts()))
count_labels.update(dict(btext_dfsample['topic'].value_counts()))
count_labels.update(dict(btext_dfsample['sign'].value_counts()))

In [29]:
len(count_labels)

80

In [30]:
count_labels

{'13': 178,
 '14': 410,
 '15': 576,
 '16': 1108,
 '17': 1209,
 '23': 1066,
 '24': 1182,
 '25': 1006,
 '26': 773,
 '27': 639,
 '33': 276,
 '34': 297,
 '35': 250,
 '36': 217,
 '37': 162,
 '38': 120,
 '39': 87,
 '40': 82,
 '41': 51,
 '42': 39,
 '43': 67,
 '44': 21,
 '45': 57,
 '46': 50,
 '47': 29,
 '48': 48,
 'accounting': 75,
 'advertising': 80,
 'agriculture': 22,
 'aquarius': 677,
 'architecture': 34,
 'aries': 1017,
 'art': 469,
 'automotive': 25,
 'banking': 53,
 'biotech': 28,
 'businessservices': 59,
 'cancer': 923,
 'capricorn': 741,
 'chemical': 38,
 'communicationsmedia': 283,
 'construction': 8,
 'consulting': 98,
 'education': 406,
 'engineering': 155,
 'environment': 15,
 'fashion': 75,
 'female': 4892,
 'gemini': 799,
 'government': 94,
 'humanresources': 44,
 'indunk': 3700,
 'internet': 215,
 'investmentbanking': 24,
 'law': 129,
 'lawenforcementsecurity': 31,
 'leo': 773,
 'libra': 895,
 'male': 5108,
 'manufacturing': 37,
 'maritime': 5,
 'marketing': 63,
 'military': 38

**Label columns to merge: “gender”, “age”, “topic”, “sign”**

In [0]:
# btext_dfsample['labels'] = btext_dfsample['gender'].str.cat(btext_dfsample['age'], sep=", ").str.cat(btext_dfsample['topic'], sep=", ").str.cat(btext_dfsample['sign'], sep=", ")

btext_dfsample['labels'] = btext_dfsample[['gender', 'age', 'topic', 'sign']].values.tolist()

In [0]:
btext_dfsample.drop(columns=['gender','age','topic','sign'], inplace=True)

In [33]:
btext_dfsample.head()

Unnamed: 0,text,labels
25639,lets say you have friend that have stood by yo...,"[male, 33, indunk, pisces]"
216060,wa officially the coolest fucking day ever bla...,"[male, 15, technology, aries]"
633204,apparently a few people considered that cory w...,"[male, 17, student, gemini]"
582291,his nose is too big for his face eyes soft lit...,"[male, 27, indunk, aries]"
366878,urllink urllink feb,"[female, 27, indunk, gemini]"


**Separating features and labels, and splitting the data into training and testing**

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X = btext_dfsample['text']
Y = btext_dfsample['labels']

In [0]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=1)

In [37]:
print(x_train.shape)
print(y_train.shape)

(7000,)
(7000,)


In [38]:
print(x_test.shape)
print(y_test.shape)

(3000,)
(3000,)


In [39]:
y_train.head()

296027       [female, 23, student, leo]
5850       [female, 27, indunk, taurus]
413325    [male, 36, realestate, virgo]
17542      [female, 15, student, virgo]
278394       [male, 33, art, capricorn]
Name: labels, dtype: object

In [40]:
y_test.head()

514717               [male, 14, indunk, aquarius]
552986    [male, 46, communicationsmedia, gemini]
287557           [female, 23, technology, taurus]
182349            [male, 25, engineering, gemini]
247677     [female, 27, businessservices, pisces]
Name: labels, dtype: object

**Vectorizing the features  
Creating a Bag of Words using count vectorizer, Using ngram_range=(1, 2)** 

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer(ngram_range=(1,2))

In [42]:
#Feed train data to CountVectorizer
cvect.fit(x_train)

#Check the vocablury size
len(cvect.vocabulary_)

573327

In [0]:
X_train_ct = cvect.transform(x_train)

In [44]:
#Size of Document Term Matrix
X_train_ct.shape

(7000, 573327)

In [45]:
print(X_train_ct[0])

  (0, 36324)	1
  (0, 36729)	1
  (0, 73707)	1
  (0, 73743)	1
  (0, 131983)	1
  (0, 132389)	1
  (0, 133339)	1
  (0, 133659)	1
  (0, 184450)	1
  (0, 184661)	1
  (0, 243985)	1
  (0, 246719)	1
  (0, 273310)	1
  (0, 273325)	1
  (0, 372671)	1
  (0, 372689)	1
  (0, 493152)	1
  (0, 501075)	1
  (0, 501685)	1
  (0, 536218)	1
  (0, 536293)	1
  (0, 546152)	1
  (0, 546516)	1


**Vectorizing training and testing features**

In [0]:
X_test_ct = cvect.transform(x_test)

In [47]:
X_test_ct.shape

(3000, 573327)

In [0]:
cv_matrix = X_test_ct.toarray()

In [0]:
vocab = cvect.get_feature_names()

**Printing the term-document matrix**

In [50]:
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,__,__ ___,__ but,__ man,__ of,__ oh,___,___ god,___ love,___ moment,___ my,___ she,___ went,_____,_____ and,_____ right,_____ the,_____ this,______,______ peace,_______,_______ ako,_______ it,_______ man,________,________ my,__________,__________ something,______________,______________ my,_________________,_________________ will,_______________________,_______________________ ratings,_________________________,_________________________ the,___________________________,___________________________ ratings______________________________________________________________,_____________________________________________________,_____________________________________________________ current,...,zuph,zuph an,zuphite,zuphite from,zur,zur conclusion,zwallet,zwallet comindex,zwei,zwei parallelen,zwei sprachen,zx,zx wat,zx went,zyban,zyban lame,zylryan,zylryan and,zz,zz next,zzaniah,zzn,zzn com,zzs,zzs after,zzz,zzz same,zzzs,zzzs no,zzzz,zzzz sabby,zzzz the,zzzzs,zzzzs but,zzzzz,zzzzz drop,zzzzzzz,zzzzzzzz,zzzzzzzz shucks,zzzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


 **Transforming the labels**
 **Converting train and test labels using MultiLabelBinarizer**

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [0]:
y_train_ = mlb.fit_transform(y_train)
y_test_ = mlb.transform(y_test)

In [53]:
mlb.classes_

array(['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', 'accounting', 'advertising', 'agriculture',
       'aquarius', 'architecture', 'aries', 'art', 'automotive',
       'banking', 'biotech', 'businessservices', 'cancer', 'capricorn',
       'chemical', 'communicationsmedia', 'construction', 'consulting',
       'education', 'engineering', 'environment', 'fashion', 'female',
       'gemini', 'government', 'humanresources', 'indunk', 'internet',
       'investmentbanking', 'law', 'lawenforcementsecurity', 'leo',
       'libra', 'male', 'manufacturing', 'maritime', 'marketing',
       'military', 'museumslibraries', 'nonprofit', 'pisces',
       'publishing', 'realestate', 'religion', 'sagittarius', 'science',
       'scorpio', 'sportsrecreation', 'student', 'taurus', 'technology',
       'telecommunication', 'tourism', 'transportation', 'virgo'],
      dtype=object)

In [54]:
y_train_.shape

(7000, 80)

In [55]:
y_test_.shape

(3000, 80)

**Using LogisticRegression classifier, wrapping it up in OneVsRestClassifier to train it on every label**

In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [0]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [0]:
clf = LogisticRegression(solver='lbfgs')
clf = OneVsRestClassifier(clf)

In [59]:
clf.fit(X_train_ct, y_train_)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [0]:
y_predict = clf.predict(X_test_ct)

In [61]:
metrics.accuracy_score(y_test_, y_predict)

0.002

In [0]:
lgr_recall = metrics.recall_score(y_test_, y_predict, pos_label='positive', average='micro')
lgr_precision = metrics.precision_score(y_test_, y_predict, pos_label='positive', average='micro')
lgr_f1score = metrics.f1_score(y_test_, y_predict, pos_label='positive', average='micro')

In [63]:
print(color.BOLD + "F1-score: "+ color.END +"{}".format(lgr_f1score*100))
print(color.BOLD + "Precision Score: "+ color.END +"{}".format(lgr_precision*100))
print(color.BOLD + "Recall Score: "+ color.END +"{}".format(lgr_recall*100))

[1mF1-score: [0m28.845807574575353
[1mPrecision Score: [0m51.55546020167346
[1mRecall Score: [0m20.025000000000002


**Examples of True label and Predicted label** 


In [0]:
y_act = pd.DataFrame(mlb.inverse_transform(y_test_))

In [0]:
y_pred = pd.DataFrame(mlb.inverse_transform(y_predict))

In [66]:
y_act.head(5)

Unnamed: 0,0,1,2,3
0,14,aquarius,indunk,male
1,46,communicationsmedia,gemini,male
2,23,female,taurus,technology
3,25,engineering,gemini,male
4,27,businessservices,female,pisces


In [67]:
y_pred.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
0,female,student,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,cancer,male,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,male,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,male,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,male,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
