# Statistical NLP Project

## Loading required libaries

In [53]:
import numpy as np
import pandas as pd

import re

import nltk

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, average_precision_score, recall_score, f1_score

## Loading data

In [54]:
raw_data = pd.read_csv('blogtext.csv')
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  int64 
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


## Preprocessing data

In [55]:
raw_data.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [56]:
#working_data = raw_data.head(10) # to reduce the time complexity for model building
#working_data = raw_data.copy()
working_data = raw_data.head(100000)

In [57]:
working_data.text

0                   Info has been found (+/- 100 pages,...
1                   These are the team members:   Drewe...
2                   In het kader van kernfusie op aarde...
3                         testing!!!  testing!!!          
4                     Thanks to Yahoo!'s Toolbar I can ...
                               ...                        
99995                THE HINDU - 125 YEARS             ...
99996                DILBERT & IIT-ans                 ...
99997                Case Study : How HP won $3 billion...
99998                Championing Chennai               ...
99999                WEEKEND                         It...
Name: text, Length: 100000, dtype: object

In [58]:
def preprocess_data(input_column):
    working_column = input_column

    # a. Remove unwanted characters
    working_column = working_column.apply(lambda x : re.sub('[^A-Za-z]+', ' ', x))

    # b. Convert text to lowercase
    working_column = working_column.apply(lambda x : x.lower())

    # c. Remove unwanted spaces
    working_column = working_column.apply(lambda x: x.strip())

    # d. Remove stopwords
    stopwords = set(nltk.corpus.stopwords.words('English'))
    working_column = working_column.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))

    output_column = working_column
    return output_column

In [59]:
working_data.text = preprocess_data(working_data.text)

In [60]:
working_data.text

0        info found pages mb pdf files wait untill team...
1        team members drewes van der laag urllink mail ...
2        het kader van kernfusie op aarde maak je eigen...
3                                          testing testing
4        thanks yahoo toolbar capture urls popups means...
                               ...                        
99995    hindu years great see special edition hindu co...
99996    dilbert iit ans global iit brand finds space u...
99997    case study hp billion p g outsourcing deal bea...
99998    championing chennai bangalore iim hyderabad ho...
99999    weekend turned rather interesting different we...
Name: text, Length: 100000, dtype: object

## Preparing data

In [61]:
def prepare_data(input_dataframe, columns_to_merge , columns_to_drop): #
    interim_dataframe = input_dataframe.copy()

    if len(columns_to_merge) > 0 :
        print(columns_to_merge)
        
        interim_dataframe['column'] = interim_dataframe[columns_to_merge[0]]
        for i in range(len(columns_to_merge)):
            print(columns_to_merge[i])
            if i>0:
                interim_dataframe['column'] = (interim_dataframe['column']+','+interim_dataframe[columns_to_merge[i]].astype(str)).to_list()
    
    interim_dataframe['column'] = interim_dataframe['column'].map(lambda x:x.split(','))

    if len(columns_to_drop) > 0 :
        interim_dataframe.drop(columns = columns_to_drop, inplace=True)


    output_dataframe = interim_dataframe.copy()
    return output_dataframe

In [62]:
working_data.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [63]:
working_data = prepare_data(working_data,['gender', 'age', 'topic', 'sign'],['id', 'gender', 'age', 'topic', 'sign', 'date'])
working_data.columns

['gender', 'age', 'topic', 'sign']
gender
age
topic
sign


Index(['text', 'column'], dtype='object')

In [64]:
working_data.head()

Unnamed: 0,text,column
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [65]:
working_data.rename(columns={"column":"label"}, inplace=True)

In [66]:
working_data.head()

Unnamed: 0,text,label
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


## Separating features and labels

In [67]:
X = working_data.text.values
y = working_data.label.values

## Splitting data into Train Test Validation data

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [69]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(80000,) (20000,) (80000,) (20000,)


## Vectorizing the feature data

### Creating Bag of words

In [70]:
countVectorizer = CountVectorizer(ngram_range=(1,2))
X_train_cv = countVectorizer.fit_transform(X_train)
X_test_cv = countVectorizer.transform(X_test)

In [71]:
len(countVectorizer.vocabulary_)

4156959

### Printing term-document matrix

In [72]:
# print(X_train_cv.toarray())

In [73]:
X_train_cv.shape

(80000, 4156959)

In [74]:
X_test_cv.shape

(20000, 4156959)

## Creating dictionary to contain feature label counts

In [75]:
working_data.head()

Unnamed: 0,text,label
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [76]:
label_list = working_data.label.to_list()
len(label_list)

100000

In [77]:
type(label_list)

list

In [78]:
label_list[0]

['male', '15', 'Student', 'Leo']

In [79]:
ALL_WORDS = []
def load_dictionary(input_list):
    for list in input_list:
        for word in list:
            ALL_WORDS.append(word)
    return ALL_WORDS

ALL_WORDS = load_dictionary(label_list)

#ALL_WORDS

In [80]:
def create_dict(input_list):
    output_dict = dict()
    value_count = 0
    print(len(input_list))
    print(range(len(input_list)))
    # for i in range(len(input_list)-2):
        
    #     if input_list[i] in output_dict:
    #         value_count = output_dict[input_list[i]]
    #         value_count += 1
    #         print(label_list[i], value_count)
    #     else:
    #         value_count = 1
    #     output_dict.setdefault(input_list[i], value_count)

    for word in input_list:
        if word in output_dict:
            output_dict[word] += 1
        else:
            output_dict[word] = 1
    #output_dict = { 'key' : lambda counts : counts + 1 if in input_list[i] for i in range(len(input_list)) }

    return output_dict

In [81]:
label_dict = create_dict(ALL_WORDS)

400000
range(0, 400000)


In [82]:
type(label_dict)

dict

In [83]:
#label_dict

## Transforming the labels

In [84]:
multiLabelBinarizer = MultiLabelBinarizer(classes=sorted(label_dict.keys()))


In [85]:
multiLabelBinarizer

MultiLabelBinarizer(classes=['13', '14', '15', '16', '17', '23', '24', '25',
                             '26', '27', '33', '34', '35', '36', '37', '38',
                             '39', '40', '41', '42', '43', '44', '45', '46',
                             '47', '48', 'Accounting', 'Advertising',
                             'Agriculture', 'Aquarius', ...])

In [86]:

y_train_mlb = multiLabelBinarizer.fit_transform(y_train)
y_test_mlb = multiLabelBinarizer.transform(y_test)

In [87]:
print(y_train_mlb, y_train_mlb.shape)

[[0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]] (80000, 80)


In [88]:
print(y_test_mlb, y_test_mlb.shape)

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 1 1]
 [0 0 0 ... 1 1 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]] (20000, 80)


## Preparing the classifier

In [89]:
classifier = LogisticRegression(solver='lbfgs')
classifier = OneVsRestClassifier(classifier)

In [90]:
classifier.fit(X_train_cv, y_train_mlb)

OneVsRestClassifier(estimator=LogisticRegression())

In [91]:
classifier.classes_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79])

In [92]:
classifier.n_classes_

80

In [93]:
classifier.estimator

LogisticRegression()

In [94]:
classifier.get_params()

{'estimator__C': 1.0,
 'estimator__class_weight': None,
 'estimator__dual': False,
 'estimator__fit_intercept': True,
 'estimator__intercept_scaling': 1,
 'estimator__l1_ratio': None,
 'estimator__max_iter': 100,
 'estimator__multi_class': 'auto',
 'estimator__n_jobs': None,
 'estimator__penalty': 'l2',
 'estimator__random_state': None,
 'estimator__solver': 'lbfgs',
 'estimator__tol': 0.0001,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': LogisticRegression(),
 'n_jobs': None}

In [95]:
classifier.n_features_in_

4156959

## Predictions and results using the classifer

In [96]:
predictions_train = classifier.predict(X_train_cv)

In [97]:
accuracy_score(y_train_mlb, predictions_train)

0.84975

In [98]:
predictions_test = classifier.predict(X_test_cv)

In [99]:
predictions_test.shape

(20000, 80)

In [100]:
predictions_labels = multiLabelBinarizer.inverse_transform(predictions_test)
predictions_labels[0:5]

[('male',),
 ('23', 'Sagittarius', 'indUnk', 'male'),
 ('Scorpio', 'male'),
 ('female',),
 ('female',)]

In [101]:
print("F1: " + str(f1_score(y_test_mlb,predictions_test,average='micro')))
print("Recall: " + str(recall_score(y_test_mlb,predictions_test,average='micro')))
print("Precision: " + str(average_precision_score(y_test_mlb, predictions_test,average='micro')))
print("Accuracy:" + str(accuracy_score(y_test_mlb,predictions_test))) 

F1: 0.4957069829768227
Recall: 0.37455
Precision: 0.3057141344696043
Accuracy:0.121


# *Interestingly, as and how I have increased the sample data size from the actual raw data(few thousands to now 100000) , (just as I can) the accuracy kept dropping from 70% to 12% now.*

## Examples of the predictions

In [102]:
y_test_mlb_labels = multiLabelBinarizer.inverse_transform(y_test_mlb)

In [103]:

for i in range(5):
    print()
    print(X_test[i])
    print()
    print(y_test_mlb_labels[i])
    print()
    print(predictions_labels[i])
    print('#####################################')


pretty disturbing although weird think put loads weight turns pants full squirrels

('35', 'Aries', 'Technology', 'male')

('male',)
#####################################

jer according post posted sa guestbook ko last april th kailangan kong humanda dahil sa gagawa ka ng bagong site ano ba kailan ba ako maghahanda front page oo nga pala dahil sa sem break kami rito wala akong magawa sa buhay ko gonna make lot changes urllink midnght delusions already started changing pics urllink k previous topic mon ami guy told thought falling love remember one used elven language talk maybe simply one used elven language regardless feel done great wrong whatever boggles mind think mind still working simply think first another point think love defined way broad word noun verb sometimes different part speech altogether like think feel love different forms love immaterial material things love eat one love dance love collection whatnots love people love mother father whole lot others wormed heart ever