# Statistical NLP Project

## Loading required libaries

In [1]:
import numpy as np
import pandas as pd

import re

import nltk

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, average_precision_score, recall_score, f1_score

## Loading data

In [4]:
raw_data = pd.read_csv('blogtext.csv')
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  int64 
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


## Preprocessing data

In [5]:
raw_data.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [6]:
#working_data = raw_data.head(10) # to reduce the time complexity for model building
#working_data = raw_data.copy()
working_data = raw_data.head(10000)

In [7]:
working_data.text

0                  Info has been found (+/- 100 pages,...
1                  These are the team members:   Drewe...
2                  In het kader van kernfusie op aarde...
3                        testing!!!  testing!!!          
4                    Thanks to Yahoo!'s Toolbar I can ...
                              ...                        
9995            take me home with you forever where I ...
9996            seductive secretness behind doors warn...
9997            For being so kind to me when I need yo...
9998            blurry outside sounds as people mingle...
9999            my body feels broken while my mind rej...
Name: text, Length: 10000, dtype: object

In [8]:
def preprocess_data(input_column):
    working_column = input_column

    # a. Remove unwanted characters
    working_column = working_column.apply(lambda x : re.sub('[^A-Za-z]+', ' ', x))

    # b. Convert text to lowercase
    working_column = working_column.apply(lambda x : x.lower())

    # c. Remove unwanted spaces
    working_column = working_column.apply(lambda x: x.strip())

    # d. Remove stopwords
    stopwords = set(nltk.corpus.stopwords.words('English'))
    working_column = working_column.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))

    output_column = working_column
    return output_column

In [9]:
working_data.text = preprocess_data(working_data.text)

In [10]:
working_data.text

0       info found pages mb pdf files wait untill team...
1       team members drewes van der laag urllink mail ...
2       het kader van kernfusie op aarde maak je eigen...
3                                         testing testing
4       thanks yahoo toolbar capture urls popups means...
                              ...                        
9995    take home forever may rest sleep arms forgotte...
9997    kind need holding hand petting hair cry bring ...
9998    blurry outside sounds people mingle pass darkn...
9999    body feels broken mind rejoices thought warmth...
Name: text, Length: 10000, dtype: object

## Preparing data

In [11]:
def prepare_data(input_dataframe, columns_to_merge , columns_to_drop): #
    interim_dataframe = input_dataframe.copy()

    if len(columns_to_merge) > 0 :
        print(columns_to_merge)
        
        interim_dataframe['column'] = interim_dataframe[columns_to_merge[0]]
        for i in range(len(columns_to_merge)):
            print(columns_to_merge[i])
            if i>0:
                interim_dataframe['column'] = (interim_dataframe['column']+','+interim_dataframe[columns_to_merge[i]].astype(str)).to_list()
    
    interim_dataframe['column'] = interim_dataframe['column'].map(lambda x:x.split(','))

    if len(columns_to_drop) > 0 :
        interim_dataframe.drop(columns = columns_to_drop, inplace=True)


    output_dataframe = interim_dataframe.copy()
    return output_dataframe

In [12]:
working_data.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [13]:
working_data = prepare_data(working_data,['gender', 'age', 'topic', 'sign'],['id', 'gender', 'age', 'topic', 'sign', 'date'])
working_data.columns

['gender', 'age', 'topic', 'sign']
gender
age
topic
sign


Index(['text', 'column'], dtype='object')

In [14]:
working_data.head()

Unnamed: 0,text,column
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [15]:
working_data.rename(columns={"column":"label"}, inplace=True)

In [16]:
working_data.head()

Unnamed: 0,text,label
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


## Separating features and labels

In [17]:
X = working_data.text.values
y = working_data.label.values

## Splitting data into Train Test Validation data

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [19]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(8000,) (2000,) (8000,) (2000,)


## Vectorizing the feature data

### Creating Bag of words

In [21]:
countVectorizer = CountVectorizer(ngram_range=(1,2))
X_train_cv = countVectorizer.fit_transform(X_train)
X_test_cv = countVectorizer.transform(X_test)

In [22]:
len(countVectorizer.vocabulary_)

529906

### Printing term-document matrix

In [23]:
print(X_train_cv.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [24]:
X_train_cv.shape

(8000, 529906)

In [25]:
X_test_cv.shape

(2000, 529906)

## Creating dictionary to contain feature label counts

In [26]:
working_data.head()

Unnamed: 0,text,label
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [27]:
label_list = working_data.label.to_list()
len(label_list)

10000

In [28]:
type(label_list)

list

In [29]:
label_list[0]

['male', '15', 'Student', 'Leo']

In [30]:
ALL_WORDS = []
def load_dictionary(input_list):
    for list in input_list:
        for word in list:
            ALL_WORDS.append(word)
    return ALL_WORDS

ALL_WORDS = load_dictionary(label_list)

#ALL_WORDS

In [31]:
def create_dict(input_list):
    output_dict = dict()
    value_count = 0
    print(len(input_list))
    print(range(len(input_list)))
    # for i in range(len(input_list)-2):
        
    #     if input_list[i] in output_dict:
    #         value_count = output_dict[input_list[i]]
    #         value_count += 1
    #         print(label_list[i], value_count)
    #     else:
    #         value_count = 1
    #     output_dict.setdefault(input_list[i], value_count)

    for word in input_list:
        if word in output_dict:
            output_dict[word] += 1
        else:
            output_dict[word] = 1
    #output_dict = { 'key' : lambda counts : counts + 1 if in input_list[i] for i in range(len(input_list)) }

    return output_dict

In [32]:
label_dict = create_dict(ALL_WORDS)

40000
range(0, 40000)


In [33]:
type(label_dict)

dict

In [34]:
#label_dict

## Transforming the labels

In [35]:
multiLabelBinarizer = MultiLabelBinarizer(classes=sorted(label_dict.keys()))


In [36]:
multiLabelBinarizer

MultiLabelBinarizer(classes=['13', '14', '15', '16', '17', '23', '24', '25',
                             '26', '27', '33', '34', '35', '36', '37', '38',
                             '39', '40', '41', '42', '43', '44', '45', '46',
                             'Accounting', 'Aquarius', 'Aries', 'Arts',
                             'Automotive', 'Banking', ...])

In [37]:

y_train_mlb = multiLabelBinarizer.fit_transform(y_train)
y_test_mlb = multiLabelBinarizer.transform(y_test)

In [38]:
print(y_train_mlb, y_train_mlb.shape)

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 1]] (8000, 64)


In [39]:
print(y_test_mlb, y_test_mlb.shape)

[[0 0 0 ... 0 1 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]] (2000, 64)


## Preparing the classifier

In [40]:
classifier = LogisticRegression(solver='lbfgs')
classifier = OneVsRestClassifier(classifier)

In [41]:
classifier.fit(X_train_cv, y_train_mlb)

OneVsRestClassifier(estimator=LogisticRegression())

In [42]:
classifier.classes_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63])

In [43]:
classifier.n_classes_

64

In [44]:
classifier.estimator

LogisticRegression()

In [45]:
classifier.get_params()

{'estimator__C': 1.0,
 'estimator__class_weight': None,
 'estimator__dual': False,
 'estimator__fit_intercept': True,
 'estimator__intercept_scaling': 1,
 'estimator__l1_ratio': None,
 'estimator__max_iter': 100,
 'estimator__multi_class': 'auto',
 'estimator__n_jobs': None,
 'estimator__penalty': 'l2',
 'estimator__random_state': None,
 'estimator__solver': 'lbfgs',
 'estimator__tol': 0.0001,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': LogisticRegression(),
 'n_jobs': None}

In [46]:
classifier.n_features_in_

529906

## Predictions and results using the classifer

In [47]:
predictions_train = classifier.predict(X_train_cv)

In [48]:
accuracy_score(y_train_mlb, predictions_train)

0.9575

In [49]:
predictions_test = classifier.predict(X_test_cv)

In [51]:
predictions_test.shape

(2000, 64)

In [52]:
predictions_labels = multiLabelBinarizer.inverse_transform(predictions_test)
predictions_labels[0:5]

[('16', 'Cancer', 'indUnk', 'male'),
 ('Sports-Recreation', 'male'),
 ('Aries', 'male'),
 ('27', 'Taurus', 'female', 'indUnk'),
 ('Aries', 'male')]

In [54]:
print("F1: " + str(f1_score(y_test_mlb,predictions_test,average='micro')))
print("Recall: " + str(recall_score(y_test_mlb,predictions_test,average='micro')))
print("Precision: " + str(average_precision_score(y_test_mlb, predictions_test,average='micro')))
print("Accuracy:" + str(accuracy_score(y_test_mlb,predictions_test))) 

F1: 0.6487521166163587
Recall: 0.55075
Precision: 0.46271980510030447
Accuracy:0.326


## Examples of the predictions

In [None]:
for i in range(5):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test[i],
        ','.join(y_test_inversed[i]),
        ','.join(predictions_labels[i])
    ))

In [61]:
y_test_mlb_labels = multiLabelBinarizer.inverse_transform(y_test_mlb)

In [62]:

for i in range(5):
    print()
    print(X_test[i])
    print()
    print(y_test_mlb_labels[i])
    print()
    print(predictions_labels[i])
    print('#####################################')


okay forgot something tell guys im show weekend ya im directing one kid doesnt give crap isnt coming saturday learned two days ago go learning lines short amount time ill go tommrw hafta add lib alot today decided offended show hafta change everything aaaaaaaaaaahhhhhhhhhhhhhhhhhhh hw three songs learn lines comp ugh kill please k im done bitching think

('16', 'Cancer', 'indUnk', 'male')

('16', 'Cancer', 'indUnk', 'male')
#####################################

hit golf ball nbsp yes according calendar days since could last seems like months nbsp cant tell ecstatic nbsp backyard working backswing dad said dont try slow weight shift see feels felt fine nbsp worked full hard swing didnt hurt either went range immediately hit bucket balls nbsp little rusty expected nbsp goal hit every day week hopefully play full round thursday nbsp feels like whole summer reborn nbsp plus found tryouts week later thought optimism permeating usually case evening nbsp rest day sucked nbsp slept got read 