In [1]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import time

from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

In [2]:
# load data
df = pd.read_csv(r"train.csv", usecols = ["text", "target"])
df.head(5)

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
print("col_names : \t" + df.columns)
print('\n')
print("Data-dimensions: \t" + str(df.shape))
print('\n')
print("Count the not-null values of each features: \n" + str(df.notnull().sum()))

Index(['col_names : \ttext', 'col_names : \ttarget'], dtype='object')


Data-dimensions: 	(7613, 2)


Count the not-null values of each features: 
text      7613
target    7613
dtype: int64


In [4]:
df.drop_duplicates(inplace = True)
print("The new dimension after checking duplicate & removing is:\t" + str(df.shape))

The new dimension after checking duplicate & removing is:	(7521, 2)


In [5]:
df['Text_length'] = df['text'].str.len()
df['Numb_words'] = df['text'].str.split().map(lambda x: len(x))
df.head()

Unnamed: 0,text,target,Text_length,Numb_words
0,Our Deeds are the Reason of this #earthquake M...,1,69,13
1,Forest fire near La Ronge Sask. Canada,1,38,7
2,All residents asked to 'shelter in place' are ...,1,133,22
3,"13,000 people receive #wildfires evacuation or...",1,65,8
4,Just got sent this photo from Ruby #Alaska as ...,1,88,16


In [7]:
import re
from spellchecker import SpellChecker    

def process_text(str_input):
    ## 1. Remove url_link
    remove_url = re.compile(r'https?://\S+|www\.\S+').sub(r'', str_input)
    
    ## 2. Remove html_link
    remove_html = re.compile(r'<.*?>').sub(r'', remove_url)
    
    ## 3. Remove Emojis
    remove_emo = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE).sub(r'', remove_html)
    words = re.sub(r"[^A-Za-z0-9\-]", " ", remove_emo).lower().split()    
        
    ## 4. spell_correction
    # spell = SpellChecker()
    # words = [spell.correction(word) for word in words]

    return words

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
text_process = CountVectorizer(analyzer = process_text).fit_transform(df['text'])

In [9]:
y = df.target.to_numpy()
X = df[['text', 'Text_length']]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

## Set test_size = 0.3
test_size = 0.3
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify = y, random_state = 42)

tfidf_vect = TfidfVectorizer(analyzer = process_text)

tfidf_train = tfidf_vect.fit(X_train['text']).transform(X_train['text']) 
tfidf_test = tfidf_vect.fit(X_train['text']).transform(X_test['text'])

X_train_vect = pd.concat([
                            X_train[['Text_length']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_train.toarray())
                        ], axis = 1)

X_test_vect = pd.concat([
                            X_test[['Text_length']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_test.toarray())
                        ], axis = 1)

tpot_clf = TPOTClassifier(generations=100, population_size=10, offspring_size=3 , cv=5,
                          verbosity=2, random_state=42)


# Fit the classifier to the training data
tpot_clf.fit(X_train_vect, y_train)

# Score on the test set
print(tpot_clf.score(X_test_vect, y_test))
print ('Fit&trainning time : ', time.time() - start)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=310.0, style=ProgressStyle(deâ€¦

Generation 1 - Current best internal CV score: 0.7690016574046992
Generation 2 - Current best internal CV score: 0.7690016574046992
Generation 3 - Current best internal CV score: 0.7690016574046992
Generation 4 - Current best internal CV score: 0.7690016574046992
Generation 5 - Current best internal CV score: 0.7690016574046992
Generation 6 - Current best internal CV score: 0.7690016574046992
Generation 7 - Current best internal CV score: 0.7690016574046992
Generation 8 - Current best internal CV score: 0.7690016574046992
Generation 9 - Current best internal CV score: 0.7690016574046992
Generation 10 - Current best internal CV score: 0.7845765674029299
Generation 11 - Current best internal CV score: 0.7870469670216186
Generation 12 - Current best internal CV score: 0.7870469670216186
Generation 13 - Current best internal CV score: 0.7870469670216186
Generation 14 - Current best internal CV score: 0.7870469670216186
Generation 15 - Current best internal CV score: 0.7870469670216186
Gene

***Hence, the best method is using the Pipeline belong with BernoulliNB and SelectPercentile.***

In [10]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import SelectPercentile

**Approach 1.**`First, we only focus on the BernoulliNB (without using Pipeline)`

In [14]:
clf = BernoulliNB(fit_prior=False)
clf.fit(X_train_vect, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=False)

In [15]:
preds = clf.predict(X_test_vect)

train_acc_NVB = accuracy_score(y_train, clf.predict(X_train_vect)) * 100.0

test_acc_NVB = accuracy_score(y_test, preds) * 100.0

**Checking the confusion_matrix**

In [16]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

print("Training_Accuracy: %.2f%%" % train_acc_NVB)
print("Testing_Accuracy: %.2f%%" % test_acc_NVB)
print(classification_report(y_test, preds))
print('Confusion Matrix: \n', confusion_matrix(y_test, preds))

Training_Accuracy: 90.67%
Testing_Accuracy: 80.91%
              precision    recall  f1-score   support

           0       0.80      0.90      0.84      1303
           1       0.83      0.69      0.76       981

    accuracy                           0.81      2284
   macro avg       0.81      0.79      0.80      2284
weighted avg       0.81      0.81      0.81      2284

Confusion Matrix: 
 [[1167  136]
 [ 300  681]]


**Using F-measure in the `confusion matrix`**

Remind that, `A confusion matrix is a summary of prediction results on a classification problem. The number of correct and incorrect predictions are summarized with count values and broken down by each class. This is the key to the confusion matrix. The confusion matrix shows the ways in which your classification model is confused when it makes predictions. It gives us insight not only into the errors being made by a classifier but more importantly the types of errors that are being made.`

$$\begin{array}{ccc} & \text{Class 1_predicted} & \text{Class 2_predicted} \\ \text{Class 1_actual} & \text{TP} & \text{FN} \\ \text{Class 2_actual} & \text{TN} & \text{FP} \end{array}$$

where,

-1) `True Positive (TP)` : Observation is positive, and is predicted to be positive.

-2) `False Negative (FN)` : Observation is positive, but is predicted negative.

-3) `True Negative (TN)` : Observation is negative, and is predicted to be negative.

-4) `False Positive (FP)` : Observation is negative, but is predicted positive.

In [17]:
Mat = confusion_matrix(y_test, preds)
TP = Mat[0, 0]    ## 
FP = Mat[1, 0]
FN = Mat[0, 1]
TP, FP, FN

(1167, 300, 136)

**Calculate the value of `Recall` & `Precision` from the `confusion_matrix`**, 

We have
$$ F_{\text{score}} = \dfrac{2*\text{recall}*\text{precision} }{\text{recall}+\text{precision}} $$

In [18]:
Reca = TP/(TP + FN)
Pres = TP/(TP + FP)
F_scr = 2*Reca*Pres/(Reca + Pres)*100
print("F-measure = %.2f%%."% F_scr)

F-measure = 84.26%.


**Approach 2. Now, I try to add one more feature: `Number of words` to this model. Let see the F_measure in the confusion matrix**

In [19]:
y = df.target.to_numpy()
X = df[['text', 'Text_length', 'Numb_words']]

from sklearn.feature_extraction.text import TfidfVectorizer

## Set test_size = 0.3
test_size = 0.3
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify = y, random_state = 42)

tfidf_vect = TfidfVectorizer(analyzer = process_text)

tfidf_train = tfidf_vect.fit(X_train['text']).transform(X_train['text']) 
tfidf_test = tfidf_vect.fit(X_train['text']).transform(X_test['text'])

X_train_vect = pd.concat([
                            X_train[['Text_length', 'Numb_words']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_train.toarray())
                        ], axis = 1)

X_test_vect = pd.concat([
                            X_test[['Text_length', 'Numb_words']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_test.toarray())
                        ], axis = 1)

clf = BernoulliNB(alpha = 1.0, fit_prior=False)
clf.fit(X_train_vect, y_train)

preds = clf.predict(X_test_vect)

train_acc_NVB = accuracy_score(y_train, clf.predict(X_train_vect)) * 100.0

test_acc_NVB = accuracy_score(y_test, preds) * 100.0

from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

print("Training_Accuracy: %.2f%%" % train_acc_NVB)
print("Testing_Accuracy: %.2f%%" % test_acc_NVB)
print(classification_report(y_test, preds))
print('Confusion Matrix: \n', confusion_matrix(y_test, preds))

Mat = confusion_matrix(y_test, preds)
TP = Mat[0, 0]
FP = Mat[1, 0]
FN = Mat[0, 1]

Reca = TP/(TP + FN)
Pres = TP/(TP + FP)
F_scr = 2*Reca*Pres/(Reca + Pres)*100
print("F-measure = %.2f%%."% F_scr)

Training_Accuracy: 90.67%
Testing_Accuracy: 80.91%
              precision    recall  f1-score   support

           0       0.80      0.90      0.84      1303
           1       0.83      0.69      0.76       981

    accuracy                           0.81      2284
   macro avg       0.81      0.79      0.80      2284
weighted avg       0.81      0.81      0.81      2284

Confusion Matrix: 
 [[1167  136]
 [ 300  681]]
F-measure = 84.26%.


As we see, the `F_measure = 84.26` and there are nothing change!!

**Approach 3. Finally, we use the Pipeline and BernoulliBN and SelectPercentile.**

In [20]:
from sklearn.feature_selection import SelectPercentile
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

clf = Pipeline([('scl', StandardScaler()),
                    ('clf',  BernoulliNB(SelectPercentile(percentile = 97)))
                   ])
clf

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 BernoulliNB(alpha=SelectPercentile(percentile=97,
                                                    score_func=<function f_classif at 0x00000273F50F4A68>),
                             binarize=0.0, class_prior=None, fit_prior=True))],
         verbose=False)

**List the names in `pipeline` with `.named_steps`**

In [21]:
clf.named_steps

{'scl': StandardScaler(copy=True, with_mean=True, with_std=True),
 'clf': BernoulliNB(alpha=SelectPercentile(percentile=97,
                                    score_func=<function f_classif at 0x00000273F50F4A68>),
             binarize=0.0, class_prior=None, fit_prior=True)}

**Checking the statement in `clf`**

In [22]:
clf.named_steps['clf']

BernoulliNB(alpha=SelectPercentile(percentile=97,
                                   score_func=<function f_classif at 0x00000273F50F4A68>),
            binarize=0.0, class_prior=None, fit_prior=True)

**Set the params `alpha` and `fit_prior` into BernoulliNB()**

In [26]:
clf.named_steps['clf'].set_params(alpha=1.0, fit_prior=False)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=False)

***Verify the parameters in pipeline***

In [27]:
clf

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=False))],
         verbose=False)

In [29]:
clf.fit(X_train_vect, y_train)

preds = clf.predict(X_test_vect)

train_acc_NVB = accuracy_score(y_train, clf.predict(X_train_vect)) * 100.0

test_acc_NVB = accuracy_score(y_test, preds) * 100.0

print("Training_Accuracy: %.2f%%" % train_acc_NVB)
print("Testing_Accuracy: %.2f%%" % test_acc_NVB)
print(classification_report(y_test, preds))
print('Confusion Matrix: \n', confusion_matrix(y_test, preds))

Mat = confusion_matrix(y_test, preds)
TP = Mat[0, 0]
FP = Mat[1, 0]
FN = Mat[0, 1]

Reca = TP/(TP + FN)
Pres = TP/(TP + FP)
F_scr = 2*Reca*Pres/(Reca + Pres)*100
print("F-measure = %.2f%%."% F_scr)

Training_Accuracy: 90.50%
Testing_Accuracy: 81.30%
              precision    recall  f1-score   support

           0       0.79      0.90      0.84      1303
           1       0.84      0.69      0.76       981

    accuracy                           0.81      2284
   macro avg       0.82      0.79      0.80      2284
weighted avg       0.81      0.81      0.81      2284

Confusion Matrix: 
 [[1172  131]
 [ 304  677]]
F-measure = 84.35%.
