In [1]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import time

from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

In [2]:
# load data
train_df = pd.read_csv(r"train.csv", usecols = ["text", "target"])
train_df.head(5)

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
test_df = pd.read_csv(r"test.csv", usecols = ["id", "text"])
test_df.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
print("col_names : " + train_df.columns)
print('\n')
print("Data-dimensions: \t" + str(train_df.shape))
print('\n')
print("Count the not-null values of each features: \n" + str(train_df.notnull().sum()))

Index(['col_names : text', 'col_names : target'], dtype='object')


Data-dimensions: 	(7613, 2)


Count the not-null values of each features: 
text      7613
target    7613
dtype: int64


In [6]:
train_df.drop_duplicates(inplace = True)
print("The new dimension after checking duplicate & removing is:\t" + str(train_df.shape))

The new dimension after checking duplicate & removing is:	(7521, 2)


In [7]:
train_df['Text_length'] = train_df['text'].str.len()
train_df['Numb_words'] = train_df['text'].str.split().map(lambda x: len(x))
train_df.head()

Unnamed: 0,text,target,Text_length,Numb_words
0,Our Deeds are the Reason of this #earthquake M...,1,69,13
1,Forest fire near La Ronge Sask. Canada,1,38,7
2,All residents asked to 'shelter in place' are ...,1,133,22
3,"13,000 people receive #wildfires evacuation or...",1,65,8
4,Just got sent this photo from Ruby #Alaska as ...,1,88,16


In [8]:
import re
from spellchecker import SpellChecker    

def process_text(str_input):
    ## 1. Remove url_link
    remove_url = re.compile(r'https?://\S+|www\.\S+').sub(r'', str_input)
    
    ## 2. Remove html_link
    remove_html = re.compile(r'<.*?>').sub(r'', remove_url)
    
    ## 3. Remove Emojis
    remove_emo = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE).sub(r'', remove_html)
    words = re.sub(r"[^A-Za-z0-9\-]", " ", remove_emo).lower().split()    
        
    ## 4. spell_correction
    # spell = SpellChecker()
    # words = [spell.correction(word) for word in words]

    return words

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
text_process = CountVectorizer(analyzer = process_text).fit_transform(train_df['text'])

In [10]:
y = train_df.target.to_numpy()
X = train_df[['text', 'Text_length']]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

## Set test_size = 0.3
test_size = 0.3
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify = y, random_state = 42)

tfidf_vect = TfidfVectorizer(analyzer = process_text)

tfidf_train = tfidf_vect.fit(X_train['text']).transform(X_train['text']) 
tfidf_test = tfidf_vect.fit(X_train['text']).transform(X_test['text'])

X_train_vect = pd.concat([
                            X_train[['Text_length']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_train.toarray())
                        ], axis = 1)

X_test_vect = pd.concat([
                            X_test[['Text_length']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_test.toarray())
                        ], axis = 1)

tpot_clf = TPOTClassifier(generations=100, population_size=10, offspring_size=3 , cv=5,
                          verbosity=2, random_state=42)


# Fit the classifier to the training data
tpot_clf.fit(X_train_vect, y_train)

# Score on the test set
print(tpot_clf.score(X_test_vect, y_test))
print ('Fit&trainning time : ', time.time() - start)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=310.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: 0.7690016574046992
Generation 2 - Current best internal CV score: 0.7690016574046992
Generation 3 - Current best internal CV score: 0.7690016574046992
Generation 4 - Current best internal CV score: 0.7690016574046992
Generation 5 - Current best internal CV score: 0.7690016574046992
Generation 6 - Current best internal CV score: 0.7690016574046992
Generation 7 - Current best internal CV score: 0.7690016574046992
Generation 8 - Current best internal CV score: 0.7690016574046992
Generation 9 - Current best internal CV score: 0.7690016574046992
Generation 10 - Current best internal CV score: 0.7845765674029299
Generation 11 - Current best internal CV score: 0.7870469670216186
Generation 12 - Current best internal CV score: 0.7870469670216186
Generation 13 - Current best internal CV score: 0.7870469670216186
Generation 14 - Current best internal CV score: 0.7870469670216186
Generation 15 - Current best internal CV score: 0.7870469670216186
Gene

***Hence, the best method is using the Pipeline belong with BernoulliNB and SelectPercentile.***

**Using F-measure in the `confusion matrix`**

Remind that, `A confusion matrix is a summary of prediction results on a classification problem. The number of correct and incorrect predictions are summarized with count values and broken down by each class. This is the key to the confusion matrix. The confusion matrix shows the ways in which your classification model is confused when it makes predictions. It gives us insight not only into the errors being made by a classifier but more importantly the types of errors that are being made.`

$$\begin{array}{ccc} & \text{Class 1_predicted} & \text{Class 2_predicted} \\ \text{Class 1_actual} & \text{TP} & \text{FN} \\ \text{Class 2_actual} & \text{TN} & \text{FP} \end{array}$$

where,

-1) `True Positive (TP)` : Observation is positive, and is predicted to be positive.

-2) `False Negative (FN)` : Observation is positive, but is predicted negative.

-3) `True Negative (TN)` : Observation is negative, and is predicted to be negative.

-4) `False Positive (FP)` : Observation is negative, but is predicted positive.

**Calculate the value of `Recall` & `Precision` from the `confusion_matrix`**, 

We have
$$ F_{\text{score}} = \dfrac{2*\text{recall}*\text{precision} }{\text{recall}+\text{precision}} $$

In [11]:
test_df['Text_length'] = test_df['text'].str.len()
test_df['Numb_words'] = test_df['text'].str.split().map(lambda x: len(x))

test_df.tail()

Unnamed: 0,id,text,Text_length,Numb_words
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,55,8
3259,10865,Storm in RI worse than last hurricane. My city...,139,23
3260,10868,Green Line derailment in Chicago http://t.co/U...,55,6
3261,10874,MEG issues Hazardous Weather Outlook (HWO) htt...,65,7
3262,10875,#CityofCalgary has activated its Municipal Eme...,68,8


In [14]:
y_train_df = train_df.target.to_numpy()
X_train_df = train_df[['text', 'Text_length', 'Numb_words']]

from sklearn.feature_extraction.text import TfidfVectorizer

## Set test_size = 0.3
test_size = 0.3
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X_train_df, y_train_df, test_size=test_size, 
                                                    stratify = y_train_df, random_state = 42)

tfidf_vect = TfidfVectorizer(analyzer = process_text)

tfidf_train = tfidf_vect.fit(X_train['text']).transform(X_train['text']) 
tfidf_test = tfidf_vect.fit(X_train['text']).transform(X_test['text'])
tfidf_test_df = tfidf_vect.fit(X_train['text']).transform(test_df['text'])

X_train_vect = pd.concat([
                            X_train[['Text_length', 'Numb_words']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_train.toarray())
                        ], axis = 1)

X_test_vect = pd.concat([
                            X_test[['Text_length', 'Numb_words']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_test.toarray())
                        ], axis = 1)

#X_test_df_vect.shape, X_test_vect.shape, y_train.shape
tfidf_train.shape, tfidf_test.shape, tfidf_test_df.shape

((5264, 13887), (2257, 13887), (3263, 13887))

In [15]:
from sklearn.feature_selection import SelectPercentile
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB

clf = Pipeline([('scl', StandardScaler()),
                    ('clf',  BernoulliNB(SelectPercentile(percentile = 97)))
                   ])
clf

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 BernoulliNB(alpha=SelectPercentile(percentile=97,
                                                    score_func=<function f_classif at 0x0000021A7A960558>),
                             binarize=0.0, class_prior=None, fit_prior=True))],
         verbose=False)

**List the names in `pipeline` with `.named_steps`**

In [16]:
clf.named_steps

{'scl': StandardScaler(copy=True, with_mean=True, with_std=True),
 'clf': BernoulliNB(alpha=SelectPercentile(percentile=97,
                                    score_func=<function f_classif at 0x0000021A7A960558>),
             binarize=0.0, class_prior=None, fit_prior=True)}

**Checking the statement in `clf`**

In [17]:
clf.named_steps['clf']

BernoulliNB(alpha=SelectPercentile(percentile=97,
                                   score_func=<function f_classif at 0x0000021A7A960558>),
            binarize=0.0, class_prior=None, fit_prior=True)

**Set the params `alpha` and `fit_prior` into BernoulliNB()**

In [18]:
clf.named_steps['clf'].set_params(alpha=1.0, fit_prior=False)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=False)

***Verify the parameters in pipeline***

In [19]:
clf

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=False))],
         verbose=False)

In [21]:
from sklearn.metrics import classification_report, confusion_matrix

clf.fit(X_train_vect, y_train)

preds = clf.predict(X_test_vect)

train_acc_NVB = accuracy_score(y_train, clf.predict(X_train_vect)) * 100.0

test_acc_NVB = accuracy_score(y_test, preds) * 100.0

print("Training_Accuracy: %.2f%%" % train_acc_NVB)
print("Testing_Accuracy: %.2f%%" % test_acc_NVB)
print(classification_report(y_test, preds))
print('Confusion Matrix: \n', confusion_matrix(y_test, preds))

Mat = confusion_matrix(y_test, preds)
TP = Mat[0, 0]
FP = Mat[1, 0]
FN = Mat[0, 1]

Reca = TP/(TP + FN)
Pres = TP/(TP + FP)
F_scr = 2*Reca*Pres/(Reca + Pres)*100
print("F-measure = %.2f%%."% F_scr)

Training_Accuracy: 90.33%
Testing_Accuracy: 79.40%
              precision    recall  f1-score   support

           0       0.77      0.90      0.83      1295
           1       0.83      0.65      0.73       962

    accuracy                           0.79      2257
   macro avg       0.80      0.77      0.78      2257
weighted avg       0.80      0.79      0.79      2257

Confusion Matrix: 
 [[1171  124]
 [ 341  621]]
F-measure = 83.43%.


**Create & predict the `X_test_df_vect` by combine the column `'text_lenght'` & `numb_words` in `'test_df'` with the `tfidf_test_df`**

In [24]:
X_test_df_vect = pd.concat([
                            test_df[['Text_length', 'Numb_words']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_test_df.toarray())
                        ], axis = 1)

predicts = clf.predict(X_test_df_vect)
predicts

array([0, 0, 1, ..., 1, 1, 1], dtype=int64)

**Create the csv file by combine the `predict(X_test_df_vect)` & `id`**

In [25]:
## create the dataframe
predicts = clf.predict(X_test_df_vect)

submission_df = pd.DataFrame({'id': test_df.id, 'target' : predicts})
submission_df.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,1
4,11,1


In [26]:
saved_path = "D://submission.csv"
#submission_df.to_csv(saved_path)