## Importing libraries

In [1]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import time
from sklearn.model_selection import train_test_split

## Loading the data

In [2]:
# load data
train_df = pd.read_csv(r"train.csv", usecols = ["text", "target"])
train_df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
test_df = pd.read_csv(r"test.csv", usecols = ["id", "text"])
test_df.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
print("col_names : " + train_df.columns)
print('\n')
print("Data-dimensions: \t" + str(train_df.shape))
print('\n')
print("Count the not-null values of each features: \n" + str(train_df.notnull().sum()))

Index(['col_names : text', 'col_names : target'], dtype='object')


Data-dimensions: 	(7613, 2)


Count the not-null values of each features: 
text      7613
target    7613
dtype: int64


**The training data has no null-value!**

## Checking and removing duplication

In [5]:
train_df.drop_duplicates(inplace = True)
print("The new dimension after checking  & removing duplication is:\t" + str(train_df.shape))

The new dimension after checking  & removing duplication is:	(7521, 2)


## Adding two additional features Text length and Number of words

In [6]:
train_df['Text_length'] = train_df['text'].str.len()
train_df['Numb_words'] = train_df['text'].str.split().map(lambda x: len(x))
train_df.head()

Unnamed: 0,text,target,Text_length,Numb_words
0,Our Deeds are the Reason of this #earthquake M...,1,69,13
1,Forest fire near La Ronge Sask. Canada,1,38,7
2,All residents asked to 'shelter in place' are ...,1,133,22
3,"13,000 people receive #wildfires evacuation or...",1,65,8
4,Just got sent this photo from Ruby #Alaska as ...,1,88,16


In [15]:
test_df['Text_length'] = test_df['text'].str.len()
test_df['Numb_words'] = test_df['text'].str.split().map(lambda x: len(x))
test_df.tail()

Unnamed: 0,id,text,Text_length,Numb_words
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,55,8
3259,10865,Storm in RI worse than last hurricane. My city...,139,23
3260,10868,Green Line derailment in Chicago http://t.co/U...,55,6
3261,10874,MEG issues Hazardous Weather Outlook (HWO) htt...,65,7
3262,10875,#CityofCalgary has activated its Municipal Eme...,68,8


## Cleaning the texts

In [7]:
import re
def process_text(str_input):
    ## 1. Remove url_link
    remove_url = re.compile(r'https?://\S+|www\.\S+').sub(r'', str_input)
    
    ## 2. Remove html_link
    remove_html = re.compile(r'<.*?>').sub(r'', remove_url)
    
    ## 3. Remove Emojis
    remove_emo = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE).sub(r'', remove_html)
    words = re.sub(r"[^A-Za-z0-9\-]", " ", remove_emo).lower().split()    
        
    ## 4. spell_correction
    # spell = SpellChecker()
    # words = [spell.correction(word) for word in words]

    return words

In [8]:
#from sklearn.feature_extraction.text import CountVectorizer
#text_process = CountVectorizer(analyzer = process_text).fit_transform(train_df['text'])

In [16]:
y_train_df = train_df.target.to_numpy()
X_train_df = train_df[['text', 'Text_length', 'Numb_words']]

from sklearn.feature_extraction.text import TfidfVectorizer

## Set test_size = 0.3
test_size = 0.3
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X_train_df, y_train_df, test_size=test_size, 
                                                    stratify = y_train_df, random_state = 42)

tfidf_vect = TfidfVectorizer(analyzer = process_text)

tfidf_train = tfidf_vect.fit(X_train['text']).transform(X_train['text']) 
tfidf_test = tfidf_vect.fit(X_train['text']).transform(X_test['text'])
tfidf_test_df = tfidf_vect.fit(X_train['text']).transform(test_df['text'])

X_train_vect = pd.concat([
                            X_train[['Text_length', 'Numb_words']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_train.toarray())
                        ], axis = 1)

X_test_vect = pd.concat([
                            X_test[['Text_length', 'Numb_words']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_test.toarray())
                        ], axis = 1)

#X_test_df_vect.shape, X_test_vect.shape, y_train.shape
tfidf_train.shape, tfidf_test.shape, tfidf_test_df.shape

((5264, 13887), (2257, 13887), (3263, 13887))

## Building and fitting the model

In [26]:
from sklearn.feature_selection import SelectPercentile
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB

clf = Pipeline([('scl', StandardScaler()),
                    ('clf',  BernoulliNB(SelectPercentile(percentile = 97)))
                   ])
clf.named_steps['clf'].set_params(alpha=1.0, fit_prior=False)
clf

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=False))],
         verbose=False)

In [27]:
from sklearn.metrics import classification_report, confusion_matrix

clf.fit(X_train_vect, y_train)

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=False))],
         verbose=False)

## Evaluate the model

In [28]:
preds = clf.predict(X_test_vect)

train_acc_NVB = accuracy_score(y_train, clf.predict(X_train_vect)) * 100.0

test_acc_NVB = accuracy_score(y_test, preds) * 100.0

print("Training_Accuracy: %.2f%%" % train_acc_NVB)
print("Testing_Accuracy: %.2f%%" % test_acc_NVB)
print(classification_report(y_test, preds))
print('Confusion Matrix: \n', confusion_matrix(y_test, preds))

Mat = confusion_matrix(y_test, preds)
TP = Mat[0, 0]
FP = Mat[1, 0]
FN = Mat[0, 1]

Reca = TP/(TP + FN)
Pres = TP/(TP + FP)
F_scr = 2*Reca*Pres/(Reca + Pres)*100
print("F-measure = %.2f%%."% F_scr)

Training_Accuracy: 90.33%
Testing_Accuracy: 79.40%
              precision    recall  f1-score   support

           0       0.77      0.90      0.83      1295
           1       0.83      0.65      0.73       962

    accuracy                           0.79      2257
   macro avg       0.80      0.77      0.78      2257
weighted avg       0.80      0.79      0.79      2257

Confusion Matrix: 
 [[1171  124]
 [ 341  621]]
F-measure = 83.43%.


## Prediction on test set

In [25]:
X_test_df_vect = pd.concat([
                            test_df[['Text_length', 'Numb_words']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_test_df.toarray())
                        ], axis = 1)

predicts = clf.predict(X_test_df_vect)
predicts

array([0, 0, 1, ..., 1, 1, 1])