In [2]:
import nltk
import string
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict, Counter

import re
from nltk.tokenize import word_tokenize
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

### Loading & viewing dataset.
- References [link_1](https://github.com/Nhan121/Kaggle-6-first-projects/blob/master/NLP_Text_Classification/NLP_Text_classification.ipynb)

In [3]:
path = r'../input/nlp-getting-started'
train = pd.read_csv(path + '/train.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## 1. Pre-processing
### 1.1. Counting & handling missing-values

In [4]:
pd.DataFrame({'count_NA': train.isna().sum(), 
              '%NA': round(100*train.isna().sum() / train.shape[0], 2), 
              'type' :train.dtypes})

Unnamed: 0,count_NA,%NA,type
id,0,0.0,int64
keyword,61,0.8,object
location,2533,33.27,object
text,0,0.0,object
target,0,0.0,int64


#### Comment.
- For the column `keyword`, it takes about `0.8%` at both dataset; whilethe column `location` is more than `33%`. Both of them (both columns) are the `object` types so we can replace the missing values by `unknown` to keep the structure preservation.
- The rest columns `id, text, target` (in `train.csv`) and `id, text` (in `test.csv`) has no missing value since we will predict the `target` (and add them to the `test`) mainly based on the `text`.

In [5]:
train = train.fillna('unknown')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,unknown,unknown,Our Deeds are the Reason of this #earthquake M...,1
1,4,unknown,unknown,Forest fire near La Ronge Sask. Canada,1
2,5,unknown,unknown,All residents asked to 'shelter in place' are ...,1
3,6,unknown,unknown,"13,000 people receive #wildfires evacuation or...",1
4,7,unknown,unknown,Just got sent this photo from Ruby #Alaska as ...,1


### 1.2. Droping duplicated-values
#### The unique-values.

In [6]:
count_unique = [len(train[col].unique()) for col in train.columns]
percent_uniq = [round(100*cnt / train.shape[0], 2) for cnt in count_unique]
pd.DataFrame({'cnt_uniq': count_unique, 'perc_uniq_%': percent_uniq}, index = train.columns)

Unnamed: 0,cnt_uniq,perc_uniq_%
id,7613,100.0
keyword,222,2.92
location,3342,43.9
text,7503,98.56
target,2,0.03


#### Droping the duplicates values (after ignore the `id`)

In [7]:
train_non_id = train.drop(columns = ['id'])
train_non_id = train_non_id.drop_duplicates()
train_non_id.shape

(7561, 4)

### 1.3. Text-processing.

In [8]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.5.6-py2.py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 3.0 MB/s eta 0:00:01
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.5.6
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


The following function is prepared for the Section 3.2 (Model using `text-processing`)

In [9]:
from pre_process import *
## Review the first 5 lines after using text-processing
new_train = train.copy()
%time new_train.loc[:5, 'correct_text'] = train.loc[:5, 'text'].apply(lambda x: process_text(x))
new_train.head()

CPU times: user 7.53 ms, sys: 924 µs, total: 8.46 ms
Wall time: 8.39 ms


Unnamed: 0,id,keyword,location,text,target,correct_text
0,1,unknown,unknown,Our Deeds are the Reason of this #earthquake M...,1,"[our, deeds, are, the, reason, of, this, earth..."
1,4,unknown,unknown,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,unknown,unknown,All residents asked to 'shelter in place' are ...,1,"[all, residents, asked, to, shelter, in, place..."
3,6,unknown,unknown,"13,000 people receive #wildfires evacuation or...",1,"[13, 000, people, receive, wildfires, evacuati..."
4,7,unknown,unknown,Just got sent this photo from Ruby #Alaska as ...,1,"[just, got, sent, this, photo, from, ruby, ala..."


## 2. Grid-Seach CV.
### 2.1. Initialize the model

In [10]:
train['Text_length'] = train['text'].str.len()
train['Numb_words'] = train['text'].str.split().map(lambda x: len(x))
train = train.set_index('id')
train.head()

Unnamed: 0_level_0,keyword,location,text,target,Text_length,Numb_words
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,unknown,unknown,Our Deeds are the Reason of this #earthquake M...,1,69,13
4,unknown,unknown,Forest fire near La Ronge Sask. Canada,1,38,7
5,unknown,unknown,All residents asked to 'shelter in place' are ...,1,133,22
6,unknown,unknown,"13,000 people receive #wildfires evacuation or...",1,65,8
7,unknown,unknown,Just got sent this photo from Ruby #Alaska as ...,1,88,16


### Train-test split.
- 2 columns `keyword, location` has so many categories so that it can not be used to establish the model.
- It make more time in computation and the efficients is not good.

In [11]:
# Initialize the tfidf_vectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english') 
X = tfidf_vectorizer.fit_transform(train['text']) 

## Target
y = train['target']

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.3, random_state=42)

X_train.shape, X_test.shape, y_train.shape

((5329, 21363), (2284, 21363), (5329,))

### 2.2. Lauching with SVM model

In [13]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

svm = svm.SVC()
grid_params = [{
                'kernel':['linear', 'rbf', 'poly'],
                'C': [0.1, 1, 5], #default: 1.0
                 }]
clf_svm = GridSearchCV(estimator=svm, param_grid = grid_params, cv = 8, verbose = 0)
%time clf_svm.fit(X_train, y_train)

CPU times: user 3min 25s, sys: 3.44 s, total: 3min 29s
Wall time: 3min 29s


GridSearchCV(cv=8, estimator=SVC(),
             param_grid=[{'C': [0.1, 1, 5],
                          'kernel': ['linear', 'rbf', 'poly']}])

In [14]:
clf_svm.best_params_

{'C': 1, 'kernel': 'linear'}

In [15]:
clf_svm.best_score_

0.7907679043361202

In [16]:
from sklearn.metrics import accuracy_score

pred_train = clf_svm.predict(X_train)
pred_test = clf_svm.predict(X_test)

print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

0.9502720960780634
0.8104203152364273


In [17]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train, pred_train))
print(confusion_matrix(y_test, pred_test))

[[2994   45]
 [ 220 2070]]
[[1178  125]
 [ 308  673]]


In [18]:
df = pd.DataFrame(columns = ['best_params', 'train_acc_%', 'train_conf_matrix', 'test_acc_%', 'test_conf_matrix'])
df.loc['SVM'] = [clf_svm.best_params_, 
                 100*accuracy_score(y_train, pred_train), confusion_matrix(y_train, pred_train), 
                 100*accuracy_score(y_test, pred_test), confusion_matrix(y_test, pred_test)]
df

Unnamed: 0,best_params,train_acc_%,train_conf_matrix,test_acc_%,test_conf_matrix
SVM,"{'C': 1, 'kernel': 'linear'}",95.02721,"[[2994, 45], [220, 2070]]",81.042032,"[[1178, 125], [308, 673]]"


## 3. And for another model

In [19]:
from sklearn.preprocessing import StandardScaler
X_train_num = train[['Text_length', 'Numb_words']].to_numpy()
X_con = StandardScaler().fit_transform(X_train_num)

from sklearn.feature_extraction.text import CountVectorizer
X_C = CountVectorizer(analyzer = process_text).fit_transform(train['text'])

X_train, X_test, y_train, y_test = train_test_split(X_C, y, stratify = y, test_size=0.3, random_state=0)

X_train.shape, X_test.shape, y_train.shape

((5329, 17330), (2284, 17330), (5329,))

### 3.1. Naive Bayes

In [20]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB

naiv = MultinomialNB()
grid_params = [{'alpha' : [0.5, 0.75, 0.8, 1]}]
clf = GridSearchCV(estimator=naiv, param_grid = grid_params, cv = 5, verbose = 0)
%time clf.fit(X_train, y_train)

pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

print(clf.best_params_)
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

df.loc['MultiNB'] = [clf.best_params_, 
                     100*accuracy_score(y_train, pred_train), confusion_matrix(y_train, pred_train), 
                     100*accuracy_score(y_test, pred_test), confusion_matrix(y_test, pred_test)]

CPU times: user 93.4 ms, sys: 0 ns, total: 93.4 ms
Wall time: 93.2 ms
{'alpha': 1}
0.903921936573466
0.8025394045534151


### 3.2. Logistic-Regression

In [21]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.3, random_state=42)
logreg = LogisticRegression()
grid_params = [{'C' : [0.5, 1, 2, 10],
                'max_iter': [500, 1000]}]
clf = GridSearchCV(estimator = logreg, param_grid = grid_params, cv = 15, verbose = 0)
%time clf.fit(X_train, y_train)

pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

print(clf.best_params_)
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

df.loc['Log-reg'] = [clf.best_params_, 
                     100*accuracy_score(y_train, pred_train), confusion_matrix(y_train, pred_train), 
                     100*accuracy_score(y_test, pred_test), confusion_matrix(y_test, pred_test)]

CPU times: user 38.5 s, sys: 531 ms, total: 39.1 s
Wall time: 19.6 s
{'C': 2, 'max_iter': 500}
0.9356352036029274
0.8042907180385289


### 3.3. K-Nearest-Neighbors Classifier

In [22]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
grid_params = [{'n_neighbors': [50, 80, 100, 150]}]

clf = GridSearchCV(estimator = knn, param_grid = grid_params, cv = 5, verbose = 0)
%time clf.fit(X_train, y_train)

pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

print(clf.best_params_)
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

df.loc['K-nn'] = [clf.best_params_, 
                  100*accuracy_score(y_train, pred_train), confusion_matrix(y_train, pred_train), 
                  100*accuracy_score(y_test, pred_test), confusion_matrix(y_test, pred_test)]

CPU times: user 2.94 s, sys: 150 ms, total: 3.09 s
Wall time: 3 s
{'n_neighbors': 100}
0.7766935635203603
0.7863397548161121


### 3.4. Random Forest Classifier

In [23]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(X_C, y, stratify = y, test_size=0.3, random_state=42)
grid_params = [{
                'n_estimators': [200, 300], #default=10
                'criterion': ['gini'], #default=”gini”
                'max_depth': [5, 10, 20], #default=None
                'oob_score': [True], #default=False
                'random_state': [0, 42, 88]
                 }]

clf = GridSearchCV(estimator = rfc, param_grid = grid_params, cv = 5, verbose = 0)
%time clf.fit(X_train, y_train)

pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

print(clf.best_params_)
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

df.loc['rfc'] = [clf.best_params_, 
                 100*accuracy_score(y_train, pred_train), confusion_matrix(y_train, pred_train), 
                 100*accuracy_score(y_test, pred_test), confusion_matrix(y_test, pred_test)]

CPU times: user 3min 20s, sys: 495 ms, total: 3min 20s
Wall time: 3min 20s
{'criterion': 'gini', 'max_depth': 20, 'n_estimators': 300, 'oob_score': True, 'random_state': 88}
0.7350347157065116
0.717600700525394


### 3.5. XGB Classifier.

In [24]:
from xgboost import XGBClassifier

xgb = XGBClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.3, random_state=42)
grid_params = [{
                'n_estimators': [500, 1000, 5000],
                'learning_rate': [.05, 0.1],
                'max_depth': [5, 7, 10],
            }]
clf = GridSearchCV(estimator = xgb, param_grid = grid_params, cv = 5, verbose = 0)
%time clf.fit(X_train, y_train,eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds = 20, verbose = 0)

CPU times: user 16min 39s, sys: 6.9 s, total: 16min 46s
Wall time: 4min 18s


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

In [25]:
print(clf.best_params_)
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

df.loc['XGB-simple'] = [clf.best_params_, 
                         100*accuracy_score(y_train, pred_train), confusion_matrix(y_train, pred_train), 
                         100*accuracy_score(y_test, pred_test), confusion_matrix(y_test, pred_test)]

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 500}
0.7350347157065116
0.717600700525394


#### XGB with complexity-model

In [26]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.field]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.field]]

In [27]:
import time
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline, FeatureUnion
svc = SVC()

X_t = train[['text', 'Text_length']]
X_train, X_test, y_train, y_test = train_test_split(X_t, 
                                                    y, 
                                                    test_size = 0.3, 
                                                    stratify = y, 
                                                    random_state = 42)
clf = Pipeline([
    (
        'features', FeatureUnion([
        ('text', Pipeline([
            ('colext', TextSelector('text')),
            ('tfidf', TfidfVectorizer(tokenizer = process_text, stop_words = 'english',
                     min_df = .0025, max_df = 0.25, ngram_range = (1, 9) ) ),
            ('svd', TruncatedSVD(algorithm ='randomized', n_components = 300) ), #for XGB
        ])),
        ('words', Pipeline([
            ('wordext', NumberSelector('Text_length')),
            ('wscaler', StandardScaler()),
        ])),            
    ])
    ),
    ('clf', XGBClassifier(eval_set=[(X_train, y_train), (X_test, y_test)],
                          max_depth = 8, n_estimators = 500, base_estimator = svc, learning_rate = 0.1, cv = 15))
    ])

## Fit the model
start = time.time()
clf.fit(X_train, y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
print ('Fit&trainning time : ', time.time() - start)

train_acc_Xgb2 = accuracy_score(y_train, clf.predict(X_train)) * 100.0 
test_acc_Xgb2 = accuracy_score(y_test, pred_test) * 100.0

print("Training_Accuracy: %.2f%%" % train_acc_Xgb2)
print("Testing_Accuracy: %.2f%%" % test_acc_Xgb2)

Parameters: { base_estimator, cv, eval_set } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Fit&trainning time :  58.100852489471436
Training_Accuracy: 98.99%
Testing_Accuracy: 76.66%


In [28]:
df.loc['XGB-complex'] = ["", 
                         100*accuracy_score(y_train, pred_train), confusion_matrix(y_train, pred_train), 
                         100*accuracy_score(y_test, pred_test), confusion_matrix(y_test, pred_test)]

### 3.6. Adaboost

In [29]:
from sklearn.ensemble import AdaBoostClassifier

X_t = train[['text', 'Numb_words']]
X_train, X_test, y_train, y_test = train_test_split(X_t, 
                                                    y, 
                                                    test_size = 0.3, 
                                                    stratify = y, 
                                                    random_state = 42)

clf = Pipeline([
    (
        'features', FeatureUnion([
        ('text', Pipeline([
            ('colext', TextSelector('text')),
            ('tfidf', TfidfVectorizer(tokenizer = process_text, stop_words = 'english',
                     min_df = .0025, max_df = 0.25, ngram_range = (1, 5) ) ),
            ('svd', TruncatedSVD(algorithm ='randomized', n_components = 300) ), 
        ])),
        ('words', Pipeline([
            ('wordext', NumberSelector('Numb_words')),
            ('wscaler', StandardScaler()),
        ])),            
    ])
    ),
    ('clf', AdaBoostClassifier(n_estimators = 300, learning_rate = 0.1)),
    ])

start = time.time()
%time clf.fit(X_train, y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

df.loc['Ada_Bst-complex'] = ["", 
                             100*accuracy_score(y_train, pred_train), confusion_matrix(y_train, pred_train), 
                             100*accuracy_score(y_test, pred_test), confusion_matrix(y_test, pred_test)]
df

CPU times: user 1min 2s, sys: 48.6 ms, total: 1min 2s
Wall time: 1min 1s


Unnamed: 0,best_params,train_acc_%,train_conf_matrix,test_acc_%,test_conf_matrix
SVM,"{'C': 1, 'kernel': 'linear'}",95.02721,"[[2994, 45], [220, 2070]]",81.042032,"[[1178, 125], [308, 673]]"
MultiNB,{'alpha': 1},90.392194,"[[2917, 122], [390, 1900]]",80.25394,"[[1122, 181], [270, 711]]"
Log-reg,"{'C': 2, 'max_iter': 500}",93.56352,"[[2991, 48], [295, 1995]]",80.429072,"[[1180, 123], [324, 657]]"
K-nn,{'n_neighbors': 100},77.669356,"[[2649, 390], [800, 1490]]",78.633975,"[[1122, 181], [307, 674]]"
rfc,"{'criterion': 'gini', 'max_depth': 20, 'n_esti...",73.503472,"[[3039, 0], [1412, 878]]",71.76007,"[[1292, 11], [634, 347]]"
XGB-simple,"{'learning_rate': 0.1, 'max_depth': 10, 'n_est...",73.503472,"[[3039, 0], [1412, 878]]",71.76007,"[[1292, 11], [634, 347]]"
XGB-complex,,98.986677,"[[3028, 11], [43, 2247]]",76.663748,"[[1103, 200], [333, 648]]"
Ada_Bst-complex,,77.387878,"[[2703, 336], [869, 1421]]",74.781086,"[[1119, 184], [392, 589]]"


### 4. Submission the best_model

In [30]:
path = r'../input/nlp-getting-started'
test_df = pd.read_csv(path + '/test.csv')
X = tfidf_vectorizer.transform(test_df['text']) 
preds = clf_svm.predict(X)
preds

array([0, 1, 1, ..., 1, 1, 0])

In [31]:
sub_df = pd.DataFrame({'id': test_df['id'], 'target': preds})
sub_df.to_csv('submit_ML_model.csv')

### Final results.
80.25 % acc