In [1]:
import string
import pandas as pd
import time

## import modules for NLP
import nltk
from nltk.corpus import stopwords

## import the modules for train & test data
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

## word2vec with gensim; press pip install -U gensim before importing
import gensim

## Otherwise
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import classification_report, confusion_matrix
from keras.preprocessing.text import one_hot

### Loading & viewing dataset.
- References [link_1](https://github.com/Nhan121/Kaggle-6-first-projects/blob/master/NLP_Text_Classification/NLP_Text_classification.ipynb)

In [2]:
path = r'../input/nlp-getting-started'
train_df = pd.read_csv(path + '/train.csv')
test_df = pd.read_csv(path + '/test.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## 1. Model 1. Only using One-hot Encoding
### 1.1. One-hot Encoding

In [3]:
test_size = 0.35
text = list(train_df['text'])
vocab_size = 50000

encoded_docs = [one_hot(d, vocab_size) for d in text]
print(encoded_docs[:10])

[[902, 10396, 48308, 15005, 20564, 5275, 35195, 26550, 15270, 14367, 49640, 8031, 41777], [38776, 4518, 46920, 31425, 43419, 11088, 44938], [41777, 33062, 38696, 13990, 33219, 4316, 47688, 48308, 48056, 467, 22366, 20371, 10097, 3729, 43488, 41712, 21047, 4316, 20566, 44982, 48308, 11870], [42587, 43490, 43837, 36338, 14427, 43488, 44982, 4316, 2529], [27160, 13762, 14138, 35195, 18371, 27738, 22141, 36689, 39232, 47402, 27738, 14427, 46582, 27862, 18452, 16905], [33906, 21225, 2529, 25498, 42633, 40256, 4316, 19610, 17944, 29419, 13990, 33147, 27639, 4518, 25497, 14427], [48722, 34631, 30357, 5867, 40749, 26399, 10343, 5275, 37765, 4316, 493, 39698, 16203, 2986], [37127, 10292, 11207, 5275, 15005, 15983, 13110, 17500, 36123, 34332, 18452, 4518, 4316, 15005, 19627], [24745, 13440, 1561, 43488, 45217, 24046, 4316, 15005, 1041, 16366, 15005, 32817], [37127, 36971, 48439, 15005, 26136, 37598, 4442, 13990, 902, 14539]]


#### Checking the max-length of text

In [4]:
train_df['Numb_words'] = train_df['text'].str.split().map(lambda x: len(x))
test_df['Numb_words'] = test_df['text'].str.split().map(lambda x: len(x))
train_df['Numb_words'].max(), test_df['Numb_words'].max()

(31, 31)

In [5]:
max_length = 31
embedding_dim = 32
trunc_type='post'

padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

print(padded_docs)
padded_docs.shape

[[  902 10396 48308 ...     0     0     0]
 [38776  4518 46920 ...     0     0     0]
 [41777 33062 38696 ...     0     0     0]
 ...
 [21256 31929   681 ...     0     0     0]
 [ 8317   216 31009 ...     0     0     0]
 [15005 21245  3473 ...     0     0     0]]


(7613, 31)

#### Lauching model

In [6]:
from tensorflow import keras
from tensorflow.keras import layers
from keras import layers
from keras.models import Sequential
from keras.layers import Flatten
from keras.layers import Dense, Dropout, Activation, Conv1D, GlobalMaxPooling1D, MaxPooling1D

y = train_df.target.to_numpy()
x_train, x_test, y_train, y_test = train_test_split(padded_docs, y, test_size=test_size, 
                                                    stratify = y, random_state = 42)

model = Sequential()
model.add(Embedding(vocab_size, 120, input_length=max_length))
model.add(Dropout(0.1))
model.add(Flatten())
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 31, 120)           6000000   
_________________________________________________________________
dropout (Dropout)            (None, 31, 120)           0         
_________________________________________________________________
flatten (Flatten)            (None, 3720)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 3721      
Total params: 6,003,721
Trainable params: 6,003,721
Non-trainable params: 0
_________________________________________________________________


In [7]:
history = model.fit(x_train, y_train, 
                    epochs = 5, batch_size = 128,
                    verbose = 2, validation_data = (x_test, y_test))

Epoch 1/5
39/39 - 3s - loss: 0.6638 - acc: 0.5918 - val_loss: 0.6364 - val_acc: 0.6795
Epoch 2/5
39/39 - 2s - loss: 0.5571 - acc: 0.7902 - val_loss: 0.5539 - val_acc: 0.7565
Epoch 3/5
39/39 - 2s - loss: 0.4027 - acc: 0.8775 - val_loss: 0.4902 - val_acc: 0.7797
Epoch 4/5
39/39 - 2s - loss: 0.2646 - acc: 0.9281 - val_loss: 0.4677 - val_acc: 0.7962
Epoch 5/5
39/39 - 2s - loss: 0.1704 - acc: 0.9602 - val_loss: 0.4644 - val_acc: 0.7992


#### Evaluate accuracy

In [8]:
score, tr_acc = model.evaluate(x_train, y_train)
score, t_acc = model.evaluate(x_test, y_test)

print('Train accuracy:', tr_acc)
print('Test accuracy:', t_acc)

Train accuracy: 0.9749393463134766
Test accuracy: 0.799249529838562


### 1.2. Submit model

In [9]:
import numpy as np
text = list(test_df['text'])
encoded_docs = [one_hot(d, vocab_size) for d in text]
padded_docs = pad_sequences(encoded_docs, maxlen = max_length, padding='post')
print(padded_docs.shape)
preds = model.predict(padded_docs)
preds = np.round(preds).ravel()

(3263, 31)


In [10]:
sub_df = pd.DataFrame({'id': test_df['id'], 'target': preds})
sub_df.to_csv('submit_DL_model_1.csv')

### Final results.
79.85 % acc

## 2. Model 2. LSTM

In [11]:
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from tqdm import tqdm
from nltk.tokenize import word_tokenize

stop = set(stopwords.words('english'))

## nltk.download('stopwords') and nltk.download('punkt') before running

def create_corpus(data):
    corpus=[]
    for tweet in tqdm(data['text']):
        words = [word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in stop))]
        corpus.append(words)
    return corpus

sentences = train_df['text']
corpus = create_corpus(train_df)

100%|██████████| 7613/7613 [00:03<00:00, 2524.00it/s]


In [12]:
MAX_LEN = 32
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)

tweet_pad = pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [13]:
word_index = tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 15013


- Here, we will use `glove.6B.100d.txt` to pre-trainned the model.

In [14]:
embedding_dict={}
with open(r"../input/glove6b100dtxt/glove.6B.100d.txt", 'r', encoding='utf8', errors='ignore') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word]=vectors
f.close()

In [15]:
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words, 100))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

100%|██████████| 15013/15013 [00:00<00:00, 241249.77it/s]


#### Fitting & Evaluation

In [16]:
X_train, X_test,y_train,y_test = train_test_split(tweet_pad, y, test_size = 0.35, random_state = 42)

model=Sequential()

embedding=Embedding(num_words, 100, embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.1))
model.add(LSTM(64, dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])
model.summary()

history = model.fit(X_train, y_train, batch_size = 64, epochs = 13, validation_data = (X_test, y_test), verbose=2)

score, tr_acc = model.evaluate(X_train, y_train)
score, t_acc = model.evaluate(X_test, y_test)

print('Train accuracy:', tr_acc)
print('Test accuracy:', t_acc)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 100)           1501400   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 32, 100)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                42240     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 1,543,705
Trainable params: 42,305
Non-trainable params: 1,501,400
_________________________________________________________________
Epoch 1/13
78/78 - 5s - loss: 0.5526 - acc: 0.7259 - val_loss: 0.4657 - val_acc: 0.7970
Epoch 2/13
78/78 - 5s - loss: 0.4713 - acc: 0.7910 - val_loss: 0.4347 - val_acc: 0.8049
Epoch 3/13
78/78 - 5s - loss: 0.4526 - acc: 0.7916 - val_loss

#### Submit your result.

In [17]:
sentences = test_df['text']
corpus = create_corpus(test_df)
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)
tweet_pad = pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

100%|██████████| 3263/3263 [00:01<00:00, 2484.09it/s]


In [18]:
preds = model.predict(tweet_pad)
preds = np.round(preds).ravel()
sub_df = pd.DataFrame({'id': test_df['id'], 'target': preds})
sub_df.to_csv('submit_DL_model_2_LSTM.csv')

#### Result
79.65 % acc

#### Comment
- We need clean text-data carefully before using Deep-Learning model. 
- The original-text contains something that make the Embeddings confusing.
- When we have pre-trained embeddings, doing standard preprocessing steps might not be a good idea because some of the valuable information can be lost. It is better to get vocabulary as close to embeddings as possible.