In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory with the file name to get a usable path.
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/realornot-2020/submission.csv
/kaggle/input/realornot-2020/test_cleaned.csv
/kaggle/input/realornot-2020/train_cleaned.csv


https://stackoverflow.com/questions/50060241/how-to-use-glove-word-embeddings-file-on-google-colaboratory

The link above explains how to configure GloVe word embeddings.

In [2]:
# Download the BERT `tokenization.py` helper so that the later `import tokenization` can find it.
# NOTE(review): the URL points at the `master` branch, so the fetched file is not pinned to a fixed revision.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [3]:
import gc
import re
import string
import operator
from collections import defaultdict

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import matplotlib.pyplot as plt
import seaborn as sns

import tokenization
from wordcloud import STOPWORDS

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, Callback

# Single seed shared by the random number generators used in this notebook.
SEED = 1337

# Apply the seed up front; without these calls SEED is never used and every
# run starts from different random state (weight init, dropout masks, shuffling).
# NOTE(review): some GPU kernels may still be non-deterministic even when seeded.
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [4]:
# Load the competition data from the Kaggle input directory.
# df_train: training tweets (its `id`, `text_cleaned` and `target_relabeled` columns are used below).
df_train = pd.read_csv("/kaggle/input/realornot-2020/train_cleaned.csv", sep=",")
# df_test: tweets to predict on (its `text_cleaned` column is encoded below).
df_test = pd.read_csv("/kaggle/input/realornot-2020/test_cleaned.csv", sep=",")
# submission: frame whose `target` column is overwritten with the model predictions before saving.
submission = pd.read_csv("/kaggle/input/realornot-2020/submission.csv", sep=",")

In [5]:
# Load the trainable BERT-base (uncased, L-12/H-768/A-12) layer from TensorFlow Hub.
# NOTE(review): the same module is loaded into `bert_layer` again in the cell that builds the
# tokenizer, before this object is used anywhere, so this earlier load appears to be redundant.
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=True)

In [57]:
# Experiment configuration read as globals by the cells below.
Dropout_num = 0.5  # dropout rate applied before the output layer in build_model(); 0 disables dropout
learning_rate = 1e-6  # learning rate passed to the optimizer in build_model()
valid = 0.25  # fraction of the training data passed to fit() as validation_split
epochs_num = 30  # maximum number of epochs passed to fit()
batch_size_num = 16  # batch size passed to fit()
target_corrected = True  # when True, the listed training ids get target_relabeled set to 0
target_big_corrected = False  # when True, clean() is applied to both the train and test frames

In [58]:
def remove_URL(text):
    """Return ``text`` with every ``http://``/``https://`` link and ``www.`` address deleted.

    A match runs from the scheme (or ``www.``) up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


def remove_html(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed individually and the text
    between an opening and a closing tag is kept.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, r'', text)


# Code-point ranges treated as emoji / pictographic symbols by remove_emoji().
# The previous pattern contained the single range U+24C2..U+1F251, which also
# spans CJK ideographs, kana, Hangul and many other scripts, so ordinary
# non-Latin text was deleted. It is replaced by the symbol blocks it was meant
# to cover.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F000-\U0001F251"  # mahjong/domino/cards, enclosed alphanumerics & ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only the symbol ranges listed in ``_EMOJI_PATTERN`` are stripped; letters
    of any script (Latin, CJK, Hangul, ...) are left untouched.

    Args:
        text: the string to clean.

    Returns:
        A copy of ``text`` without the matched symbol characters.
    """
    return _EMOJI_PATTERN.sub(r'', text)

def clean_tweets(tweet):
    """Drop characters outside ``string.printable`` and then strip links.

    A link is any run starting with ``http`` and extending to the next
    whitespace character.
    """
    printable_only = ''.join(char for char in tweet if char in string.printable)
    return re.sub(r"http\S+", "", printable_only)

def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = {ord(mark): None for mark in string.punctuation}
    return text.translate(deletions)


def clean(df):
    """Apply the text-cleaning steps to ``df`` in place and return it.

    The ``text_cleaned`` column is lower-cased, then the ``text`` column goes
    through URL, HTML, emoji and punctuation removal plus abbreviation
    expansion, and finally ``text_cleaned`` goes through ``clean_tweets`` and
    ``lemmatization``.

    NOTE(review): ``convert_abbrev_in_text`` and ``lemmatization`` are not
    defined in the visible part of this notebook; confirm they exist before
    enabling this function.
    """
    # (column, transform) pairs, executed strictly in this order.
    pipeline = (
        ('text_cleaned', lambda value: str(value).lower()),
        ('text', lambda value: remove_URL(value)),
        ('text', lambda value: remove_html(value)),
        ('text', lambda value: remove_emoji(value)),
        ('text', lambda value: remove_punct(value)),
        ('text', lambda value: convert_abbrev_in_text(value)),
        ('text_cleaned', lambda value: clean_tweets(value)),
        ('text_cleaned', lambda value: lemmatization(value)),
    )
    for column, transform in pipeline:
        df[column] = df[column].apply(transform)
    return df


# Optional label correction: force `target_relabeled` to 0 for the listed tweet ids.
if target_corrected:
    ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
    # Compute the row mask once. The former bare `df_train[mask]` expression was
    # dropped: nested inside the `if`, its result was neither displayed nor stored.
    rows_to_relabel = df_train['id'].isin(ids_with_target_error)
    df_train.loc[rows_to_relabel, 'target_relabeled'] = 0


# Optional extra text cleaning of both frames (disabled unless target_big_corrected is True).
if target_big_corrected:
    df_train = clean(df_train)
    df_test = clean(df_test)

In [59]:
def bert_encode(texts, tokenizer, max_len=128):
    """Turn raw texts into fixed-length BERT input arrays.

    Each text is tokenized, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: length of every encoded row, special tokens included.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of NumPy arrays with one
        row per text. ``masks`` holds 1 for real tokens and 0 for padding;
        ``segment_ids`` is all zeros.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    # Two positions are reserved for the [CLS] and [SEP] markers.
    body_budget = max_len - 2

    for raw_text in texts:
        wordpieces = tokenizer.tokenize(raw_text)[:body_budget]
        sequence = ["[CLS]"] + wordpieces + ["[SEP]"]
        used = len(sequence)
        padding = max_len - used

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += [0] * padding

        token_rows.append(ids)
        mask_rows.append([1] * used + [0] * padding)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

def build_model(bert_layer, max_len=128):
    """Build and compile a binary classifier on top of a BERT layer.

    The model takes three integer inputs of length ``max_len`` (word ids,
    attention mask, segment ids), feeds them to ``bert_layer`` and classifies
    from position 0 of the sequence output, which is where ``bert_encode``
    places the ``[CLS]`` token.

    Reads the module-level ``Dropout_num`` and ``learning_rate`` settings.

    Args:
        bert_layer: callable layer returning ``(pooled_output, sequence_output)``
            for ``[input_word_ids, input_mask, segment_ids]``.
        max_len: sequence length expected by the three inputs.

    Returns:
        A compiled Keras ``Model`` with a single sigmoid output, trained with
        binary cross-entropy and reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Keep only the vector at sequence position 0 as the sentence representation.
    clf_output = sequence_output[:, 0, :]

    if Dropout_num == 0:
        # Without Dropout
        out = Dense(1, activation='sigmoid')(clf_output)
    else:
        # With Dropout(Dropout_num), Dropout_num > 0
        x = Dropout(Dropout_num)(clf_output)
        out = Dense(1, activation='sigmoid')(x)

    # `learning_rate=` is the supported keyword (`lr` is a deprecated alias).
    # The SGD optimizer that used to be created here was never used and was removed.
    optimizer = Adam(learning_rate=learning_rate)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [60]:
# Load the trainable BERT layer from TensorFlow Hub (this reassigns `bert_layer`).
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

# Build the tokenizer from the vocabulary file and casing flag shipped with the BERT layer,
# so tokenization matches the loaded module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

# Encode the `text_cleaned` column of both frames into (token ids, masks, segment ids) tuples.
# max_len here must match the max_len passed to build_model().
train_input = bert_encode(df_train.text_cleaned.values, tokenizer, max_len=128)
test_input = bert_encode(df_test.text_cleaned.values, tokenizer, max_len=128)
# Training labels come from the (optionally corrected) `target_relabeled` column.
train_labels = df_train.target_relabeled.values

In [61]:
# Build the BERT classifier and print its layer summary.
# build_model() reads the global Dropout_num, so dropout IS applied whenever that
# setting is non-zero (it is 0.5 in the configuration cell above).
model_BERT = build_model(bert_layer, max_len=128)
model_BERT.summary()

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer_9 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [62]:
%%time 
# Save the model to model.h5 every time the validation loss improves.
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)
# Stop after 3 epochs without a val_loss improvement and roll the model back to
# its best weights. Without restore_best_weights the in-memory model keeps the
# weights of the last (worse) epoch, and those are what predict() would use.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=3,
    restore_best_weights=True,
)

# Fine-tune on the encoded training data, holding out the `valid` fraction for validation.
train_history = model_BERT.fit(
    train_input, train_labels,
    validation_split = valid,
    epochs = epochs_num,
    callbacks=[checkpoint, es],
    batch_size = batch_size_num
)

Train on 5709 samples, validate on 1904 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 00009: early stopping
CPU times: user 9min 30s, sys: 3min 41s, total: 13min 12s
Wall time: 14min 48s


In [64]:
%%time
# Predict on the encoded test set with the trained model.
test_pred_BERT = model_BERT.predict(test_input)
# Round the sigmoid outputs to the nearest integer to obtain 0/1 labels.
test_pred_BERT_int = test_pred_BERT.round().astype('int')
# Overwrite the `target` column of the submission frame and write it to the working directory.
submission['target'] = test_pred_BERT_int
# NOTE(review): the file name mentions dropout 0.45 and lr 9e-7, while the configuration cell
# sets Dropout_num = 0.5 and learning_rate = 1e-6 -- the name looks stale; confirm before submitting.
submission.to_csv("submission_bert_dropout45_9e-7_30epochs.csv", index=False, header=True, sep=",")

CPU times: user 10.2 s, sys: 5.6 s, total: 15.8 s
Wall time: 15.7 s


In [None]:
"""
Dropout_num = 0.45
learning_rate = 1e-6, 2e-6, 5e-7, 9e-7
valid = 0.25
epochs_num = 30
batch_size_num = 16
target_corrected = True
target_big_corrected = False
real: 0.83435, 0.82310, 0.82413,0.81697
"""

"""
Dropout_num = 0.55
learning_rate = 7e-6, 3e-6
valid = 0.25
epochs_num = 30
batch_size_num = 16
target_corrected = True
target_big_corrected = False
real: 0.82106, 0.82208
"""

"""
Dropout_num = 0.5
learning_rate = 2e-6
valid = 0.25
epochs_num = 25
batch_size_num = 16
target_corrected = True
target_big_corrected = False
val: 0.8409
real: 0.83537
"""

"""
BEST!!!!!!!
Dropout_num = 0.5
learning_rate = 1e-6
valid = 0.25
epochs_num = 25
batch_size_num = 16
target_corrected = True
target_big_corrected = False
val: 0.8388
real: 0.83640

Train on 5709 samples, validate on 1904 samples
Epoch 1/25
5709/5709 [==============================] - 116s 20ms/sample - loss: 0.6384 - accuracy: 0.6423 - val_loss: 0.4758 - val_accuracy: 0.7894
Epoch 2/25
5709/5709 [==============================] - 97s 17ms/sample - loss: 0.4862 - accuracy: 0.7723 - val_loss: 0.4136 - val_accuracy: 0.8204
Epoch 3/25
5709/5709 [==============================] - 97s 17ms/sample - loss: 0.4183 - accuracy: 0.8199 - val_loss: 0.3934 - val_accuracy: 0.8283
Epoch 4/25
5709/5709 [==============================] - 98s 17ms/sample - loss: 0.3768 - accuracy: 0.8422 - val_loss: 0.3899 - val_accuracy: 0.8351
Epoch 5/25
5709/5709 [==============================] - 97s 17ms/sample - loss: 0.3480 - accuracy: 0.8581 - val_loss: 0.3874 - val_accuracy: 0.8388
Epoch 6/25
5709/5709 [==============================] - 95s 17ms/sample - loss: 0.3261 - accuracy: 0.8679 - val_loss: 0.3890 - val_accuracy: 0.8356
Epoch 7/25
5709/5709 [==============================] - 95s 17ms/sample - loss: 0.2979 - accuracy: 0.8819 - val_loss: 0.3945 - val_accuracy: 0.8351
Epoch 8/25
5709/5709 [==============================] - 95s 17ms/sample - loss: 0.2716 - accuracy: 0.8970 - val_loss: 0.4006 - val_accuracy: 0.8388
Epoch 00008: early stopping
"""

"""
Dropout_num = 0.5
learning_rate = 5e-8
valid = 0.25
epochs_num = 60
batch_size_num = 16
target_corrected = True
target_big_corrected = False
val: 0.8309
real: 0.83231
"""

"""
Dropout_num = 0.5
learning_rate = 5e-7
valid = 0.3
epochs_num = 25
batch_size_num = 16
target_corrected = True
target_big_corrected = False
real: 0.81595
val: 0.8363
"""
"""
Dropout_num = 0.4
learning_rate = 5e-7
valid = 0.2
epochs_num = 25
batch_size_num = 16
target_corrected = True
target_big_corrected = False
val: 0.8378
real: 0.81901
"""

"""
Dropout_num = 0.4
learning_rate = 2e-7
valid = 0.25
epochs_num = 30
batch_size_num = 16
target_corrected = True
target_big_corrected = False
max val: 0.8380
real: 0.82310
"""

"""
Dropout_num = 0.4
learning_rate = 2e-7
valid = 0.25
epochs_num = 30
batch_size_num = 16
target_corrected = True
target_big_corrected = False
max val: 0.8340
real: 0.82413
"""

"""
Dropout_num = 0
learning_rate = 2e-6
valid = 0.2
epochs_num = 10
batch_size_num = 16
target_corrected = True
target_big_corrected = False
"""

"""
Dropout_num = 0
learning_rate = 2e-6
valid = 0.25
epochs_num = 10
batch_size_num = 26 # 32 is not good at all
target_corrected = True
target_big_corrected = False
"""

"""
Dropout_num = 0
learning_rate = 2e-6
valid = 0.25
epochs_num = 10
batch_size_num = 16
target_corrected = True
target_big_corrected = False
"""

# ---

"""
Dropout_num = 0
learning_rate = 2e-5
valid = 0.25
epochs_num = 10
batch_size_num = 32
target_corrected = True
target_big_corrected = False
"""

"""
Dropout_num = 0
learning_rate = 2e-5
valid = 0.2
epochs_num = 10
batch_size_num = 32
target_corrected = True
target_big_corrected = False
"""

"""
Dropout_num = 0.2
learning_rate = 2e-6
valid = 0.25
epochs_num = 10
batch_size_num = 16
target_corrected = True
target_big_corrected = False

Train on 5709 samples, validate on 1904 samples
Epoch 1/10
5709/5709 [==============================] - 119s 21ms/sample - loss: 0.5191 - accuracy: 0.7432 - val_loss: 0.4100 - val_accuracy: 0.8220
Epoch 2/10
5709/5709 [==============================] - 98s 17ms/sample - loss: 0.3770 - accuracy: 0.8457 - val_loss: 0.4100 - val_accuracy: 0.8225
Epoch 3/10
5709/5709 [==============================] - 98s 17ms/sample - loss: 0.3152 - accuracy: 0.8721 - val_loss: 0.3960 - val_accuracy: 0.8377
Epoch 4/10
5709/5709 [==============================] - 95s 17ms/sample - loss: 0.2528 - accuracy: 0.9017 - val_loss: 0.4031 - val_accuracy: 0.8398
Epoch 5/10
5709/5709 [==============================] - 95s 17ms/sample - loss: 0.1971 - accuracy: 0.9277 - val_loss: 0.4315 - val_accuracy: 0.8340
Epoch 00005: early stopping
CPU times: user 5min 29s, sys: 2min 3s, total: 7min 33s
Wall time: 8min 24s

0.83128
"""

"""
Dropout_num = 0.25
learning_rate = 2e-6
valid = 0.25
epochs_num = 10
batch_size_num = 16
target_corrected = True
target_big_corrected = False
"""

"""
Dropout_num = 0.2
learning_rate = 5e-7
valid = 0.25
epochs_num = 10
batch_size_num = 16
target_corrected = True
target_big_corrected = False

Train on 5709 samples, validate on 1904 samples
Epoch 1/10
5709/5709 [==============================] - 118s 21ms/sample - loss: 0.6110 - accuracy: 0.6728 - val_loss: 0.5232 - val_accuracy: 0.7600
Epoch 2/10
5709/5709 [==============================] - 99s 17ms/sample - loss: 0.4963 - accuracy: 0.7747 - val_loss: 0.4495 - val_accuracy: 0.8072
Epoch 3/10
5709/5709 [==============================] - 99s 17ms/sample - loss: 0.4369 - accuracy: 0.8106 - val_loss: 0.4191 - val_accuracy: 0.8246
Epoch 4/10
5709/5709 [==============================] - 98s 17ms/sample - loss: 0.4063 - accuracy: 0.8278 - val_loss: 0.4021 - val_accuracy: 0.8304
Epoch 5/10
5709/5709 [==============================] - 98s 17ms/sample - loss: 0.3748 - accuracy: 0.8434 - val_loss: 0.3946 - val_accuracy: 0.8309
Epoch 6/10
5709/5709 [==============================] - 98s 17ms/sample - loss: 0.3541 - accuracy: 0.8546 - val_loss: 0.3912 - val_accuracy: 0.8330
Epoch 7/10
5709/5709 [==============================] - 99s 17ms/sample - loss: 0.3368 - accuracy: 0.8642 - val_loss: 0.3893 - val_accuracy: 0.8351
Epoch 8/10
5709/5709 [==============================] - 95s 17ms/sample - loss: 0.3163 - accuracy: 0.8709 - val_loss: 0.3914 - val_accuracy: 0.8346
Epoch 9/10
5709/5709 [==============================] - 95s 17ms/sample - loss: 0.2995 - accuracy: 0.8846 - val_loss: 0.3918 - val_accuracy: 0.8377
Epoch 10/10
5709/5709 [==============================] - 95s 17ms/sample - loss: 0.2816 - accuracy: 0.8926 - val_loss: 0.3947 - val_accuracy: 0.8351
Epoch 00010: early stopping
CPU times: user 10min 38s, sys: 4min 7s, total: 14min 45s
Wall time: 16min 34s

0.83231
"""

---

# Bidirectional RNN

In [None]:
import pandas as pd
import numpy as np
import os

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from nltk.corpus import stopwords
from nltk.util import ngrams

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import classification_report,confusion_matrix

from collections import defaultdict
from collections import Counter
plt.style.use('ggplot')
stop=set(stopwords.words('english'))

import re
from nltk.tokenize import word_tokenize
import gensim
import string

from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam

In [None]:
def create_corpus_new(df):
    """Tokenize every entry of ``df['text']`` into a list of lower-cased tokens.

    Returns a list with one token list per row, in row order; progress is
    reported through ``tqdm``.
    """
    return [
        [token.lower() for token in word_tokenize(tweet)]
        for tweet in tqdm(df['text'])
    ]

In [None]:
# Bidirectional LSTM
# These three layers are used below but are missing from the notebook's import
# cell, so the cell raised NameError; import them here from the same package.
from keras.layers import Bidirectional, Flatten, TimeDistributed

# NOTE(review): num_words, embedding_matrix and MAX_LEN are not defined in the
# visible cells; they must exist before this cell runs.
model=Sequential()
# Frozen embedding layer initialised from the pre-computed 100-dimensional matrix.
embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)
model.add(embedding)
# Bidirectional LSTM returning the full sequence, followed by a per-timestep dense layer.
model.add(Bidirectional(LSTM(100,return_sequences=True,dropout=0.20)))
model.add(TimeDistributed(Dense(100,activation='relu')))
# Flatten the per-timestep features before the final dense classifier.
model.add(Flatten())
model.add(Dense(100,activation='relu'))
model.add(Dense(1, activation='sigmoid'))
optimzer=Adam(learning_rate=5e-5)
model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])