In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as L
from tensorflow.keras.models import Sequential
from IPython.display import clear_output
import util
import encoders
import re
from nltk import TweetTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from random import sample
import pickle
import imp
from util import Text2mat
from encoders import Encoder, CombinedEncoder

import warnings
warnings.filterwarnings('ignore')

print(tf.__version__)

2.1.0


### recurrent neural network

##### one layer LSTM
1. add dropout(worked); lstm dropout & lstm recurrent dropout & dropout layer. 
   
   validation loss stablises near 0.20(best 0.18)
2. add embedding size and hidden layer dimension;(x)
3. increase the depth of network; (x)
4. decrease learning rate; (x)
5. add a LSTM layer; (x)
6. use pretrained embeddings (worked);
    validation loss decreases under 0.18
7. use biLSTM;(x)
8. use sentences in reversed order; (x)
9. add self-attention; (x)
10. add ELMO word embeddings;

In [31]:
imp.reload(util)
imp.reload(encoders)

from util import Text2mat
from encoders import Encoder, CombinedEncoder, WordRNN, CharRNN

In [3]:
df = pd.read_csv('./data/train_v2.csv')

labels = df.target.values
text = df.apply(lambda row:row['text'],axis=1)\
          .apply(lambda line:line.replace('#','').lower())\
          .tolist()

In [51]:
# tokenizer = TweetTokenizer()
# lines = [tokenizer.tokenize(line) for line in lines]
lines = util.clean_text(text)

lines_train, lines_dev, y_train, y_dev = train_test_split(lines,labels,test_size=0.2,random_state=42)

In [9]:
# #use pretrained word embeddings
# import gensim.downloader as api
# pre_word_vectors = api.load("glove-wiki-gigaword-100")

# pre_embeddings = {word:pre_word_vectors[word] for word in pre_word_vectors.vocab}

with open('glove-wiki-gigaword-100-word-embeddings.pkl','rb') as f:
    pre_embeddings = pickle.load(f)


all_pre_embeddings = np.stack(pre_embeddings.values())
pre_embed_mean, pre_embed_var = np.mean(all_pre_embeddings,axis=0), np.var(all_pre_embeddings,axis=0)

embedding_weight_matrix = np.random.normal(pre_embed_mean,pre_embed_var,(text2mat.n_tokens,100))

num_oov = 0
for i in range(text2mat.n_tokens):
    word = text2mat.id_to_token[i]
    if word in pre_embeddings:
        embedding_weight_matrix[i] = pre_embeddings[word]
    else:
        num_oov += 1
print(f'{num_oov} out-of-vocabulary words.')

235 out-of-vocabulary words.


In [5]:
text2mat_word = Text2mat(lines,low_count_threshold=2)
text2mat_char = Text2mat(lines,mode='char',low_count_threshold=100)

In [58]:
emb_size, hid_size = 256, 128
vocab_size, char_size = text2mat_word.n_tokens, text2mat_char.n_tokens
conv_kernel_sizes = [3,5,7,9]
conv_hid_size = 64

#test word + char -level word embeddings
model = CombinedEncoder(vocab_size,char_size,emb_size,emb_size,hid_size,hid_size,
                       conv_kernel_sizes,conv_hid_size)

# #test only word rnn
# model = WRNN(vocab_size,emb_size,hid_size)

# #test attention encoder
# model = newEncoder(vocab_size,emb_size,hid_size,128)

In [42]:
model.get_logits(['i awf fgd'],text2mat_word)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.04130412]], dtype=float32)>

##### shape verification

In [None]:
dummy_input_w = text2mat_word.to_matrix(lines[:3])
dummy_input_c = text2mat_char.to_matrix(lines[:3])
print(f'input word:{dummy_input_w.shape}')
print(f'input char:{dummy_input_c.shape}')

word_encoder_output = model.w_encoder(dummy_input_w)
print(f'word encoder:{word_encoder_output.shape}')

char_encoder_output = model.c_encoder(dummy_input_c)
print(f'word encoder:{char_encoder_output.shape}')

total_output = model(dummy_input_w,dummy_input_c)
print(f'final output:{total_output.shape}')

##### training

In [59]:
def compute_loss(model,batch_lines,batch_label):
    logits = model.get_logits(batch_lines,text2mat_word,text2mat_char)
    labels = tf.cast(tf.reshape(batch_label,[-1,1]),logits.dtype)
    return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels,logits))

def validation_loss(model,dev_lines,dev_label,batch_size=64):
    total_loss = 0
    N = len(dev_lines)
    step = 0
    for i in range(0,N,batch_size):
        dev_line, dev_y = dev_lines[i:i+batch_size], dev_label[i:i+batch_size]
        total_loss += compute_loss(model,dev_line,dev_y)
        step += 1
    return total_loss / step

In [16]:
def generate_minibatch(X,y,batch_size=64,cycle=True):
    while True:
        N = len(X)
        #shuffle at the beginning of each round
        indices = np.arange(N)
        np.random.shuffle(indices)
        X = [X[i] for i in indices]
        
        for i in range(0,N,batch_size):
            batch_X = X[i:i+batch_size]
            batch_y = y[i:i+batch_size]
            yield batch_X, batch_y
        
        if not cycle:
            break

In [60]:
BATCH_SIZE = 256
EPOCHS = 500
NUM_OF_EXAMPLE = len(lines_train)
train_indices = list(np.arange(len(lines_train)))
optimizer = tf.optimizers.Adam()

train_history = []
dev_history = []

best_dev_loss = np.inf

for epoch in range(EPOCHS):
    #shuffle at the beginning of each round
    np.random.shuffle(train_indices)
    lines_train = [lines_train[i] for i in train_indices]
    y_train = [y_train[i] for i in train_indices]

    epoch_losses = []
    
    step = 1
    for i in range(0,NUM_OF_EXAMPLE,BATCH_SIZE):
        batch_line = lines_train[i:i+BATCH_SIZE]
        batch_label = y_train[i:i+BATCH_SIZE]
        
        batch_size_step = len(batch_line)
        
        #print(f'EPOCH {epoch} STEP {step} TRAIN ON {batch_size_step} EXAMPLES.')
    
        #print(batch_line)
        with tf.GradientTape() as tape:
            loss_i = compute_loss(model,batch_line,batch_label)
    
        grads = tape.gradient(loss_i,model.trainable_variables)
        optimizer.apply_gradients(zip(grads,model.trainable_variables))
        
        #print(sum(sum(model.get_weights()[0])))
        
        epoch_losses.append(loss_i.numpy())
        
        step += 1
    
    train_history.append((epoch,np.mean(epoch_losses)))
    dev_loss_i = validation_loss(model,lines_dev,y_dev,BATCH_SIZE)
    dev_history.append((epoch,dev_loss_i.numpy()))
    
    print(f'EPOCH {epoch} train loss:{train_history[-1]}')
    print(f'EPOCH {epoch} dev loss:{dev_history[-1]}')
#     clear_output(wait=True)
    
#     plt.figure(figsize=[12, 6])
#     plt.subplot(1,2,1), plt.title('train loss'), plt.grid()
#     plt.scatter(*zip(*train_history),alpha=0.5,color='blue')
    
#     plt.subplot(1,2,2), plt.title('dev loss'), plt.grid()
#     plt.scatter(*zip(*dev_history),color='orange')
        
#     plt.show()
        
    #restore from the best 
    if dev_loss_i < best_dev_loss:
        best_weights = model.get_weights()
        best_dev_loss = dev_loss_i

EPOCH 0 train loss:(0, 0.6807831)
EPOCH 0 dev loss:(0, 0.6583183)
EPOCH 1 train loss:(1, 0.529605)
EPOCH 1 dev loss:(1, 0.5164897)
EPOCH 2 train loss:(2, 0.3588581)
EPOCH 2 dev loss:(2, 0.51208913)
EPOCH 3 train loss:(3, 0.26165175)
EPOCH 3 dev loss:(3, 0.5357696)
EPOCH 4 train loss:(4, 0.20656405)
EPOCH 4 dev loss:(4, 0.65574425)


KeyboardInterrupt: 

In [95]:
a = [('ab',3),('a',10),('a',3),('b',1),('ab',0.9)]

In [101]:
b = ['',' asd','ac']
sorted(b)

['', ' asd', 'ac']

In [106]:
"acv".find('c',0)

1

In [None]:
#model reset
keras.backend.clear_session()

BATCH_SIZE = 256
EPOCHS = 500
patience = 5
optimizer = tf.optimizers.Adam()
train_size = list(np.arange(len(lines_train)))

train_history = []
dev_history = []

best_dev_loss = np.inf
data_generator = generate_minibatch(lines_train,y_train,BATCH_SIZE)

for i in range(EPOCHS):
    batch_line, batch_label = next(data_generator)
    batch_size_step = len(batch_line)
    
    with tf.GradientTape() as tape:
        loss_i = compute_loss(model,batch_line,batch_label)
    
    grads = tape.gradient(loss_i,model.trainable_variables)
    optimizer.apply_gradients(zip(grads,model.trainable_variables))
    
    train_history.append((i,loss_i.numpy()))
    if i % 10 == 0:
        clear_output(wait=True)
        
        dev_loss_i = validation_loss(model,lines_dev,y_dev,BATCH_SIZE)
        dev_history.append((i,dev_loss_i))

        print(f'train on {batch_size_step} examples.')
        
        plt.figure(figsize=[12, 6])
        plt.subplot(1,2,1), plt.title('train loss'), plt.grid()
        plt.scatter(*zip(*train_history),alpha=0.5,color='blue')
        
        plt.subplot(1,2,2), plt.title('validation loss'), plt.grid()
        plt.plot(*zip(*dev_history),color='orange')
        
        plt.show()
        
        #restore from the best
        if dev_loss_i < best_dev_loss:
            best_weights = model.get_weights()
            best_dev_loss = dev_loss_i
        
#restore from the best iteration
model.set_weights(best_weights)

In [61]:
model.set_weights(best_weights)

In [62]:
def evaluate(model,lines_to_eva,y,batch_size=256):
    pred = np.array([])
    for i in range(0,len(lines_to_eva),batch_size):
        batch_line = lines_to_eva[i:i+batch_size]
        X_w, X_c = text2mat_word.to_matrix(batch_line), text2mat_char.to_matrix(batch_line)
        pred_i = np.rint(tf.math.sigmoid(model(X_w,X_c))).astype('int').reshape((-1,))
        pred = np.concatenate([pred,pred_i])
    return classification_report(y,pred)
#     pos_truth, pos_pred = y.sum(), pred.sum()
#     true_positive = (pred == y)[y == 1].sum()
#     return true_positive / pos_pred, true_positive / pos_truth

In [63]:
print(evaluate(model,lines_dev,y_dev,64))
# p, r = evaluate(model,lines_dev,y_dev)
# print(f"precision : {p}")
# print(f"recall : {r}")

              precision    recall  f1-score   support

           0       0.79      0.84      0.81       845
           1       0.77      0.71      0.73       642

    accuracy                           0.78      1487
   macro avg       0.78      0.77      0.77      1487
weighted avg       0.78      0.78      0.78      1487



In [40]:
def predict(model,lines_to_test,batch_size=256):
    pred = []
    for i in range(0,len(lines_to_test),batch_size):
        X = text2mat.to_matrix(lines_to_test[i:i+batch_size])
        pred_i = np.rint(tf.math.sigmoid(model(X))).astype('int').reshape((-1,))
        pred.extend(list(pred_i))
    return pred

In [57]:
pred = predict(model,lines_dev)

In [64]:
rrrr = re.compile('@\S*|http:\S*|https:\S*|http\S*|http://*S*')
re.sub(rrrr,"","leicester_merc : icymi - ashes 2015: australia collapse at trent bridge - how twitter reaû_ http://t.co/hqewmreyso) http://t.co/y4y8fcljed")

'leicester_merc : icymi - ashes 2015: australia collapse at trent bridge - how twitter rea\x89û_  '

In [115]:
df_test = pd.read_csv('./data/test.csv')

text_test = df_test.apply(lambda row:row['text'],axis=1)\
          .apply(lambda line:line.replace('#','').lower())\
          .tolist()

lines_test = [re.sub(pattern,'',line).strip() for line in text_test]
tokenizer = TweetTokenizer()
lines_test = [tokenizer.tokenize(line) for line in lines_test]

In [206]:
pred = predict(model,lines_test,64)

In [207]:
df_to_submit = pd.DataFrame({'id':df_test.id.values,'target':pred})

df_to_submit.to_csv('submission_rnn_v2.csv',index=False)

### baseline model: Naive Bayes

- use english stopwords

- filter rare words with document count < 2(not important)

- tf-idf feature

In [21]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB

# df = pd.read_csv('./data/train_v1.csv')

# y = df.target.values

# lines = df.apply(lambda row:row['text'],axis=1).tolist()
# tfidf_vectorizer = TfidfVectorizer(min_df=2,stop_words='english')
# X = tfidf_vectorizer.fit_transform(lines)

# clf = MultinomialNB()
# clf.fit(X,y)

# df_test = pd.read_csv('./data/test.csv')

# lines_test = df_test.apply(lambda row:row['text'],axis=1).tolist()
# X_test = tfidf_vectorizer.transform(lines_test)

# pred = clf.predict(X_test)

# df_to_submit = pd.DataFrame({'id':df_test.id.values,'target':pred})

# df_to_submit.to_csv('submission_nb.csv',index=False)