In [1]:
import numpy as np
import keras
import tensorflow as tf
import pandas as pd
import nltk
import spacy
import re
import emot
import json
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Bidirectional, Dropout, Activation, Dense
from keras.layers.convolutional import Conv1D
from keras.layers.recurrent import LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.optimizers import RMSprop, adam
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.utils import to_categorical

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_curve, auc

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Preprocessing

In [2]:
p1 = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$#-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
p2 = re.compile('(RT @[a-zA-Z0-9_]+)')
p3 = re.compile('(RT @[a-zA-Z0-9_]+:)')
p4 = re.compile('(@[a-zA-Z0-9_]+)')
p5 = re.compile('\W+')
p6 = re.compile('\d+')
reg = [p1, p2, p3, p4, p5, p6]

nlp = spacy.load('en')
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")

with open('data/EmoticonSentimentLexicon.txt') as file:
    data = file.read()
emoticons_sent = {x.split('\t')[0]:int(x.split('\t')[1]) for x in data.split('\n')}
emoticons_data = [x.split('\t')[0] for x in data.split('\n')]

with open('data/EmojiSentimentLexicon.json') as file:
    data = json.loads(file.read())
emoji_sent = {x['emoji']:x['polarity'] for x in data}
emoji_data = [x['emoji'] for x in data]

In [3]:
def clear(text):
    # all emoji stuff
    emoji = [x['value'] for x in emot.emoji(text)]
    emoticons = [x for x in emoticons_data if x in text]
    for e in emoji:
        if emoji_sent.get(e, 0) > 0:
            text = text.replace(e, ' posemoji ')
        elif emoji_sent.get(e, 0) < 0:
            text = text.replace(e, ' negemoji ')
        else:
            text = text.replace(e, ' emoji ')
    for e in emoticons:
        if emoticons_sent.get(e, 0) > 0:
            text = text.replace(e, ' posemoji ')
        elif emoticons_sent.get(e, 0) < 0:
            text = text.replace(e, ' negemoji ')
        else:
            text = text.replace(e, ' emoji ')
    
    # additional regex
    text = re.sub(r"#(\w+)", " tag ", text)
    text = re.sub("\d+", " num ", text)
    
    for pattern in reg:
        text = re.sub(pattern, ' ', text)
    
    text = [word for word in text.split(' ') if word != '']
    
    # MOAR
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    pos_tokens = pos_tag(text)
    
    text = [word + '_' + tag for word, tag in pos_tokens]
    
    return ' '.join(text)

In [4]:
df = pd.read_csv('data/train.csv', encoding='latin-1', names=['lable', 'id', 'date', 'wtf', 'name', 'text'])

In [5]:
df = df.sample(n=100000)

In [12]:
sentence_length = 50
vocabular = 20000
embedding = 300
hidden_size = 16
batch_size = 64
epochs = 2

In [7]:
df['x'] = df.text.apply(lambda a: clear(a))

In [8]:
df['y'] = df.lable.apply(lambda b: 0 if b == 0 else 1)

In [9]:
x_tmp = df.x.values.tolist()
y_tmp = df.y.values.tolist()

tokenizer = Tokenizer(num_words=vocabular)
tokenizer.fit_on_texts(x_tmp)

x = tokenizer.texts_to_sequences(x_tmp)
x = pad_sequences(x, maxlen=sentence_length)

y = to_categorical(y_tmp)

In [10]:
def create_model():
    model = Sequential()
    model.add(Embedding(vocabular + 1, embedding, 
                       input_length=sentence_length, trainable=True))
    model.add(Bidirectional(LSTM(hidden_size, recurrent_dropout=0.5)))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    opt = adam(lr=0.001, decay=0.001)
    model.compile(loss='categorical_crossentropy', 
                 optimizer=opt,
                 metrics=['accuracy'])
    return model

In [13]:
global_precision = []
global_f1 = []
global_recall = []
global_res = []
confusion = []
out_real = []
out_pred = []

kfold = StratifiedKFold(n_splits=5)
for train, test in kfold.split(np.zeros(x.shape), df.lable.tolist()):
    checkpoint = ModelCheckpoint('model/model.hdf5', 
                             monitor='val_acc', save_best_only=True,
                             verbose=True)
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, 
                                   patience=15,  mode='auto',
                                   verbose=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                                  patience=5, min_lr=0.001,
                                  verbose=True)
    callbacks = [checkpoint]
    model = create_model()
    history = model.fit(
        x[train], y[train],
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(x[test], y[test]),
        callbacks=callbacks,
        shuffle=True
    )

    model = load_model("model/model.hdf5")
    
    results = model.predict(x[test])
    confusion.append(confusion_matrix(y[test].argmax(-1), results.argmax(-1)))
    global_precision.append(precision_score(y[test].argmax(-1), results.argmax(-1), average=None))
    global_f1.append(f1_score(y[test].argmax(-1), results.argmax(-1), average=None))
    global_recall.append(recall_score(y[test].argmax(-1), results.argmax(-1), average=None))
    global_res.append([global_precision, global_f1, global_recall])
    
    out_pred.extend(results)
    out_real.extend(y[test])

Train on 79999 samples, validate on 20001 samples
Epoch 1/2

Epoch 00001: val_acc improved from -inf to 0.75816, saving model to model/model.hdf5
Epoch 2/2

Epoch 00002: val_acc improved from 0.75816 to 0.75931, saving model to model/model.hdf5
Train on 79999 samples, validate on 20001 samples
Epoch 1/2

Epoch 00001: val_acc improved from -inf to 0.75821, saving model to model/model.hdf5
Epoch 2/2

Epoch 00002: val_acc improved from 0.75821 to 0.76051, saving model to model/model.hdf5
Train on 80000 samples, validate on 20000 samples
Epoch 1/2

Epoch 00001: val_acc improved from -inf to 0.76640, saving model to model/model.hdf5
Epoch 2/2

Epoch 00002: val_acc did not improve
Train on 80001 samples, validate on 19999 samples
Epoch 1/2

Epoch 00001: val_acc improved from -inf to 0.76069, saving model to model/model.hdf5
Epoch 2/2

Epoch 00002: val_acc improved from 0.76069 to 0.76209, saving model to model/model.hdf5
Train on 80001 samples, validate on 19999 samples
Epoch 1/2

Epoch 0000

In [16]:
np.average(global_res[0], axis=1)

array([[0.77196913, 0.75297264],
       [0.75725915, 0.76613111],
       [0.74351626, 0.78014279]])

In [19]:
global_res[0]

[[array([0.75845555, 0.76017238]),
  array([0.77618593, 0.74654944]),
  array([0.7913291 , 0.74544451]),
  array([0.77101449, 0.75374794]),
  array([0.7628606 , 0.75894893])],
 [array([0.75951644, 0.75910729]),
  array([0.75332166, 0.76729499]),
  array([0.75575073, 0.77615945]),
  array([0.75791188, 0.76612269]),
  array([0.75979506, 0.76197113])],
 [array([0.76058029, 0.75804517]),
  array([0.73176588, 0.78922646]),
  array([0.72323394, 0.80951429]),
  array([0.74524715, 0.77891054]),
  array([0.75675405, 0.76501749])]]