In [50]:
import sys
sys.path.append('../')
import string
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from pprint import pprint
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.utils import to_categorical
import keras.backend as K
from gensim.models import KeyedVectors
import word2vecReader as godin_embedding
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [12,10]
from mlens.visualization import corrmat

In [51]:
def load_data_from_file(filename,test_flag = False):
    """Read a headerless, tab-separated tweet file into a DataFrame.

    Parameters
    ----------
    filename : path or file-like object, passed straight to ``pd.read_csv``.
    test_flag : bool, default False
        False -> the file has five columns, named
        tweet_id, username, database_id, class, tweet.
        True  -> the file has six columns, named
        a, b, med, med, tweet, class.

    Returns
    -------
    pandas.DataFrame
        The file contents with the column names assigned as described above.
    """
    # NOTE(review): the test layout repeats the name "med", so frame["med"]
    # selects both columns. Kept as-is because renaming changes the result.
    train_layout = ["tweet_id", "username", "database_id", "class", "tweet"]
    test_layout = ["a", "b", "med", "med", "tweet", "class"]

    frame = pd.read_csv(filename, sep="\t", header=None)
    frame.columns = test_layout if test_flag else train_layout
    return frame

In [52]:
# Load the two labelled files in the default (non-test) layout, whose columns are
# tweet_id, username, database_id, class and tweet. Paths are relative to the
# notebook's working directory.
train_data = load_data_from_file('dataset/personal_intake_tweets.txt')
dev_data = load_data_from_file('dataset/personal_intake_tweets_dev.txt')

In [53]:
# Merge the train and dev files into one training set. Tweets and labels are kept
# as two parallel Python lists (train rows first, then dev rows).
train_sentences = train_data['tweet'].tolist()+dev_data['tweet'].tolist()
train_labels = train_data['class'].tolist()+dev_data['class'].tolist()

In [54]:
# The test file is read with the six-column layout selected by test_flag=True.
test_data = load_data_from_file('dataset/task_2_test_full_form.txt',test_flag=True)

In [55]:
# Parallel lists of test labels and raw test tweets.
test_labels = test_data['class'].tolist()
test_sentences = test_data['tweet'].tolist()

In [56]:
# Sanity check: labels and sentences should have matching lengths within each split.
len(train_labels),len(train_sentences),len(test_labels),len(test_sentences)

(9107, 9107, 7419, 7419)

In [57]:
# Shift every label down by one so class ids start at 0 (the raw files
# presumably use 1..3 -- confirm against the dataset description).
# The lists are rebuilt from the loaded DataFrames rather than from their own
# previous values, so re-running this cell cannot shift the labels twice.
test_labels = [label - 1 for label in test_data['class'].tolist()]
train_labels = [
    label - 1
    for label in train_data['class'].tolist() + dev_data['class'].tolist()
]

In [58]:
# Count of distinct label values in the training set; passed as num_classes
# when trainY is one-hot encoded further down.
number_of_classes = len(set(train_labels))
number_of_classes

3

In [59]:
def remove_punctuation(s):
    """Return ``s`` with every character found in ``string.punctuation`` dropped."""
    return ''.join(ch for ch in s if ch not in string.punctuation)

In [60]:
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_sentence(sentence, stop_words=None):
    """Normalise one raw tweet into a space-joined string of word tokens.

    Steps, in order:
      1. strip http/https links;
      2. strip @username mentions;
      3. drop the '#' character (the tag text itself is kept);
      4. split on whitespace and remove punctuation from each token;
      5. keep only tokens that are purely alphabetic, are not stop words
         (case-sensitive comparison) and are longer than one character.

    Parameters
    ----------
    sentence : str
        The raw tweet text.
    stop_words : collection of str, optional
        Words to drop. Defaults to NLTK's English stop-word list, which is
        loaded once and cached instead of being re-read on every call.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty).
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    #removes links
    sentence = re.sub(r'(?P<url>https?://[^\s]+)', r'', sentence)
    # remove @usernames
    sentence = re.sub(r"\@(\w+)", "", sentence)
    #remove # from #tags
    sentence = sentence.replace('#','')

    tokens = []
    for raw_token in sentence.split():
        # remove_punctuation is used rather than str.translate, which the
        # original author reported as failing on their server.
        word = remove_punctuation(raw_token)
        # Same filters, same order as before: alphabetic, not a stop word,
        # and more than one character long.
        if word.isalpha() and word not in stop_words and len(word) > 1:
            tokens.append(word)
    return ' '.join(tokens)

In [61]:
# Clean every raw tweet into a space-joined token string. At this point trainX and
# testX hold text; the tokenization cell further down replaces them with padded
# integer sequences.
print("cleaning data")
trainX = [clean_sentence(s) for s in train_sentences]
testX = [clean_sentence(s) for s in test_sentences]
# Integer class ids as an array; one-hot encoded later with to_categorical.
trainY = np.array(train_labels)

cleaning data


In [62]:
# Fixed sequence length handed to encode_text, which passes it to pad_sequences
# as maxlen (shorter tweets are padded at the end, padding='post').
max_len = 20

In [63]:
def create_tokenizer(lines):
    """Build a Keras ``Tokenizer`` with default settings and fit it on ``lines``.

    ``lines`` is the collection of texts whose vocabulary should be learned.
    The fitted tokenizer is returned.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [64]:
def encode_text(tokenizer, lines, length):
    """Turn texts into integer sequences of exactly ``length`` ids each.

    ``tokenizer`` maps every text in ``lines`` to a sequence of word ids;
    the sequences are then passed through ``pad_sequences`` with
    ``maxlen=length`` and padding added at the end (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [65]:
def load_godin_word_embedding(path):
    """Load the binary word2vec file at ``path`` via the ``word2vecReader`` module.

    Prints a progress message first, then returns whatever
    ``Word2Vec.load_word2vec_format(path, binary=True)`` produces.
    """
    # Message previously read "Goding"; the module alias and function name say Godin.
    print("Loading Godin model.")
    return godin_embedding.Word2Vec.load_word2vec_format(path, binary=True)

In [66]:
def get_word_embedding_matrix(model,dim,word_index=None,num_rows=None):
    """Build a (num_rows, dim) matrix whose row ``i`` is the vector of word id ``i``.

    Parameters
    ----------
    model : mapping-like
        Looked up as ``model[word]``; must raise ``KeyError`` for unknown words.
    dim : int
        Vector length. The original notes list 300 for Google word2vec,
        400 for Godin and 100 for fastText.
    word_index : dict of str -> int, optional
        Word-to-row mapping. Defaults to the notebook-level
        ``tokenizer.word_index`` so existing two-argument calls keep working.
    num_rows : int, optional
        Number of rows. Defaults to the notebook-level ``vocab_size`` when
        ``word_index`` is also defaulted, otherwise to ``len(word_index) + 1``.

    Returns
    -------
    numpy.ndarray
        Rows for words missing from ``model`` (and any unused row) stay all-zero.
    """
    use_notebook_state = word_index is None
    if use_notebook_state:
        word_index = tokenizer.word_index
    if num_rows is None:
        num_rows = vocab_size if use_notebook_state else len(word_index) + 1

    embedding_matrix = np.zeros((num_rows,dim))
    for word, i in word_index.items():
        try:
            embedding_vector = model[word]
        except KeyError:
            # Out-of-vocabulary word: leave its row as zeros.
            continue
        if embedding_vector is not None:
            embedding_matrix[i]=embedding_vector
    return embedding_matrix

In [67]:
def get_results(test_labels,pred_class):
    """Score predictions against the true labels.

    F1, precision and recall are micro-averaged over classes 0 and 1 only
    (``labels=[0,1]``); accuracy is computed over all samples.

    Returns a list ``[f1, precision, recall, accuracy]``.
    """
    micro_options = dict(labels=[0,1], average='micro')
    scores = [
        scorer(test_labels, pred_class, **micro_options)
        for scorer in (f1_score, precision_score, recall_score)
    ]
    scores.append(accuracy_score(test_labels, pred_class))
    return scores

In [73]:
def get_pred_class(model, data=None):
    """Return the predicted class id for every sample as a list of Python ints.

    Parameters
    ----------
    model : object with a ``predict`` method
        ``model.predict(data)`` must yield one score vector per sample.
    data : optional
        Inputs to predict on. Defaults to the notebook-level ``testX`` so
        existing one-argument calls behave exactly as before.

    Returns
    -------
    list of int
        Index of the highest score in each sample's prediction.
    """
    if data is None:
        data = testX
    pred = model.predict(data)
    return [int(np.argmax(scores)) for scores in pred]

In [69]:
# Fit the vocabulary on the cleaned training tweets only, then map both splits to
# fixed-length integer sequences and one-hot encode the training labels.
# NOTE(review): trainX, testX and trainY are overwritten in place here, so this
# cell cannot be re-run on its own; re-run the cleaning cell first.
tokenizer = create_tokenizer(trainX)
# One more than the number of known words -- presumably because Keras word ids
# start at 1 and id 0 is left for padding; confirm against the Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1
# This prints the configured max_len, not a length measured from the data.
print('Max document length: %d' % max_len)
print('Vocabulary size: %d' % vocab_size)
trainX = encode_text(tokenizer, trainX, max_len)
testX = encode_text(tokenizer, testX, max_len)
trainY = to_categorical(trainY,num_classes=number_of_classes)

Max document length: 20
Vocabulary size: 10940


In [70]:
# godin_model = load_godin_word_embedding("../word_embeddings/word2vec_twitter_model.bin")

In [71]:
# embedding_matrix_godin = get_word_embedding_matrix(godin_model,400)

## Base Learners

In [72]:
# Load the four saved Keras models used as base learners. Paths are relative to
# the notebook's working directory.
model_cnn = load_model('models/cnn.h5')
model_bi_lstm = load_model('models/bi_lstm.h5')
model_cnn_bi_lstm = load_model('models/cnn_bi_lstm.h5')
model_cnn_lstm = load_model('models/cnn_lstm.h5')

In [74]:
# (name, model) pairs. Their order here determines the column order of the
# prediction matrix built below.
base_models = [('cnn',model_cnn),('bi_lstm',model_bi_lstm),('cnn_bi_lstm',model_cnn_bi_lstm),('cnn_lstm',model_cnn_lstm)]

In [75]:
# Predicted class ids on the test set for each base model, kept as
# (name, predictions) pairs in the same order as base_models.
pred_class_base = [(name,get_pred_class(m)) for name,m in base_models]

In [79]:
# Prediction table: one row per test sample, one column per base model.
# Column names are taken from pred_class_base itself, so they cannot drift out
# of sync with the list of base models (they were hard-coded before).
model_names = [model_name for model_name, model_preds in pred_class_base]
pred_mat = np.column_stack(
    [model_preds for model_name, model_preds in pred_class_base]
).astype(np.int64)

pred_df = pd.DataFrame(pred_mat, columns=model_names)

In [81]:
# pred_df

In [86]:
# Score every base model against the test labels; each entry is
# (name, [f1, precision, recall, accuracy]).
result_base = [
    (model_name, get_results(test_labels, model_preds))
    for model_name, model_preds in pred_class_base
]
# Flattened form: [name, f1, precision, recall, accuracy] per model.
result_base_f = [[model_name] + list(scores) for model_name, scores in result_base]

In [88]:
# Each row is [model name, f1, precision, recall, accuracy].
result_base_f

[['cnn',
  0.6231868486085743,
  0.5879967558799676,
  0.6628571428571428,
  0.6379565979242485],
 ['bi_lstm',
  0.6176506096824822,
  0.6545547594677584,
  0.5846857142857143,
  0.6627577840679337],
 ['cnn_bi_lstm',
  0.6321674994051867,
  0.659141652195485,
  0.6073142857142857,
  0.6720582288718157],
 ['cnn_lstm',
  0.6375070821529746,
  0.6321348314606742,
  0.6429714285714285,
  0.6699016039897561]]

In [None]:
# Empty results table: the model name followed by the four scores, in the order
# get_results returns them.
model_performance = pd.DataFrame(columns=['model','f1', 'precision', 'recall','accuracy'])

In [None]:
# Each result_base entry is the pair (name, [f1, precision, recall, accuracy]).
# Assigning that 2-item pair to a 5-column row raises a ValueError, so the rows
# are flattened first. The table is built in one go rather than grown row by
# row, which also makes this cell safe to re-run.
model_performance = pd.DataFrame(
    [[model_name] + list(scores) for model_name, scores in result_base],
    columns=model_performance.columns,
)