In [54]:
#multi channel CNN for sentiment analysis
from nltk.corpus import stopwords
from string import punctuation
import pandas as pd
import numpy as np
import re
import json
import codecs
import word2vecReader as godin_embedding
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.utils import to_categorical
from keras.models import load_model
import keras.backend as K
from gensim.models import KeyedVectors

In [55]:
#loading data
def load_data_from_file(filename):
    """Read sentences and labels from a JSON file.

    Parameters
    ----------
    filename : str
        Path to a JSON document whose top-level object has the keys
        'sentence' and 'labels'.

    Returns
    -------
    tuple
        ``(foo['sentence'], foo['labels'])`` exactly as stored in the file.

    Raises
    ------
    KeyError
        If either 'sentence' or 'labels' is missing from the document.
    """
    print("loading file = ",filename)
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's default locale encoding (which is not UTF-8 everywhere).
    with open(filename,'r',encoding='utf-8') as f:
        foo = json.load(f)
    return foo['sentence'],foo['labels']

In [56]:
# Read the evaluation sentences (dataX) and their labels (dataY) from the JSON file.
dataX,dataY = load_data_from_file('dataset/final_test.json')

loading file =  dataset/final_test.json


In [57]:
# Sanity check: the number of sentences and the number of labels are printed side by side.
print('data len')
print(len(dataX),len(dataY))

data len
112 112


In [58]:
# Class balance: how many labels equal 1 and how many equal 0.
print('count 1,count 0')
print(dataY.count(1),dataY.count(0))

count 1,count 0
60 52


In [59]:
def remove_punctuation(s):
    """Return ``s`` with every ``string.punctuation`` character removed except '$'.

    The dollar sign is kept on purpose so that tokens such as ``$ZSL``
    keep their leading '$'.
    """
    unwanted = punctuation.replace('$','')
    # A translation table with only a deletion set drops those characters in one pass.
    return s.translate(str.maketrans('', '', unwanted))

In [60]:
def clean_sentence(sentence):
    """Normalise a raw sentence and return the kept tokens joined by single spaces.

    Steps, in order: regex clean-up of repeated characters, links and
    @usernames; removal of '#'; whitespace tokenisation; punctuation
    stripping per token (via ``remove_punctuation``); removal of NLTK English
    stop words; removal of tokens of length 1 or 0.
    """
    # (pattern, replacement) pairs, applied one after another in this order.
    substitutions = (
        # 3+ repeats of the same non-word character collapse to one: !!!!! -> !
        (r'(\W)\1{2,}', r'\1'),
        # 3+ repeats of the same word character collapse to two: aaaa -> aa
        (r'(\w)\1{2,}', r'\1\1'),
        # drop http/https links
        (r'(?P<url>https?://[^\s]+)', r''),
        # drop @usernames
        (r"\@(\w+)", ""),
    )
    for pattern, replacement in substitutions:
        sentence = re.sub(pattern, replacement, sentence)
    # Disabled experiment: stripping stock names as well, to see if it helps.
    #     sentence = re.sub(r"(?:\$|https?\://)\S+", "", sentence)

    # keep the hashtag text, drop only the '#' sign
    sentence = sentence.replace('#','')

    stop_words = set(stopwords.words('english'))
    kept = []
    for raw_token in sentence.split():
        word = remove_punctuation(raw_token)
        # Non-alphabetic tokens are deliberately NOT filtered out, so stock
        # names such as $ZSL survive. Disabled filter:
        #     tokens = [word for word in tokens if word.isalpha()]
        if word in stop_words:
            continue
        # tokens of length 0 or 1 are discarded
        if len(word) > 1:
            kept.append(word)
    return ' '.join(kept)

In [61]:
# NOTE(review): the message says "train set", but dataX was loaded from dataset/final_test.json.
# This rebinds dataX to the cleaned text, so re-running the cell cleans already-cleaned sentences.
print('cleaning train set')
dataX = [clean_sentence(x) for x in dataX]

cleaning train set


In [62]:
# trainY

In [63]:
# One-hot encode the labels into 2 classes.
# NOTE(review): this rebinds dataY, so running the cell a second time would encode the
# already-encoded array; reload the data before re-running.
dataY = to_categorical(dataY,2)

In [64]:
# trainY

In [65]:
# Maximum number of tokens kept per sentence (rows of each embedding matrix).
# The value is hard-coded; the disabled line below computes per-sentence token counts.
# NOTE(review): presumably 27 must match the sequence length the saved model was
# trained with -- TODO confirm.
# lengths = [len(s.split()) for s in dataX]
max_length = 27

In [66]:
# Display the configured sequence length.
max_length

27

In [67]:
def load_google_word2vec(file_name):
    """Load binary word2vec-format vectors from ``file_name`` with gensim's ``KeyedVectors``.

    Prints a notice first, since the load is announced as slow.
    """
    print("Loading word2vec model, this can take some time...")
    vectors = KeyedVectors.load_word2vec_format(file_name, binary=True)
    return vectors

In [68]:
#loading godin word embedding
def load_godin_word_embedding(path):
    """Load binary word2vec-format vectors from ``path`` using the ``word2vecReader`` module.

    ``word2vecReader`` is imported in this notebook under the alias
    ``godin_embedding``. Prints a notice first, since the load is announced as slow.
    """
    print("Loading goding model, this can take some time...")
    embedding = godin_embedding.Word2Vec.load_word2vec_format(path, binary=True)
    return embedding

In [69]:
# Load the GoogleNews vectors; get_embedding_matrix allocates 300 columns for this model.
word2vec_model= load_google_word2vec('word_embeddings/GoogleNews-vectors-negative300.bin')

Loading word2vec model, this can take some time...


In [70]:
# Load the Twitter vectors; get_embedding_matrix allocates 400 columns when godin_flag=True.
godin_model = load_godin_word_embedding("word_embeddings/word2vec_twitter_model.bin")

Loading goding model, this can take some time...


In [71]:
def get_embedding_matrix(model,sentence,godin_flag = False):
    """Build a fixed-size embedding matrix for one sentence.

    Parameters
    ----------
    model : mapping-like
        Looked up as ``model[word]``; a ``KeyError`` means the word is unknown.
    sentence : str
        Whitespace-separated tokens; only the first ``max_length`` are used.
    godin_flag : bool
        Selects the matrix width: 400 columns when true, 300 otherwise.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_length, width)``. Rows for unknown words and
        for positions past the end of the sentence stay zero.
    """
    width = 400 if godin_flag else 300
    matrix = np.zeros((max_length, width))
    for row, token in enumerate(sentence.split()[:max_length]):
        try:
            vector = model[token]
        except KeyError:
            # out-of-vocabulary word: leave this row as zeros
            continue
        if vector is not None:
            matrix[row] = vector
    return matrix

In [72]:
# Build one (max_length, dim) matrix per sentence and stack them into a single array per model.
# NOTE(review): the names and messages say "train", but dataX comes from dataset/final_test.json.
print("bulding word2vec matrix of train set")
train_word2vec = np.asarray([get_embedding_matrix(word2vec_model,x) for x in dataX])
print("bulding godin matrix of train set")
train_godin = np.asarray([get_embedding_matrix(godin_model,x,godin_flag=True) for x in dataX])

bulding word2vec matrix of train set
bulding godin matrix of train set


In [73]:
# The godin array is passed twice, with the word2vec array in between.
# NOTE(review): presumably the saved model has three input branches in this order -- TODO confirm.
input_array = [train_godin,train_word2vec,train_godin]

In [74]:
# Load a previously saved Keras model from disk.
model = load_model('top20models/215.h5')

In [75]:
# Unpacking into two names assumes evaluate() returns exactly two values.
# NOTE(review): presumably the loss plus a single accuracy metric -- TODO confirm how the model was compiled.
loss,acc = model.evaluate(input_array,dataY)



In [76]:
# Display the second value returned by model.evaluate above.
acc

0.7857142857142857

In [93]:
# Raw model outputs for every sentence; get_acc reads the first two values of each row.
pred = model.predict(input_array)

In [94]:
def get_acc(pred,dataY):
    """Return the fraction of rows whose predicted class matches the target.

    Each row of ``pred`` is reduced to a one-hot style decision: the first
    position is 1 when ``row[0] > row[1]`` and 0 otherwise (so a tie counts
    as the second class). That value is compared with the first element of
    the corresponding row of ``dataY``.

    Rows are paired with ``zip``, so extra rows in the longer input are ignored.
    Raises ``ZeroDivisionError`` when there are no pairs to compare.
    """
    hits = 0
    total = 0
    for scores, target in zip(pred, dataY):
        first_position = 1.0 if scores[0] > scores[1] else 0.0
        if first_position == target[0]:
            hits += 1
        total += 1
    return float(hits)/total

In [None]:
# Accuracy recomputed by hand from the raw predictions.
# The call needs both the predictions and the one-hot labels.
get_acc(pred,dataY)