In [79]:
#multi channel CNN for sentiment analysis
from nltk.corpus import stopwords
from string import punctuation
import pandas as pd
import numpy as np
import re
import json
# import codecs
import word2vecReader as godin_embedding
from keras.utils import to_categorical
from keras.models import load_model
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import mean_squared_error,r2_score

In [43]:
#loading data
def load_data_from_file(filename):
    """Load the labelled sentences from a JSON dataset file.

    The file must hold one top-level object that maps a sentence id to a
    record with a ``'sentence'`` string and an ``'info'`` list whose first
    element carries a ``'sentiment_score'``.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    tuple
        Four lists aligned by position: ``sentence_id`` (the top-level keys),
        ``sentence`` (raw text), ``score`` (sentiment scores as floats) and
        ``labels`` (1 when the score is >= 0, otherwise 0).
    """
    print("loading file = ",filename)
    # Read as UTF-8 explicitly so non-ASCII text decodes the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(filename,'r',encoding='utf-8') as f:
        records = json.load(f)

    sentence_id = []
    sentence = []
    score = []
    for key, record in records.items():
        sentence_id.append(key)
        sentence.append(record['sentence'])
        # Only the first entry of 'info' is consulted.
        score.append(float(record['info'][0]['sentiment_score']))
    # A score of exactly 0 is counted as the positive class.
    labels = [1 if x >= 0 else 0 for x in score]
    return sentence_id,sentence,score,labels

In [45]:
# Load the ids, raw sentences, float sentiment scores and binary labels
# (dataY: 1 when score >= 0, else 0) from the master dataset.
sentence_id,dataX,score,dataY = load_data_from_file('dataset/master.json')

loading file =  dataset/master.json


In [50]:
# Sanity check: the sentence list and the label list should be the same length.
print('data len')
print(len(dataX),len(dataY))

data len
1111 1111


In [51]:
# Class balance: label 1 (score >= 0) versus label 0.
# Uses list.count, so this cell must run while dataY is still the plain list
# returned by load_data_from_file, i.e. before it is one-hot encoded.
print('count 1,count 0')
print(dataY.count(1),dataY.count(0))

count 1,count 0
734 377


In [52]:
def remove_punctuation(s):
    """Return ``s`` with every ASCII punctuation character removed except ``$``.

    The dollar sign is kept so that stock symbols such as ``$ZSL`` survive.
    """
    # Everything in string.punctuation apart from '$' is deleted in one pass.
    to_strip = punctuation.replace('$','')
    return s.translate(str.maketrans('', '', to_strip))

In [53]:
def clean_sentence(sentence):
    """Normalise one raw sentence into a single space-joined string of tokens.

    Steps, in order: collapse runs of repeated characters, drop URLs and
    @usernames, strip ``#`` from hashtags, split on whitespace, remove
    punctuation from each token (``$`` is kept, so stock names like ``$ZSL``
    survive), drop English stop words, then drop tokens of length <= 1.
    Non-alphabetic tokens are deliberately kept.
    """
    # Ordered (pattern, replacement) rewrites applied to the whole string.
    rewrites = (
        # a non-alphanumeric char repeated 3+ times collapses to one: !!!! -> !
        (r'(\W)\1{2,}', r'\1'),
        # a word char repeated 3+ times collapses to two: aaaa -> aa
        # (\w also matches digits and '_', so a run like 1000 becomes 100)
        (r'(\w)\1{2,}', r'\1\1'),
        # links
        (r'(?P<url>https?://[^\s]+)', r''),
        # @usernames
        (r"\@(\w+)", ""),
    )
    for pattern, replacement in rewrites:
        sentence = re.sub(pattern, replacement, sentence)
    # removing stock names to see if it helps (left disabled)
#     sentence = re.sub(r"(?:\$|https?\://)\S+", "", sentence)
    # keep the hashtag word, drop only the '#'
    sentence = sentence.replace('#','')

    # whitespace tokenisation followed by per-token punctuation stripping
    stripped = [remove_punctuation(piece) for piece in sentence.split()]
    english_stop_words = set(stopwords.words('english'))

    kept = []
    for token in stripped:
        if token in english_stop_words:
            continue
        # tokens of zero or one character are discarded
        if len(token) > 1:
            kept.append(token)
    return ' '.join(kept)

In [54]:
# Clean every sentence with clean_sentence.
# NOTE: dataX is rebound in place, so the raw sentences are no longer
# available under this name once the cell has run.
print('cleaning train set')
dataX = [clean_sentence(x) for x in dataX]

cleaning train set


In [9]:
# trainY

In [55]:
# One-hot encode the binary labels into two classes.
# NOTE(review): dataY is rebound here, so re-running this cell without
# reloading the data would encode the already-encoded array again, and
# list-only calls such as dataY.count(...) stop working afterwards - confirm
# before re-executing cells out of order.
dataY = to_categorical(dataY,2)

In [11]:
# trainY

In [56]:
# Number of token positions per sentence used by get_embedding_matrix: longer
# sentences are truncated to this many tokens, shorter ones are zero-padded.
# NOTE(review): 27 is hard-coded - presumably derived from the token-length
# distribution sketched in the commented-out line below; confirm it matches
# the value the saved model was trained with.
# lengths = [len(s.split()) for s in dataX]
max_length = 27

In [57]:
# max_length

In [58]:
def load_google_word2vec(file_name):
    """Read a binary word2vec-format file into gensim ``KeyedVectors``.

    Prints a notice first because the load can be slow.

    Parameters
    ----------
    file_name : str
        Path of the binary word2vec file.
    """
    print("Loading word2vec model, this can take some time...")
    vectors = KeyedVectors.load_word2vec_format(file_name, binary=True)
    return vectors

In [59]:
#loading godin word embedding
def load_godin_word_embedding(path):
    """Read a binary word2vec-format file with the ``word2vecReader`` module.

    Prints a notice first because the load can be slow.

    Parameters
    ----------
    path : str
        Path of the binary embedding file.
    """
    print("Loading goding model, this can take some time...")
    reader = godin_embedding.Word2Vec
    return reader.load_word2vec_format(path, binary=True)

In [60]:
# GoogleNews vectors; get_embedding_matrix allocates 300 columns for this
# model (godin_flag=False).
word2vec_model= load_google_word2vec('word_embeddings/GoogleNews-vectors-negative300.bin')

Loading word2vec model, this can take some time...


In [61]:
# Twitter vectors; get_embedding_matrix allocates 400 columns for this model
# (godin_flag=True).
godin_model = load_godin_word_embedding("word_embeddings/word2vec_twitter_model.bin")

Loading goding model, this can take some time...


In [62]:
def get_embedding_matrix(model,sentence,godin_flag = False,max_len = None):
    """Build a fixed-size matrix of word vectors for one cleaned sentence.

    Parameters
    ----------
    model : mapping-like
        Object supporting ``model[word]`` that raises ``KeyError`` for
        unknown words.
    sentence : str
        Whitespace-separated tokens.
    godin_flag : bool, optional
        When true the matrix has 400 columns, otherwise 300.
    max_len : int, optional
        Number of rows (token positions). Defaults to the notebook-level
        ``max_length`` so existing calls behave exactly as before; passing it
        explicitly removes the dependency on that global.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, 400)`` or ``(max_len, 300)``. Row ``i``
        holds the vector of the ``i``-th token; rows for unknown words and
        for positions past the end of the sentence stay all-zero. Tokens
        beyond ``max_len`` are dropped.
    """
    if max_len is None:
        # Resolved at call time, so the global only has to exist when the
        # caller does not supply a value.
        max_len = max_length
    width = 400 if godin_flag else 300
    tokens = sentence.split()[:max_len]
    embedding_matrix = np.zeros((max_len,width))
    for i,word in enumerate(tokens):
        try:
            embedding_vector = model[word]
        except KeyError:
            # Out-of-vocabulary word: leave the row as zeros.
            continue
        if embedding_vector is not None:
            embedding_matrix[i]=embedding_vector
    return embedding_matrix

In [63]:
# Stack one embedding matrix per sentence into a single array per embedding:
# (n_sentences, max_length, 300) for word2vec and
# (n_sentences, max_length, 400) for the Godin vectors.
# The arrays are named train_* but are built from all of dataX.
print("bulding word2vec matrix of train set")
train_word2vec = np.asarray([get_embedding_matrix(word2vec_model,x) for x in dataX])
print("bulding godin matrix of train set")
train_godin = np.asarray([get_embedding_matrix(godin_model,x,godin_flag=True) for x in dataX])

bulding word2vec matrix of train set
bulding godin matrix of train set


In [64]:
# Three inputs passed to the model, in order: Godin, word2vec, Godin.
# NOTE(review): the order and choice of embeddings presumably has to match the
# channels the saved model was built with - not verifiable from this
# notebook; confirm against the training code.
input_array = [train_godin,train_word2vec,train_godin]

In [65]:
# Load the saved Keras model from disk.
model = load_model('models/bestSenti.h5')

In [66]:
# One prediction row per sentence; later cells read column 0 as the negative
# score and column 1 as the positive score.
pred = model.predict(input_array)

In [68]:
# get_acc otherwise exists only as commented-out code further down the
# notebook, so on a fresh kernel (Restart & Run All) this cell would raise
# NameError. It is defined here with the same semantics.
def get_acc(pred,dataY):
    """Return the fraction of rows whose predicted class matches the label.

    Parameters
    ----------
    pred : sequence of two-element rows
        Model outputs; a row is predicted as class 0 when its first value is
        strictly greater than its second, otherwise as class 1.
    dataY : sequence of two-element rows
        One-hot labels aligned with ``pred``.

    Returns
    -------
    float
        Number of matching rows divided by the number of compared rows.
    """
    hits = 0
    total = 0
    for p, y in zip(pred, dataY):
        # First component of the one-hot predicted class.
        predicted_first = 1.0 if p[0] > p[1] else 0.0
        if predicted_first == y[0]:
            hits += 1
        total += 1
    return float(hits) / total

get_acc(pred,dataY)

0.9576957695769577

In [77]:
def get_score_from_pred(pred):
    """Convert two-column predictions into one signed score per row.

    Each row is read as ``[negative, positive]``; the result for that row is
    ``positive - negative``.

    Returns
    -------
    list
        One score per row of ``pred``, in the same order.
    """
    return [row[1] - row[0] for row in pred]

In [80]:
# Continuous score per sentence: positive output minus negative output.
score_pred = get_score_from_pred(pred)

In [84]:
# Mean squared error between the dataset sentiment scores (y_true) and the
# scores derived from the classifier outputs (y_pred).
mean_squared_error(score,score_pred)

0.2939288311074967

In [85]:
# R^2 of the derived scores against the dataset scores. A negative value
# means the derived scores fit worse than always predicting the mean score.
r2_score(score,score_pred)

-0.8090525510587387

In [20]:
# with open('top20models/emb_key.json') as f:
#     top_models = json.load(f)

In [21]:
# def get_prediction_of_top_n_model(n):
#     pred_n = []
#     for x in range(1,n+1):
#         print('predicting for model rank',x)
#         model_name = top_models[str(x)]['key']
#         model_embeddings = top_models[str(x)]['embeddings']
#         input_array = [train_word2vec if x=='embedding_matrix_word2vec' else train_godin for x in model_embeddings]
#         model = load_model('top20models/'+str(model_name)+'.h5')
#         pred_n.append(model.predict(input_array))
#         del model
#     return pred_n

In [35]:
# def get_ensemble_pred(pred_n):
#     pred = []
#     n_of_pred = len(pred_n)
#     n_of_sampels = len(pred_n[0])
#     for x in range(n_of_sampels):
#         sum_0 = 0.0
#         sum_1 = 0.0
#         for v in pred_n:
#             sum_0+=v[x][0]
#             sum_1+=v[x][1]
#         pred.append(np.asarray([sum_0/n_of_pred,sum_1/n_of_pred],dtype=np.float32))
#     return np.asarray(pred,dtype=np.float32)

In [36]:
# def get_acc(pred,dataY):
#     pred_class = np.asarray([np.asarray([1,0],dtype=np.float32) if x[0]>x[1] else np.asarray([0,1],dtype=np.float32) for x in pred])
#     count = [True if x[0]==y[0] else False for x,y in zip(pred_class,dataY)]
#     return float(count.count(True))/len(count)

In [233]:
# pred_n = get_prediction_of_top_n_model(3)

In [234]:
# pred = get_ensemble_pred(pred_n)

In [235]:
# get_acc(pred,dataY)

In [37]:
# no_of_top_models = []
# acc_score = []
# n_of_top_model_to_use = 20
# pred_n = get_prediction_of_top_n_model(n_of_top_model_to_use)
# print("now doing ensemble")
# for i in range(1,n_of_top_model_to_use+1):
#     no_of_top_models.append(i)
#     pred = get_ensemble_pred(pred_n[:i])
#     acc_score.append(get_acc(pred,dataY))

In [38]:
# no_of_top_models

In [39]:
# acc_score

In [40]:
# plt.plot(no_of_top_models, acc_score)
# plt.xlabel('Number of models in ensemble')
# plt.ylabel('accuracy')

0.7946428571428571

In [133]:
# model_name = top_models[str(1)]['key']
# model_name

In [134]:
# model_embeddings = top_models[str(1)]['embeddings']
# model_embeddings

In [135]:
# input_array = [train_word2vec if x=='embedding_matrix_word2vec' else train_godin for x in model_embeddings]

In [136]:
# model = load_model('top20models/'+str(model_name)+'.h5')

In [137]:
# pred = model.predict(input_array)

In [139]:
# get_acc(pred,dataY)