In [174]:
#multi channel CNN for sentiment analysis
from nltk.corpus import stopwords
from string import punctuation,digits
import pandas as pd
import json
import numpy as np
import re
import word2vecReader as godin_embedding
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [5,5]
plt.style.use('seaborn-notebook')
from scipy.interpolate import interp1d
from sklearn.metrics import mean_squared_error,r2_score
from aspect_specific_prob import get_normalized_sentence_relation_vector
from gensim.models import KeyedVectors
from keras.models import load_model
import keras.backend as K

In [150]:
#loading data
def load_data_from_file(filename):
    """Read the annotated-sentence JSON file and flatten it into parallel lists.

    The file is expected to map each sentence id to a dict with a
    ``'sentence'`` string and an ``'info'`` list; every ``info`` entry
    carries a ``'target'`` string and a ``'snippets'`` string holding the
    Python-literal representation of a list (e.g. ``"['some text']"``).

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    tuple of four lists ``(sentence_id, sentence, sentence_snippet, sentence_target)``
        One entry per ``info`` record; the id and sentence are repeated for
        every record belonging to the same key. Only the first snippet of
        each record is kept.

    Raises
    ------
    ValueError, SyntaxError
        If a ``'snippets'`` string is not a plain Python literal.
    """
    # Local import keeps this cell self-contained.
    import ast

    print("loading file = ",filename)
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filename,'r',encoding='utf-8') as f:
        foo = json.load(f)
    sentence_id = []
    sentence = []
    sentence_snippet = []
    sentence_target = []
    for key, record in foo.items():
        for info in record['info']:
            # SECURITY: this used to be eval(), which executes arbitrary code
            # embedded in the data file. literal_eval only accepts literals.
            sentence_snippet.append(ast.literal_eval(info['snippets'])[0])
            sentence_target.append(info['target'])
            sentence_id.append(key)
            sentence.append(record['sentence'])
    return sentence_id,sentence,sentence_snippet,sentence_target

In [151]:
# Load the test split: ids, raw sentences, first snippet and target of every annotated record.
sentence_id,sentence,sentence_snippet,sentence_target = load_data_from_file('dataset/master_test.json')

loading file =  dataset/master_test.json


In [152]:
# Sanity check: the four parallel lists must all have the same length.
len(sentence_id),len(sentence),len(sentence_snippet),len(sentence_target)

(192, 192, 192, 192)

In [153]:
def rescale(series,old_range,new_range):
    """Map every value of `series` from `old_range` onto `new_range`.

    An interpolator is built once from the two ranges with scipy's
    `interp1d` and then evaluated value by value.

    Returns a plain list of Python floats, one per input value.
    """
    mapper = interp1d(old_range, new_range)
    rescaled = []
    for value in series:
        rescaled.append(float(mapper(value)))
    return rescaled

In [154]:
def remove_punctuation(s):
    """Return `s` with each character of `string.punctuation` replaced by one space."""
    # One translation table does the same job as replacing the characters one at a time.
    to_space = str.maketrans(punctuation, ' ' * len(punctuation))
    return s.translate(to_space)

In [155]:
def clean_sentence(sentence):
    """Normalise a raw sentence into a space-joined string of cleaned tokens.

    Steps, in order: lower-case; collapse runs of 3+ identical non-word
    characters to one and runs of 3+ identical word characters to two;
    drop http(s) links and @usernames; strip '#', "'s" and turn '-' into a
    space; split on whitespace; replace punctuation with spaces; drop
    English stop words; remove digits; trim and discard empty tokens.

    NOTE: the stop-word test runs after punctuation has been replaced by
    spaces, so a token such as "the," becomes "the " and is NOT filtered.
    This behavior is kept as-is on purpose.
    """
    text = sentence.lower()
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'(\W)\1{2,}', r'\1'),                  # !!!!! -> !
        (r'(\w)\1{2,}', r'\1\1'),                # aaaa  -> aa
        (r'(?P<url>https?://[^\s]+)', r''),      # links
        (r"\@(\w+)", ""),                        # @usernames
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.replace('#','').replace("'s",'').replace("-",' ')

    english_stop_words = set(stopwords.words('english'))
    digit_stripper = str.maketrans('', '', digits)

    kept = []
    for raw_token in text.split():
        token = remove_punctuation(raw_token)
        if token in english_stop_words:
            continue
        token = token.translate(digit_stripper).strip()
        if token != "":
            kept.append(token)
    return ' '.join(kept)

In [156]:
# Normalise every sentence with clean_sentence.
# NOTE: `sentence` is rebound to the cleaned text, so re-running this cell
# cleans already-cleaned strings; the raw sentences are no longer available afterwards.
print('cleaning data set')
sentence = [clean_sentence(x) for x in sentence]

cleaning data set


In [157]:
# Apply the same normalisation to the targets so they are tokenised like the sentences.
# NOTE: `sentence_target` is rebound in place (raw targets are overwritten).
print('cleaning targets')
sentence_target = [clean_sentence(x) for x in sentence_target]

cleaning targets


In [158]:
# Fixed number of token rows per embedding matrix (read by get_embedding_matrix).
max_length = 19 # must stay at 19: this is the value the model was trained with

In [159]:
def progress_print(s,target):
    """Compute the relation vector for (`s`, `target`) while reporting progress.

    Increments the notebook-global `count` on every call and prints it each
    time it reaches a multiple of 10, then returns whatever
    `get_normalized_sentence_relation_vector(s, target)` returns.
    """
    global count
    count = count + 1
    if not count % 10:
        print(count)
    return get_normalized_sentence_relation_vector(s,target)

In [160]:
# Reset the counter that progress_print increments, then compute one relation
# vector result per (sentence, target) pair. Each target is wrapped in a
# single-element list before being passed on.
count = 0
sentence_enchance_prob = [progress_print(x,[y]) for x,y in zip(sentence,sentence_target)]

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190


In [161]:
# Sanity check: one result per input sentence.
len(sentence_enchance_prob)

192

In [162]:
#loading Google Word2Vec
def load_google_word2vec(file_name):
    """Load binary word2vec-format vectors from `file_name` through gensim's KeyedVectors."""
    vectors = KeyedVectors.load_word2vec_format(file_name, binary=True)
    return vectors

In [163]:
#loading godin word embedding
def load_godin_word_embedding(path):
    """Load the binary word2vec-format embedding at `path` using the `word2vecReader` module.

    Prints a notice first because loading may be slow.
    """
    print("Loading the model, this can take some time...")
    twitter_vectors = godin_embedding.Word2Vec.load_word2vec_format(path, binary=True)
    return twitter_vectors

In [164]:
# Google News word2vec vectors; get_embedding_matrix allocates 300 columns for this model.
word2vec_model= load_google_word2vec('word_embeddings/GoogleNews-vectors-negative300.bin')

In [165]:
# Twitter word2vec vectors; get_embedding_matrix allocates 400 columns when godin_flag=True.
godin_model = load_godin_word_embedding("word_embeddings/word2vec_twitter_model.bin")

Loading the model, this can take some time...


In [166]:
def get_embedding_matrix(model,sentence,prob_vector,godin_flag = False,max_len = None):
    """Build a fixed-size, weight-scaled embedding matrix for one sentence.

    Parameters
    ----------
    model : mapping-like
        Indexable by word (``model[word]``); must raise ``KeyError`` for
        words it does not know.
    sentence : str
        Whitespace-separated tokens. Only the first ``max_len`` are used.
    prob_vector : sequence of float
        Per-position weights; row ``i`` is multiplied by ``prob_vector[i]``.
        It needs an entry for every position whose word is found in `model`.
    godin_flag : bool, optional
        When true the matrix has 400 columns, otherwise 300.
    max_len : int or None, optional
        Number of rows of the result. ``None`` (default) falls back to the
        notebook-global ``max_length``, which preserves the old behavior.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, 400)`` or ``(max_len, 300)``. Rows for
        unknown words and for positions past the end of the sentence are zero.
    """
    if max_len is None:
        # Backward-compatible default: use the notebook-level constant.
        max_len = max_length
    width = 400 if godin_flag else 300
    embedding_matrix = np.zeros((max_len, width))
    for i, word in enumerate(sentence.split()[:max_len]):
        try:
            embedding_vector = model[word]
        except KeyError:
            # Out-of-vocabulary word: leave the row as zeros.
            continue
        if embedding_vector is None:
            continue
        # Assign first, then scale in place, so the multiplication happens
        # in the matrix's own dtype exactly as before.
        embedding_matrix[i] = embedding_vector
        embedding_matrix[i] *= prob_vector[i]
    return embedding_matrix

In [169]:
# Build one embedding matrix per sentence for each of the two embedding models
# and stack them into arrays. y[0] (the first element of each relation-vector
# result) is what gets passed as the per-token weight vector.
print("bulding word2vec matrix of data set")
sentence_word2vec = np.asarray([get_embedding_matrix(word2vec_model,x,y[0]) for x,y in zip(sentence,sentence_enchance_prob)])
print("bulding godin matrix of data set")
sentence_godin = np.asarray([get_embedding_matrix(godin_model,x,y[0],godin_flag=True) for x,y in zip(sentence,sentence_enchance_prob)])

bulding word2vec matrix of data set
bulding godin matrix of data set


In [170]:
def sk_mse(y_true,y_pred):
    """Mean of the squared difference between `y_pred` and `y_true` over the last axis (Keras backend ops)."""
    squared_error = K.square(y_pred - y_true)
    return K.mean(squared_error, axis=-1)

In [178]:
# Restore the saved Keras model. `sk_mse` is supplied through custom_objects so
# that name can be resolved while loading (presumably the model was compiled
# with it as loss/metric — confirm).
model = load_model("models/sentiment_model.h5", custom_objects={'sk_mse': sk_mse})

In [179]:
# Inputs in the order they are handed to model.predict.
# NOTE(review): sentence_godin is passed twice (positions 1 and 3) — confirm
# this matches the input order the saved model expects.
input_array_test = [sentence_godin,sentence_word2vec,sentence_godin]

In [180]:
# Run the loaded model on the three input arrays.
pred = model.predict(input_array_test)

In [181]:
# Keep the first element of every prediction row
# (assumes one output value per sample — TODO confirm against the model).
pred_val = [x[0] for x in pred]

In [182]:
# Map scores from [0, 1] onto [-1, 1].
# NOTE: this rebinds pred_val, so the cell is not idempotent — re-running it
# would rescale values that are already in [-1, 1] (and negative ones fall
# outside the old range). Re-run the previous cell first.
pred_val = rescale(pred_val,[0,1],[-1,1])

In [183]:
# Sanity check: one score per input record.
len(pred_val)

192

In [185]:
# plt.hist(pred_val)

In [186]:
# Sanity check before zipping: snippets, ids and scores must line up one-to-one.
len(sentence_snippet),len(sentence_id),len(pred_val)

(192, 192, 192)

In [195]:
# Output payload: a single 'results' key holding one entry per scored record.
result = {'results':[]}

In [196]:
# Build the result entries from the parallel id / snippet / score lists.
# The list is assigned rather than appended to, so re-running this cell
# replaces the entries instead of adding duplicates.
# Scores are stored as strings, as before.
result['results'] = [
    {'id':s_id,'snippet':s_sn,'sentiment_scores':str(s_score)}
    for s_id,s_sn,s_score in zip(sentence_id,sentence_snippet,pred_val)
]

In [197]:
# Display the assembled payload (this renders the entire dict in the output).
result

{'results': [{'id': '0_Cuadrilla',
   'sentiment_scores': '-0.13248014450073242',
   'snippet': 'files to delay application'},
  {'id': '1001_Sainsbury',
   'sentiment_scores': '-0.20463716983795166',
   'snippet': 'warns of squeeze on high street retailers'},
  {'id': '1006_Barclays',
   'sentiment_scores': '-0.279263436794281',
   'snippet': 'fined for anti-money-laundering failings'},
  {'id': '1007_Barclays',
   'sentiment_scores': '-0.07986938953399658',
   'snippet': 'fined for lax crime checks in'},
  {'id': '1014_GSK',
   'sentiment_scores': '0.18167603015899658',
   'snippet': 'file up to 20 new drugs for approval by 2020'},
  {'id': '1031_National Grid',
   'sentiment_scores': '0.2502250671386719',
   'snippet': 'Grid lines up sale of'},
  {'id': '1034_British American Tobacco',
   'sentiment_scores': '-0.13126909732818604',
   'snippet': 'accused of bribing senior politicians to sabotage'},
  {'id': '1035_MillerCoors',
   'sentiment_scores': '0.16458725929260254',
   'snippe

In [198]:
# Persist the payload as pretty-printed JSON (4-space indent).
with open('results/sentiment_result.json','w') as fout:
    fout.write(json.dumps(result,indent=4))