In [263]:
#multi channel CNN for sentiment analysis
from nltk.corpus import stopwords
from string import punctuation
from os import listdir
from pickle import dump,load
import pandas as pd
import numpy as np
import re
from random import shuffle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

In [264]:
#loading data
def load_file_to_df(filename):
    df = pd.read_csv(filename,delimiter='\t',header=0)
    df = df.drop(['Unnamed: 0', 'id'],axis=1)
    df_text = df.iloc[:,:1]
    df_score = df.iloc[:,1:]
    return df_text,df_score

In [265]:
headlines_text,headlines_score = load_file_to_df("FiQA_train_ABSA_financial_headlines.tsv")
post_text,post_score = load_file_to_df("FiQA_train_ABSA_financial_posts.tsv")
text = pd.concat([headlines_text,post_text])
score = pd.concat([headlines_score,post_score])

In [266]:
# score

In [267]:
# turn a sentence into clean tokens
def clean_sentence(sentence):
    #remove multiple repeat non num-aplha char !!!!!!!!!-->!
    sentence = re.sub(r'(\W)\1{2,}', r'\1', sentence) 
    #removes alpha char repeating more than twice aaaa->aa
    sentence = re.sub(r'(\w)\1{2,}', r'\1\1', sentence)
    #removes links
    sentence = re.sub(r'(?P<url>https?://[^\s]+)', r'', sentence)
    # remove @usernames
    sentence = re.sub(r"(?:\@|https?\://)\S+", "", sentence)
    #remove # from #tags
    sentence = sentence.replace('#','')
    # split into tokens by white space
    tokens = sentence.split()
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
#     tokens = [word for word in tokens if word.isalpha()]
#no removing non alpha words to keep stock names($ZSL)
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [268]:
# extract sentences out of df and cleaning it
sentences = [clean_sentence(x) for x in text['text']]

In [269]:
#converting real number scores to lables
#0-->-ve sentiment 1-->+ve sentiment
labels_df = (score>=0).astype(int)
labels = [int(x) for x in labels_df['sentiment score']]

In [291]:
#shuffling dataset
numbers = [i for i in range(len(sentences))]
shuffle(numbers)
numbers

[1737,
 1914,
 3753,
 6,
 1582,
 3772,
 1526,
 2202,
 1071,
 1496,
 1420,
 1906,
 2510,
 3754,
 3216,
 163,
 602,
 241,
 1726,
 1089,
 3439,
 1320,
 640,
 3615,
 1715,
 2516,
 2998,
 3402,
 2305,
 1652,
 461,
 1803,
 929,
 644,
 1221,
 3519,
 2770,
 3769,
 2145,
 1712,
 67,
 1308,
 9,
 2267,
 201,
 1372,
 3376,
 1339,
 1907,
 3283,
 1875,
 146,
 2346,
 1003,
 338,
 276,
 2259,
 1517,
 3094,
 3023,
 1120,
 686,
 209,
 2315,
 3084,
 126,
 1391,
 1054,
 451,
 3036,
 1549,
 3195,
 1457,
 907,
 420,
 751,
 3478,
 25,
 1254,
 2691,
 558,
 1148,
 3248,
 961,
 403,
 3714,
 3602,
 20,
 2920,
 27,
 2089,
 1147,
 3050,
 3297,
 3506,
 394,
 2680,
 2352,
 1625,
 3320,
 698,
 1292,
 93,
 1479,
 2805,
 3261,
 1346,
 3153,
 3617,
 3758,
 1442,
 2606,
 2207,
 2233,
 1508,
 1249,
 2037,
 2062,
 2301,
 1836,
 351,
 2698,
 1183,
 2158,
 2790,
 2566,
 3119,
 1022,
 1345,
 3458,
 3429,
 2722,
 299,
 312,
 3420,
 525,
 3649,
 1898,
 1150,
 817,
 2879,
 3661,
 955,
 1102,
 2022,
 757,
 1691,
 3069,
 1916,
 18

In [292]:
temp_text = sentences
temp_lables = labels
for i in numbers:
    sentences[i] = temp_text[i]
    labels[i]=temp_lables[i]
print(len(sentences))
print(len(labels))

3789
3789


In [293]:
#doing train and test split
test_train_split_ratio =0.9
trainX,testX = sentences[:int(test_train_split_ratio*len(sentences))],sentences[int(test_train_split_ratio*len(sentences)):]
trainY,testY = labels[:int(test_train_split_ratio*len(labels))],labels[int(test_train_split_ratio*len(labels)):]

In [294]:
print(len(trainX),len(trainY))
print(len(testX),len(testY))

3410 3410
379 379


In [295]:
# type(trainY[0])

In [296]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [297]:
# calculate the maximum document length
def max_length(lines):
    return max([len(s.split()) for s in lines])

In [298]:
# encode a list of lines
def encode_text(tokenizer, lines, length):
    # integer encode
    encoded = tokenizer.texts_to_sequences(lines)
    # pad encoded sequences
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

In [299]:
testLines = [' '.join(x) for x in testX]
trainLines = [' '.join(x) for x in trainX]

In [300]:
# trainY = np.array(trainY)
# testY = np.array(testY)
# type(trainY[0])

In [301]:
# create tokenizer
tokenizer = create_tokenizer(trainLines)
# calculate max document length
length = max_length(trainLines)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)
# encode data
trainX = encode_text(tokenizer, trainLines, length)
print(trainX.shape)

Max document length: 23
Vocabulary size: 6884
(3410, 23)


In [302]:
# define the model
def define_model(length, vocab_size):
    # channel 1
    inputs1 = Input(shape=(length,))
    embedding1 = Embedding(vocab_size, 100)(inputs1)
    conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    # channel 2
    inputs2 = Input(shape=(length,))
    embedding2 = Embedding(vocab_size, 100)(inputs2)
    conv2 = Conv1D(filters=32, kernel_size=6, activation='relu')(embedding2)
    drop2 = Dropout(0.5)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)
    # channel 3
    inputs3 = Input(shape=(length,))
    embedding3 = Embedding(vocab_size, 100)(inputs3)
    conv3 = Conv1D(filters=32, kernel_size=8, activation='relu')(embedding3)
    drop3 = Dropout(0.5)(conv3)
    pool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)
    # merge
    merged = concatenate([flat1, flat2, flat3])
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize
    print(model.summary())
#     plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model

In [303]:
# define model
model = define_model(length, vocab_size)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_46 (InputLayer)           (None, 23)           0                                            
__________________________________________________________________________________________________
input_47 (InputLayer)           (None, 23)           0                                            
__________________________________________________________________________________________________
input_48 (InputLayer)           (None, 23)           0                                            
__________________________________________________________________________________________________
embedding_46 (Embedding)        (None, 23, 100)      688400      input_46[0][0]                   
__________________________________________________________________________________________________
embedding_

In [None]:
# fit model
model.fit([trainX,trainX,trainX], trainY, epochs=10, batch_size=16)

In [284]:
#testing
# encode data
# trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)
print(trainX.shape, testX.shape)

(3410, 23) (379, 23)


In [285]:
# evaluate model on training dataset
loss, acc = model.evaluate([trainX,trainX,trainX], trainY, verbose=0)
print('Train Accuracy: %f' % (acc*100))
 
# evaluate model on test dataset dataset
loss, acc = model.evaluate([testX,testX,testX], testY, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Train Accuracy: 98.299120
Test Accuracy: 82.058048


In [286]:
predicted_prob = [float(x) for x in model.predict([testX,testX,testX])]
predicted_prob

[0.9917216897010803,
 0.5519030094146729,
 0.3393690884113312,
 0.019103292375802994,
 0.0006588365067727864,
 0.06140361353754997,
 0.9871463775634766,
 0.997307538986206,
 0.39849168062210083,
 0.9229839444160461,
 0.9906712770462036,
 0.9992703795433044,
 0.056199293583631516,
 0.9547330141067505,
 0.45457160472869873,
 0.5067245364189148,
 0.8612037897109985,
 0.9984258413314819,
 0.9950929880142212,
 0.9991890788078308,
 0.9369710087776184,
 0.7053764462471008,
 0.8962070941925049,
 0.5635888576507568,
 0.21669049561023712,
 0.5777891278266907,
 0.41223880648612976,
 0.9966940879821777,
 0.9961928129196167,
 0.45327073335647583,
 0.6472412943840027,
 0.4583706259727478,
 0.9508475065231323,
 0.9027577042579651,
 0.10071637481451035,
 0.8224546313285828,
 0.04126504436135292,
 0.0010255440138280392,
 0.9911794066429138,
 0.5619944334030151,
 0.582688570022583,
 0.7861183881759644,
 0.591829776763916,
 0.5150099396705627,
 0.9491631388664246,
 0.7904559373855591,
 0.9967382550239563

In [289]:
predicted_lables = []
for x in predicted_prob:
    if x>0.5:
        predicted_lables.append(1)
    else:
        predicted_lables.append(0)
predicted_lables

[1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [290]:
testY

[1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
