In [310]:
#imports
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
from collections import Counter

In [230]:
#loading data
headlines = pd.read_csv('FiQA_train_ABSA_financial_headlines.tsv',delimiter='\t',header=0)
posts = pd.read_csv('FiQA_train_ABSA_financial_posts.tsv',delimiter='\t',header=0)
posts = posts.drop(['Unnamed: 0', 'id'],axis=1)
headlines = headlines.drop(['Unnamed: 0', 'id'],axis=1)
posts_text = posts.iloc[:,:1]
posts_score = posts.iloc[:,1:]
headlines_text = headlines.iloc[:,:1]
headlines_score = headlines.iloc[:,1:]
text_frames = [posts_text,headlines_text]
score_frames = [posts_score,headlines_score]
text = pd.concat(text_frames)
score = pd.concat(score_frames)
# text = pd.read_csv('reviews.txt', header=None)
# score = pd.read_csv('labels.txt', header=None)

In [231]:
#counting word frequency to create vocabulary
counter_ = Counter()
for txt in text.values:
    for word in txt[0].split(" "):
        counter_[word]+=1

In [232]:
print(len(counter_))

11384


In [237]:
#keeping only 1000 most commmon words, frequency of 1000th word can be used to see if we need ot increase or decrease this number
vocabulary = sorted(counter_,key=counter_.get,reverse = True)[:1000]

In [270]:
vocabulary[:50]

['to',
 'the',
 'a',
 'for',
 'in',
 'and',
 'of',
 'on',
 '',
 'is',
 '$AAPL',
 'as',
 'up',
 'this',
 '-',
 'at',
 'with',
 'be',
 '$FB',
 '$TSLA',
 'it',
 '$SPY',
 'I',
 'from',
 'will',
 'by',
 'but',
 'buy',
 '&',
 'down',
 'that',
 'are',
 'long',
 'new',
 'after',
 'more',
 'short',
 'not',
 'Tesco',
 'share',
 '$AMZN',
 'out',
 'day',
 'over',
 'all',
 'Barclays',
 'see',
 'like',
 'you',
 'FTSE']

In [239]:
#word to index, will need this mapping to create word vector 
word_to_index = dict()
for index,word in enumerate(vocabulary):
    word_to_index[word] = index

In [240]:
def get_word_vector(data):
    vector = np.zeros(len(vocabulary), dtype=np.int_)
    for word in data.split(' '):
        index = word_to_index.get(word, None)
        if index != None:
            vector[index]+=1
    return vector

In [241]:
text_vectors = np.zeros((len(text),len(vocabulary)),dtype=np.int_)
for index,(_,data) in enumerate(text.iterrows()):
    text_vectors[index] = get_word_vector(data[0])

In [271]:
text_vectors[:50]

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 2, 0, ..., 0, 0, 0]])

In [319]:
#converting scores into positive and negative classes
Y = (score>=0).astype(np.int_)
Y['sentiment score'].value_counts()

1    2407
0    1382
Name: sentiment score, dtype: int64

In [320]:
shuffle = np.arange(len(score))
np.random.shuffle(shuffle)
test_train_split = 0.9
train_split, test_split = shuffle[:int(len(score)*test_train_split)], shuffle[int(len(score)*test_train_split):]
trainX, trainY = text_vectors[train_split,:], to_categorical(Y.values[train_split].ravel(), 2)
testX, testY = text_vectors[test_split,:], to_categorical(Y.values[test_split].ravel(), 2)

In [321]:
trainX = tflearn.data_utils.pad_sequences(trainX, maxlen=100, value=0.)
testX = tflearn.data_utils.pad_sequences(testX, maxlen=100, value=0.)
trainY

array([[ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       ..., 
       [ 1.,  0.],
       [ 0.,  1.],
       [ 1.,  0.]])

In [251]:
# Network building
#RNN over-fits, everything is predicted to positive 
# tf.reset_default_graph()
# net = tflearn.input_data([None, 100])
# net = tflearn.embedding(net, input_dim=len(vocabulary), output_dim=128)
# net = tflearn.lstm(net, 128, dropout=0.8)
# net = tflearn.fully_connected(net, 2, activation='softmax')
# net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
#                          loss='categorical_crossentropy')
#USING Simple 
tf.reset_default_graph()
net = tflearn.input_data([None, 100])                          # Input
net = tflearn.fully_connected(net, 200, activation='ReLU')      # Hidden
net = tflearn.fully_connected(net, 25, activation='ReLU')
net = tflearn.fully_connected(net, 2, activation='softmax')   # Output
net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1, loss='categorical_crossentropy')


In [252]:
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY,n_epoch=100,validation_set=0.1, show_metric=True,batch_size=32)

Training Step: 9599  | total loss: [1m[32m0.23581[0m[0m | time: 0.381s
| SGD | epoch: 100 | loss: 0.23581 - acc: 0.8935 -- iter: 3040/3069
Training Step: 9600  | total loss: [1m[32m0.23032[0m[0m | time: 1.388s
| SGD | epoch: 100 | loss: 0.23032 - acc: 0.8947 | val_loss: 0.99878 - val_acc: 0.6686 -- iter: 3069/3069
--


In [307]:
# (np.array(model.predict(testX))[:,0]>=0.5).astype(np.int_)

predictions = (np.array(model.predict(testX))[:,0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:,0], axis=0)
print("Test accuracy: ", test_accuracy)
# model.predict(testX)

Test accuracy:  0.68073878628


In [328]:
#trying CNN
tf.reset_default_graph()
network = tflearn.input_data(shape=[None, 100], name='input')
network = tflearn.embedding(network, input_dim=len(vocabulary), output_dim=128)
branch1 = tflearn.layers.conv.conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
branch2 = tflearn.layers.conv.conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
branch3 = tflearn.layers.conv.conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
network = tflearn.layers.merge_ops.merge([branch1, branch2, branch3], mode='concat', axis=1)
network = tf.expand_dims(network, 2)
network = tflearn.layers.conv.global_max_pool(network)
network = tflearn.layers.core.dropout(network, 0.5)
network = tflearn.fully_connected(network, 2, activation='softmax')
network = tflearn.regression(network, optimizer='adam', learning_rate=0.001,
                     loss='categorical_crossentropy', name='target')


In [329]:
# Training
model_cnn = tflearn.DNN(network, tensorboard_verbose=0)
model_cnn.fit(trainX, trainY, n_epoch = 5, validation_set=0.1, show_metric=True, batch_size=32)

Training Step: 479  | total loss: [1m[32m0.63930[0m[0m | time: 18.807s
| Adam | epoch: 005 | loss: 0.63930 - acc: 0.6694 -- iter: 3040/3069
Training Step: 480  | total loss: [1m[32m0.63284[0m[0m | time: 20.014s
| Adam | epoch: 005 | loss: 0.63284 - acc: 0.6775 | val_loss: 0.63929 - val_acc: 0.6598 -- iter: 3069/3069
--


In [330]:
predictions_cnn = (np.array(model_cnn.predict(testX))[:,0] >= 0.5).astype(np.int_)
test_accuracy_cnn = np.mean(predictions_cnn == testY[:,0], axis=0)
print("Test accuracy: ", test_accuracy_cnn)
model_cnn.predict_label(testX)

Test accuracy:  0.633245382586


array([[1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1,

In [297]:
#trying svm
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [298]:
data = []
_class=[]
for ele in text.values.tolist():
    data.append(ele[0])
for ele in Y.values.tolist():
    _class.append(ele[0])

data_train,data_test = data[:int(len(data)*0.9)],data[int(len(data)*0.9):]
_class_train,_class_test = _class[:int(len(_class)*0.9)],_class[int(len(_class)*0.9):]

vectorizer = TfidfVectorizer(min_df=5,max_df = 0.8,sublinear_tf=True,use_idf=True)
train_vectors = vectorizer.fit_transform(data_train)
test_vectors = vectorizer.transform(data_test)

In [308]:
classifier_linear = svm.SVC(kernel='linear',probability=True)
classifier_linear.fit(train_vectors, _class_train)
prediction_linear = classifier_linear.predict(test_vectors)
prob = classifier_linear.predict_proba(test_vectors)
accuracy_score(_class_test,prediction_linear)
# prob

0.67018469656992086

In [300]:
# Perform classification with SVM, kernel=linear
classifier_liblinear = svm.LinearSVC()
classifier_liblinear.fit(train_vectors, _class_train)
prediction_liblinear = classifier_liblinear.predict(test_vectors)
accuracy_score(_class_test,prediction_liblinear)

0.65963060686015829

In [301]:
# # Perform classification with SVM, kernel=rbf
classifier_rbf = svm.SVC()
classifier_rbf.fit(train_vectors, _class_train)
prediction_rbf = classifier_rbf.predict(test_vectors)
accuracy_score(_class_test,prediction_rbf)

0.60949868073878632