In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import tensorflow as tf

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentiment-analysis-for-financial-news/all-data.csv
/kaggle/input/sentiment-analysis-for-financial-news/FinancialPhraseBank/Sentences_66Agree.txt
/kaggle/input/sentiment-analysis-for-financial-news/FinancialPhraseBank/Sentences_AllAgree.txt
/kaggle/input/sentiment-analysis-for-financial-news/FinancialPhraseBank/README.txt
/kaggle/input/sentiment-analysis-for-financial-news/FinancialPhraseBank/License.txt
/kaggle/input/sentiment-analysis-for-financial-news/FinancialPhraseBank/Sentences_75Agree.txt
/kaggle/input/sentiment-analysis-for-financial-news/FinancialPhraseBank/Sentences_50Agree.txt


In [2]:
# Column labels are supplied via `names`, so the first line of the file is read
# as data. The file is decoded as latin-1.
data = pd.read_csv(
    "../input/sentiment-analysis-for-financial-news/all-data.csv",
    names=['Label', 'Text'],
    encoding='latin-1',
)
data

Unnamed: 0,Label,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [3]:
# Summarise column dtypes and non-null counts to confirm there are no missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   4846 non-null   object
 1   Text    4846 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB


## Preprocessing

In [4]:
def get_seq(texts):
    """Encode ``texts`` as a zero-padded matrix of integer word indices.

    A new Keras ``Tokenizer`` (no vocabulary cap) is fit on ``texts``, each
    text is converted to a list of word indices, and every list is padded at
    the end with zeros up to the length of the longest one. This dense integer
    encoding is meant to be fed to a Keras ``Embedding`` layer.

    The vocabulary size and the longest sequence length are printed as a
    side effect.

    Args:
        texts: iterable of strings to tokenize.

    Returns:
        The array produced by ``pad_sequences``: one row per text, one column
        per position up to the longest sequence.
    """
    tokenizer = Tokenizer()  # pass num_words here to cap the vocabulary
    tokenizer.fit_on_texts(texts)

    # One list of word indices per input text.
    encoded = tokenizer.texts_to_sequences(texts)

    # +1 leaves room for index 0, which is the value used for padding below.
    print('vocab len:', len(tokenizer.word_index)+1)

    longest = np.max([len(tokens) for tokens in encoded])
    print('max len: ', longest)

    # padding='post' appends the zeros after the real tokens.
    return pad_sequences(encoded, maxlen=longest, padding='post')

In [5]:
# Sanity check on the full corpus: every row comes back padded to the same
# length, ready for the Embedding layer used later in the notebook.
get_seq(data['Text']) # all rows share one length
# Append .shape to the call above to inspect the matrix dimensions.

vocab len: 10123
max len:  71


array([[  94,    5, 3498, ...,    0,    0,    0],
       [ 840,  336,    5, ...,    0,    0,    0],
       [   1,  293,  656, ...,    0,    0,    0],
       ...,
       [  42,   31,  242, ...,    0,    0,    0],
       [  30,   27,    2, ...,    0,    0,    0],
       [  27,    3,   35, ...,    0,    0,    0]], dtype=int32)

In [6]:
def preproc_inputs(df):
    """Turn the raw frame into padded train/test sequences and integer labels.

    The ``Text`` column is encoded with ``get_seq`` and the ``Label`` column is
    mapped to integers (negative=0, neutral=1, positive=2). The result is then
    split 70/30 with shuffling and a fixed ``random_state`` of 1.

    NOTE(review): ``get_seq`` fits its tokenizer on every row before the split,
    so the vocabulary also contains words that only occur in the test rows.

    Args:
        df: DataFrame with a ``Text`` column of strings and a ``Label`` column
            containing only 'negative', 'neutral' or 'positive'. It is not
            modified.

    Returns:
        Tuple ``(train_seq, test_sequences, y_train, y_test)``.

    Raises:
        ValueError: if ``Label`` contains a value outside the known mapping.
    """
    lbl_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

    # Fail early on unexpected labels instead of letting them flow into the
    # target vector unconverted.
    unknown = set(df['Label'].unique()) - set(lbl_mapping)
    if unknown:
        raise ValueError(f"Unexpected label values: {sorted(map(str, unknown))}")

    seq = get_seq(df['Text'])

    # .map avoids the implicit object->int downcast that Series.replace relies
    # on (deprecated in recent pandas); the cast makes the dtype explicit.
    y = df['Label'].map(lbl_mapping).astype('int64')

    train_seq, test_sequences, y_train, y_test = train_test_split(
        seq, y, train_size=0.7, shuffle=True, random_state=1
    )

    return train_seq, test_sequences, y_train, y_test

In [7]:
# Build the padded train/test matrices and the matching integer label vectors.
train_seq, test_seq, y_train, y_test = preproc_inputs(data)
train_seq

vocab len: 10123
max len:  71


array([[5442,  510,   16, ...,    0,    0,    0],
       [  22, 1628,    4, ...,    0,    0,    0],
       [1141,  936,  136, ...,    0,    0,    0],
       ...,
       [   1,  419,   16, ...,    0,    0,    0],
       [2586,  123, 3247, ...,    0,    0,    0],
       [  30,  615,  555, ...,    0,    0,    0]], dtype=int32)

In [8]:
# Integer-encoded training labels; the index still refers to rows of `data`.
y_train

545     2
2374    0
4217    1
1071    1
716     2
       ..
2895    1
2763    1
905     2
3980    1
235     2
Name: Label, Length: 3392, dtype: int64

## Training

In [9]:
# Size of the embedding table. It must equal the "vocab len" value printed by
# get_seq (number of distinct words + 1); update it if the corpus changes.
vocab_size = 10123

# `shape` must be a tuple: `(n)` is just the int n, which newer Keras versions
# reject, so the trailing comma is required.
inputs = tf.keras.Input(shape=(train_seq.shape[1],))

# Dense word embedding: each word index is mapped to a learned 128-dim vector
# instead of a sparse one-hot vector of size vocab_size. A larger output_dim
# lets the model capture richer relations between words.
# The sequence length is already fixed by `inputs`, so the deprecated
# `input_length` argument is not passed.
x = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=128,
)(inputs)

# A recurrent layer carries a hidden state from one time step to the next, so
# each word is read in the context of the preceding ones (a GRU is a
# simplified LSTM). return_sequences=True emits the hidden state at every
# time step rather than only the last one.
x = tf.keras.layers.GRU(256, return_sequences=True, activation='tanh')(x)

# Collapse the (time steps, units) output into one flat vector per sample.
x = tf.keras.layers.Flatten()(x)

# Three-way softmax: probabilities for negative / neutral / positive.
outputs = tf.keras.layers.Dense(3, activation='softmax')(x)


In [10]:
# Wrap the layer graph into a trainable model.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# The targets are integer class ids (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [11]:
# Stop once validation loss has not improved for 3 consecutive epochs and roll
# back to the best weights seen, so `epochs` below is only an upper bound.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)

# 20% of the training rows are held out for validation.
history = model.fit(
    train_seq, y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[early_stop],
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


## Results

In [12]:
# Returns [loss, accuracy] on the held-out test split.
model.evaluate(test_seq, y_test)



[0.6841079592704773, 0.7324621677398682]

In [13]:
# Class balance of the test split, for context when reading the accuracy above
# (1 = neutral, 2 = positive, 0 = negative).
y_test.value_counts()

1    850
2    420
0    184
Name: Label, dtype: int64