In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, MaxPooling1D, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint,LearningRateScheduler
import tensorflow as tf

import pickle


In [3]:
data = pd.read_csv("all-data-processed-3classes.csv")

In [4]:
data

Unnamed: 0,Sentiment,Sentence,Clean sentences
0,-1,In Finland 's Hobby Hall 's sales decreased by...,finland hobby hall sales decreased 10 internat...
1,0,Panostaja did not disclose the purchase price .,panostaja disclose purchase price
2,-1,Scanfil will execute the temporary lay-offs by...,scanfil execute temporary layoffs midoctober 2...
3,1,Operating profit rose to EUR 13.1 mn from EUR ...,operating profit rose eur 131 mn eur 87 mn cor...
4,0,The company 's share is quoted on NASDAQ OMX H...,company share quoted nasdaq omx helsinki rauta...
...,...,...,...
8632,0,In addition to the presentations held by Presi...,addition presentations held president ceo kai ...
8633,1,"However , the broker gave an `` outperform '' ...",however broker gave outperform recommendation ...
8634,0,Tampere Science Parks is a Finnish company tha...,tampere science parks finnish company owns lea...
8635,0,Aldata noted that its Voice Supply Chain Techn...,aldata noted voice supply chain technology app...


In [5]:
X_train, X_test, y_train, y_test = train_test_split(data["Clean sentences"], data["Sentiment"], test_size=0.1, random_state=42)
X_train.shape , X_test.shape, y_train.shape, y_test.shape

((7773,), (864,), (7773,), (864,))

tokenizing (str to int conversion)

In [6]:
X_train

4655    manager critical politicians failure different...
7959    russia ready participate open tender latteleco...
37      developments partly reflect government higher ...
5761    sampo group become major shareholder nordea ow...
8605    seawind en route finnish port turku stockholm ...
                              ...                        
5734    webcast may followed online company website ww...
5191    according finnish scanfil founder chairman boa...
5390    currency conversions based exchange rates time...
860     thus method cut working costs fasten planning ...
7270    omx helsinki 25 index 092 pct 251867 helsinki ...
Name: Clean sentences, Length: 7773, dtype: object

In [7]:
X_train = X_train.astype(str)

In [8]:
token = Tokenizer()
token.fit_on_texts(X_train)

In [9]:
vocab = len(token.index_word) + 1
print("Vocabulary size={}".format(len(token.word_index)))
print("Number of Documents={}".format(token.document_count))

Vocabulary size=10468
Number of Documents=7773


In [10]:
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

padding

In [11]:
sequence_lengths = [len(seq) for seq in X_train]

# Find the maximum length
max_length = max(sequence_lengths)

In [12]:
max_length

47

In [13]:
MAX_SEQUENCE_LENGTH = 47
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
X_train.shape, X_test.shape

((7773, 47), (864, 47))

In [14]:
y_train

4655    0
7959    0
37      0
5761    0
8605   -1
       ..
5734    0
5191    0
5390    0
860     1
7270    1
Name: Sentiment, Length: 7773, dtype: int64

In [15]:
from tensorflow.keras.utils import to_categorical


y_train_labels = np.array(y_train)  # Convert to NumPy array if not already
y_test_labels = np.array(y_test)  # Convert to NumPy array if not already

In [16]:
X_train[0]

array([ 559, 4936, 6357, 6358, 6359, 6360,  560,   54, 1784, 6361,  583,
       6362, 1524,  929, 2460,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0], dtype=int32)

In [17]:
y_train = np.where(y_train_labels == -1, 0, y_train)
y_train = np.where(y_train_labels == 0, 1, y_train)
y_train = np.where(y_train_labels == 1, 2, y_train)

y_test = np.where(y_test_labels == -1, 0, y_test)
y_test = np.where(y_test_labels == 0, 1, y_test)
y_test = np.where(y_test_labels == 1, 2, y_test)

y_train = to_categorical(y_train, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)


In [18]:
def scheduler(epoch, lr):
    if epoch < 5:
        return lr
    else:
        return lr * 0.9

lr_scheduler = LearningRateScheduler(scheduler)

In [19]:



vec_size = 300
model = Sequential()
model.add(Embedding(len(token.index_word) + 1, vec_size, input_length=max_length))
model.add(Conv1D(64, 8, activation="relu"))
model.add(BatchNormalization())  # Add BatchNormalization
model.add(MaxPooling1D(2))
model.add(Dropout(0.1))

model.add(Dense(8, activation="relu"))
model.add(BatchNormalization())  # Add BatchNormalization
model.add(Dropout(0.1))
model.add(Dense(4, activation='relu'))
model.add(Dropout(0.1))
model.add(GlobalMaxPooling1D())
model.add(Dense(3, activation='softmax'))  # Output layer with softmax activation

model.compile(loss='categorical_crossentropy', optimizer=tf.optimizers.Adam(learning_rate=0.0001), metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 47, 300)           3140700   
                                                                 
 conv1d (Conv1D)             (None, 40, 64)            153664    
                                                                 
 batch_normalization (BatchN  (None, 40, 64)           256       
 ormalization)                                                   
                                                                 
 max_pooling1d (MaxPooling1D  (None, 20, 64)           0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 20, 64)            0         
                                                                 
 dense (Dense)               (None, 20, 8)             5

In [20]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

epochs = 100
batch_size = 4


# Define EarlyStopping and ModelCheckpoint callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss', mode='min')

# Train the model with callbacks
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), batch_size=32, callbacks=[early_stopping, model_checkpoint, lr_scheduler])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [22]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Get predicted probabilities
predictions = model.predict(X_test)

# Print some example predictions
print("Predicted probabilities:", predictions[:10])
print("Actual labels:", y_test[:10])

# Convert one-hot encoded y_test back to class labels for comparison
true_classes = np.argmax(y_test, axis=1)

# Create a DataFrame to compare predictions with actual values
results_df = pd.DataFrame({
    'Text': [" ".join([token.index_word.get(idx, "") for idx in x if idx != 0]) for x in X_test],
    'Actual Class': true_classes,
    'Predicted Class': np.argmax(predictions, axis=1),
    'Predicted Probabilities': list(predictions)
})

# Show the DataFrame
print(results_df.head(10))

Test Loss: 0.26447364687919617
Test Accuracy: 0.9131944179534912
Predicted probabilities: [[3.90894786e-02 9.51089025e-01 9.82153788e-03]
 [9.89223838e-01 5.77101298e-03 5.00515010e-03]
 [4.34361249e-02 3.37353005e-04 9.56226528e-01]
 [9.88966286e-01 9.76068527e-03 1.27302855e-03]
 [3.75201441e-02 9.17613983e-01 4.48658541e-02]
 [9.14939865e-02 5.68979502e-01 3.39526534e-01]
 [5.51021732e-02 8.43448102e-01 1.01449735e-01]
 [1.99963689e-01 4.37835723e-01 3.62200558e-01]
 [9.83310878e-01 1.38064167e-02 2.88272835e-03]
 [6.13157712e-02 9.14170802e-01 2.45134234e-02]]
Actual labels: [[0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]
                                                Text  Actual Class  \
0  pharmaceutical market belgium global research ...             1   
1  repeats sees 2008 operating profit yy reportin...             0   
2    department store sales improved 14 eur 10706 mn             2   
3      compan

In [23]:
results_df['sentiment_score'] = results_df['Predicted Probabilities'].apply(lambda x: -1 * x[0] + 1 * x[2])

In [25]:
results_df.to_excel("scores.xlsx")

In [26]:
model.save("sentimentModel.keras")

In [None]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(token, handle, protocol=pickle.HIGHEST_PROTOCOL)