In [29]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, MaxPooling1D, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint,LearningRateScheduler
import tensorflow as tf

import pickle


In [30]:
data = pd.read_csv("all-data-processed-3classes.csv")

In [31]:
data

Unnamed: 0,Sentiment,Sentence,Clean sentences
0,0,The utility will also provide services related...,utility also provide services related electric...
1,0,Niam offer financial investors a high return v...,niam offer financial investors high return via...
2,-1,Pinduoduo's stock plunges 23% premarket on hea...,pinduoduos stock plunges 23 premarket heavy vo...
3,0,Azeri Snap Elections Condemned by Monitors for...,azeri snap elections condemned monitors vote v...
4,-1,The #market is seeing strong #tax-loss selling...,market seeing strong taxloss selling investors...
...,...,...,...
31864,0,The U.S. Is Firing Blanks Against a New Irania...,yous firing blanks new iranian threat
31865,0,Storengy is the GDF SUEZ company that is dedic...,storengy gdf suez company dedicated undergroun...
31866,1,FDA approves Aquestive's ALS treatment https:...,fda approves aquestives als treatment httpstco...
31867,0,SoftBank to Create Japan Internet Giant to Bat...,softbank create japan internet giant battle gl...


In [40]:
data["Clean sentences"] = data["Clean sentences"].astype(str)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(data["Clean sentences"], data["Sentiment"], test_size=0.1, random_state=42)
X_train.shape , X_test.shape, y_train.shape, y_test.shape

((28682,), (3187,), (28682,), (3187,))

tokenizing (str to int conversion)

In [42]:
X_train

911      mjardin group reports q3 results httpstco1owwk...
10405    wells fargo downgrades netflix underperform se...
28158    kohls shares plunge cramer calls ceo glossing ...
25140    line 4 run fully underground comprise 10 stati...
12255    uponor made operating profit eur 1510 mn eur 1...
                               ...                        
29802    recent rally could bear market trap says mille...
5390     disney downgraded analyst says parks attendanc...
860      extraordinary general meeting expected take pl...
15795          energy settlement prices httpstcoy70kosce0a
23654               resulted improved sales figures sweden
Name: Clean sentences, Length: 28682, dtype: object

In [43]:
token = Tokenizer()
token.fit_on_texts(X_train)

In [44]:
vocab = len(token.index_word) + 1
print("Vocabulary size={}".format(len(token.word_index)))
print("Number of Documents={}".format(token.document_count))

Vocabulary size=29081
Number of Documents=28682


In [45]:
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

padding

In [46]:
sequence_lengths = [len(seq) for seq in X_train]

# Find the maximum length
max_length = max(sequence_lengths)

In [47]:
max_length

47

In [48]:
MAX_SEQUENCE_LENGTH = 47
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
X_train.shape, X_test.shape

((28682, 47), (3187, 47))

In [49]:
y_train

911      0
10405   -1
28158   -1
25140    0
12255    1
        ..
29802   -1
5390    -1
860      0
15795    0
23654    1
Name: Sentiment, Length: 28682, dtype: int64

In [50]:
from tensorflow.keras.utils import to_categorical


y_train_labels = np.array(y_train)  # Convert to NumPy array if not already
y_test_labels = np.array(y_test)  # Convert to NumPy array if not already

In [51]:
X_train[0]

array([11997,    20,    55,   122,    39, 15261,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0], dtype=int32)

In [52]:
y_train = np.where(y_train_labels == -1, 0, y_train)
y_train = np.where(y_train_labels == 0, 1, y_train)
y_train = np.where(y_train_labels == 1, 2, y_train)

y_test = np.where(y_test_labels == -1, 0, y_test)
y_test = np.where(y_test_labels == 0, 1, y_test)
y_test = np.where(y_test_labels == 1, 2, y_test)

y_train = to_categorical(y_train, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)


In [53]:
def scheduler(epoch, lr):
    if epoch < 5:
        return lr
    else:
        return lr * 0.9

lr_scheduler = LearningRateScheduler(scheduler)

In [54]:



vec_size = 300
model = Sequential()
model.add(Embedding(len(token.index_word) + 1, vec_size, input_length=max_length))
model.add(Conv1D(64, 8, activation="relu"))
model.add(BatchNormalization())  # Add BatchNormalization
model.add(MaxPooling1D(2))
model.add(Dropout(0.1))

model.add(Dense(8, activation="relu"))
model.add(BatchNormalization())  # Add BatchNormalization
model.add(Dropout(0.1))
model.add(Dense(4, activation='relu'))
model.add(Dropout(0.1))
model.add(GlobalMaxPooling1D())
model.add(Dense(3, activation='softmax'))  # Output layer with softmax activation

model.compile(loss='categorical_crossentropy', optimizer=tf.optimizers.Adam(learning_rate=0.0001), metrics=['accuracy'])
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 47, 300)           8724600   
                                                                 
 conv1d_1 (Conv1D)           (None, 40, 64)            153664    
                                                                 
 batch_normalization_2 (Batc  (None, 40, 64)           256       
 hNormalization)                                                 
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 20, 64)           0         
 1D)                                                             
                                                                 
 dropout_3 (Dropout)         (None, 20, 64)            0         
                                                                 
 dense_3 (Dense)             (None, 20, 8)            

In [55]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

epochs = 100
batch_size = 4


# Define EarlyStopping and ModelCheckpoint callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss', mode='min')

# Train the model with callbacks
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), batch_size=32, callbacks=[early_stopping, model_checkpoint, lr_scheduler])


Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


In [56]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Get predicted probabilities
predictions = model.predict(X_test)

# Print some example predictions
print("Predicted probabilities:", predictions[:10])
print("Actual labels:", y_test[:10])

# Convert one-hot encoded y_test back to class labels for comparison
true_classes = np.argmax(y_test, axis=1)

# Create a DataFrame to compare predictions with actual values
results_df = pd.DataFrame({
    'Text': [" ".join([token.index_word.get(idx, "") for idx in x if idx != 0]) for x in X_test],
    'Actual Class': true_classes,
    'Predicted Class': np.argmax(predictions, axis=1),
    'Predicted Probabilities': list(predictions)
})

# Show the DataFrame
print(results_df.head(10))

Test Loss: 0.18837913870811462
Test Accuracy: 0.931597113609314
Predicted probabilities: [[1.3037772e-02 9.8172981e-01 5.2323430e-03]
 [3.6761787e-05 1.5095241e-03 9.9845368e-01]
 [9.9885881e-01 3.9961856e-04 7.4158580e-04]
 [2.0378898e-03 9.8916948e-01 8.7927384e-03]
 [9.2778450e-01 6.6224545e-02 5.9910351e-03]
 [4.8702490e-02 8.5636389e-01 9.4933599e-02]
 [3.6684325e-04 1.0322688e-02 9.8931050e-01]
 [4.4437833e-02 7.5663322e-01 1.9892897e-01]
 [8.4159747e-03 9.1958508e-02 8.9962548e-01]
 [4.5863027e-03 8.7943763e-01 1.1597610e-01]]
Actual labels: [[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]
                                                Text  Actual Class  \
0  international leads consumer gainers nio tata ...             1   
1         pleased bjorn wahlroos accepted nomination             2   
2  us energy information administration projects ...             0   
3  edited transcript earnings conference cal

In [57]:
results_df['sentiment_score'] = results_df['Predicted Probabilities'].apply(lambda x: -1 * x[0] + 1 * x[2])

In [58]:
results_df.to_excel("scores.xlsx")

In [26]:
model.save("sentimentModel.keras")

In [28]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(token, handle, protocol=pickle.HIGHEST_PROTOCOL)