CLASSIFICATION WITH NAIVE BAYES

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical


In [None]:
# TensorFlow Keras API
from tensorflow.keras.preprocessing.text import Tokenizer # converts raw text into sequences of integer tokens

from tensorflow.keras.preprocessing.sequence import pad_sequences # brings all input sequences to the same length
from tensorflow.keras.utils import to_categorical # turns integer labels into one-hot encoded vectors
from tensorflow.keras.models import Sequential # linear stack of layers used to build the neural network
# Embedding: maps each word index to a dense vector (100 dimensions in this
# script), giving the model a learned representation of words such as "good".
from tensorflow.keras.layers import Embedding, LSTM, Dense # embedding, long short-term memory and fully connected layers
# LSTM (long short-term memory): keeps relevant context across a sequence
# while using gates to forget what is no longer needed. That matters here
# because the sentiment of a financial headline depends on word order/context.
# Dense: the final fully connected layer, which yields a probability
# distribution over the classes when combined with a softmax activation.

###################
# Reading the dataset
###################
# The training split comes from sent_train.csv; the file named sent_valid.csv
# is used as the held-out test split in this script.
train = pd.read_csv('sent_train.csv')
test = pd.read_csv('sent_valid.csv')

# Human-readable names for the label codes (keys are strings).
labels = {"0": "Bearish", "1": "Bullish", "2": "Neutral"}

# Pull the text and label columns out of each frame.
X_train, y_train = train['text'].values, train['label'].values
X_test, y_test = test['text'].values, test['label'].values




###############
# Preprocessing
###############

# The network needs integer class indices, so the raw labels are mapped to
# 0..n_classes-1 with a LabelEncoder fitted on the training labels only.
# NOTE(review): the `labels` dict suggests the CSV labels may already be the
# numeric codes 0/1/2 -- confirm against the data.
le = LabelEncoder()
le.fit(y_train)
y_train_enc, y_test_enc = (le.transform(y) for y in (y_train, y_test))

# One-hot encode the class indices so the model can be trained to output one
# probability per class, e.g. class index 1 of 3 becomes [0, 1, 0].
num_classes = len(le.classes_)
y_train_onehot, y_test_onehot = (
    to_categorical(encoded, num_classes=num_classes)
    for encoded in (y_train_enc, y_test_enc)
)

# Vocabulary handling: the Tokenizer assigns an integer index to every word it
# sees in the training text; num_words caps how many of them are used when
# texts are converted to sequences.
max_words = 10000  # vocabulary size
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Each headline becomes a list of integer word indices.
# Illustration (indices are made up):
#   "Stock market is rising" -> [23, 564, 3, 78]
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Headlines have varying lengths, so shorter sequences are padded with zeros
# and longer ones are truncated, giving every input exactly max_len tokens.
# max_len must be defined here: it is also used by the Embedding layer and
# when padding the example texts at prediction time.
max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)








# First layer of the model: an Embedding that turns each token index into a
# dense, trainable vector of embedding_dim values, so that related words can
# end up with similar representations.
embedding_dim = 100
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
])



###################
# Neural network
###################

# Architecture: an LSTM layer follows the embedding and a fully connected
# (Dense) layer closes the network.

# Recurrent layer with 128 LSTM units that reads the word sequence in order.
#   dropout=0.2           -> drops 20% of the units on the layer inputs
#   recurrent_dropout=0.2 -> drops 20% of the units on the recurrent state
# Both act as regularisation against overfitting.
lstm_layer = LSTM(128, dropout=0.2, recurrent_dropout=0.2)
model.add(lstm_layer)

# Output layer: one unit per sentiment class; softmax makes the outputs sum
# to 1 so they can be read as class probabilities.
output_layer = Dense(num_classes, activation='softmax')
model.add(output_layer)

# categorical_crossentropy is the loss matching one-hot multi-class targets;
# adam is an optimizer with adaptive learning rates.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# Training configuration:
#   batch_size=32        -> 32 examples per gradient step
#   epochs=10            -> 10 full passes over the training data
#   validation_split=0.1 -> 10% of the training data is held out for validation
fit_options = dict(batch_size=32, epochs=10, validation_split=0.1)
history = model.fit(X_train_pad, y_train_onehot, **fit_options)

# Measure performance on the held-out test split; accuracy is the fraction of
# test examples classified correctly.
test_metrics = model.evaluate(X_test_pad, y_test_onehot)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Sanity check on a few hand-written sentences.
example_texts = [
    "The production rises",
    "President Trump will be reelected",
    "I will make this country come back to the middle ages",
    "economists are afraid of recesion"
]

# Apply the same tokenization and padding used for the training data.
example_seq = tokenizer.texts_to_sequences(example_texts)
example_pad = pad_sequences(example_seq, maxlen=max_len)

# One row of class probabilities per sentence; take the most likely class of
# each row and map it back to the original label value.
predictions = model.predict(example_pad)
predicted_classes = np.argmax(predictions, axis=1)
pred_labels = list(le.inverse_transform(predicted_classes))

print("\nPredicciones de ejemplo:")
for text, pred in zip(example_texts, pred_labels):
    print(f"Input: {text} -> Predicted Label: {pred}")




Epoch 1/10
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 52ms/step - accuracy: 0.6836 - loss: 0.8249 - val_accuracy: 0.5518 - val_loss: 0.9513
Epoch 2/10
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 47ms/step - accuracy: 0.8140 - loss: 0.4577 - val_accuracy: 0.6209 - val_loss: 0.9163
Epoch 3/10
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 51ms/step - accuracy: 0.9086 - loss: 0.2620 - val_accuracy: 0.6220 - val_loss: 1.0276
Epoch 4/10
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 54ms/step - accuracy: 0.9479 - loss: 0.1565 - val_accuracy: 0.6220 - val_loss: 1.3729
Epoch 5/10
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 54ms/step - accuracy: 0.9657 - loss: 0.1021 - val_accuracy: 0.6262 - val_loss: 1.5746
Epoch 6/10
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 53ms/step - accuracy: 0.9776 - loss: 0.0756 - val_accuracy: 0.6199 - val_loss: 1.7410
Epoch 7/10
[1m2

Test accuracy: 76.34%

We chose some random texts. The model failed on the "middle ages" sentence (is "middle ages" not associated with negativity?).