# Lab Exercice 02

## Instructions

The csv files are a collection of tweets labelled with sentiment in 3 categories:

sentiments = {
    "LABEL_0": "Bearish", 
    "LABEL_1": "Bullish", 
    "LABEL_2": "Neutral"
}  

Train a LSTM network to with the training file. Validate the trained model with the valid file. Comment what you are doing in each part of your code. As the better the code, comments and result validation as the better the grade.

### Imports

In [70]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import classification_report

### Visualize the data sets

In [71]:
# Load the datasets
train_file = "sent_train.csv"
valid_file = "sent_valid.csv"

# Read and display datasets head to test
df_train = pd.read_csv(train_file)
df_train.head()


Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0


In [72]:
df_valid = pd.read_csv(valid_file)
df_valid.head()

Unnamed: 0,text,label
0,$ALLY - Ally Financial pulls outlook https://t...,0
1,"$DELL $HPE - Dell, HPE targets trimmed on comp...",0
2,$PRTY - Moody's turns negative on Party City h...,0
3,$SAN: Deutsche Bank cuts to Hold,0
4,$SITC: Compass Point cuts to Sell,0


In [73]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    return df['text'].astype(str), df['label']

### Manage the data

In [74]:
# Load train and validation data
X_train, y_train = load_data(train_file)
X_valid, y_valid = load_data(valid_file)

### Tokenization

In [75]:
# Tokenization and Padding
max_words = 10000
max_len = 100

In [76]:
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len, padding='post')
X_valid_seq = pad_sequences(tokenizer.texts_to_sequences(X_valid), maxlen=max_len, padding='post')

### LSTM model

In [None]:
# Build LSTM Model
model = Sequential([  
    # Embedding layer: Converts words (encoded as integers) into dense vector representations
    Embedding(input_dim=max_words,  # The size of the vocabulary
              output_dim=128,       # Dimension of the embedding vectors
              input_length=max_len), # Maximum length of input sequences

    # First LSTM layer with 64 units and returning sequences for stacking
    LSTM(64, return_sequences=True),  
    
    # Dropout layer to prevent overfitting by randomly dropping 50% of neurons
    Dropout(0.5),  

    # Second LSTM layer with 32 units (no return_sequences, as it's the last LSTM layer)
    LSTM(32),  

    # Dense output layer with 3 neurons (one for each sentiment class)
    # Using softmax activation to output probabilities for each class (Bearish, Bullish, Neutral)
    Dense(3, activation='softmax')  
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy',  # Suitable for integer-encoded labels
              optimizer='adam',  # Adam optimizer for efficient training
              metrics=['accuracy'])  # Track accuracy during training and evaluation




### Training

In [80]:
# Train Model
history = model.fit(X_train_seq, y_train, epochs=5, batch_size=32)


Epoch 1/5
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 102ms/step - accuracy: 0.6561 - loss: 0.8812
Epoch 2/5
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 95ms/step - accuracy: 0.6503 - loss: 0.8881
Epoch 3/5
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 108ms/step - accuracy: 0.6512 - loss: 0.8851
Epoch 4/5
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 96ms/step - accuracy: 0.6513 - loss: 0.8868
Epoch 5/5
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 106ms/step - accuracy: 0.6472 - loss: 0.8921


### Validation

In [None]:
# Validate Model
validation_loss, validation_accuracy = model.evaluate(X_valid_seq, y_valid)
print(f'Validation Loss: {validation_loss}')
print(f'Validation Accuracy: {validation_accuracy}')


[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step
              precision    recall  f1-score   support

     Bearish       0.00      0.00      0.00       347
     Bullish       0.00      0.00      0.00       475
     Neutral       0.66      1.00      0.79      1566

    accuracy                           0.66      2388
   macro avg       0.22      0.33      0.26      2388
weighted avg       0.43      0.66      0.52      2388



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
