# Quantexa: Sentiment Analysis

By Ana Lucia Diaz Leppe

## Libraries 

In [1]:
import re
import random
import string
import torch
from torch.utils.data import TensorDataset, DataLoader
from collections import Counter
import torch.nn as nn
import torch.optim as optim
from torch.nn import CrossEntropyLoss
import numpy as np
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR

## Read Data

In [2]:
def read_and_prepare_data(file_paths, labels, max_lines=2200):
    """
    Read from files, limit the number of lines and handle the labels
    """
    texts = []
    
    for file_path, label in zip(file_paths, labels):
        print(f'Processing file: {file_path} with label: {label}')
        
        # Count the total number of lines in the file
        line_count = 0
        with open(file_path, 'r', encoding='latin-1') as file:
            for _ in file:
                line_count += 1
        
        # Read up to max_lines from the file
        lines_read = 0
        with open(file_path, 'r', encoding='latin-1') as file:
            for line in file:
                if lines_read >= max_lines:
                    break
                stripped_line = line.strip()
                if stripped_line:
                    texts.append((stripped_line, label))
                    print(f'Added text: {stripped_line} with label: {label}')
                    lines_read += 1
    
    return texts

# Paths to your data files
file_paths = ['data/positive', 'data/negative', 'data/neutral']
labels = [1, 0, 2]  # Corresponding labels for positive, negative, and neutral

# Read and prepare the data with a limit of 500 lines per file
texts_and_labels = read_and_prepare_data(file_paths, labels)
print(f'Number of texts and labels: {len(texts_and_labels)}')

Processing file: data/positive with label: 1
Added text: @VirginAmerica plus you've added commercials to the experience... tacky. with label: 1
Added text: @VirginAmerica yes, nearly every time I fly VX this âear wormâ wonât go away :) with label: 1
Added text: @virginamerica Well, I didn'tâ¦but NOW I DO! :-D with label: 1
Added text: @VirginAmerica it was amazing, and arrived an hour early. You're too good to me. with label: 1
Added text: @VirginAmerica I &lt;3 pretty graphics. so much better than minimal iconography. :D with label: 1
Added text: @VirginAmerica This is such a great deal! Already thinking about my 2nd trip to @Australia &amp; I haven't even gone on my 1st trip yet! ;p with label: 1
Added text: @VirginAmerica @virginmedia I'm flying your #fabulous #Seductive skies again! U take all the #stress away from travel http://t.co/ahlXHhKiyn with label: 1
Added text: @VirginAmerica Thanks! with label: 1
Added text: @VirginAmerica So excited for my first cross country flig

In [3]:
texts_and_labels

[("@VirginAmerica plus you've added commercials to the experience... tacky.",
  1),
 ('@VirginAmerica yes, nearly every time I fly VX this â\x80\x9cear wormâ\x80\x9d wonâ\x80\x99t go away :)',
  1),
 ("@virginamerica Well, I didn'tâ\x80¦but NOW I DO! :-D", 1),
 ("@VirginAmerica it was amazing, and arrived an hour early. You're too good to me.",
  1),
 ('@VirginAmerica I &lt;3 pretty graphics. so much better than minimal iconography. :D',
  1),
 ("@VirginAmerica This is such a great deal! Already thinking about my 2nd trip to @Australia &amp; I haven't even gone on my 1st trip yet! ;p",
  1),
 ("@VirginAmerica @virginmedia I'm flying your #fabulous #Seductive skies again! U take all the #stress away from travel http://t.co/ahlXHhKiyn",
  1),
 ('@VirginAmerica Thanks!', 1),
 ("@VirginAmerica So excited for my first cross country flight LAX to MCO I've heard nothing but great things about Virgin America. #29DaysToGo",
  1),
 ('I â\x9d¤ï¸\x8f flying @VirginAmerica. â\x98ºï¸\x8fð\x9f\x91\x8

In [4]:
# Step 1: Extract all the texts from the list of lists
texts = [item[0] for item in texts_and_labels]

# Step 2: Combine the texts into a single string, separated by newlines
combined_text = "\\n".join(texts)  # Use double backslash to escape the backslash

# Output the result
print(combined_text)



## Data pre-processing

1. cleaning the text
2. character level encoding - build vocabulary
3. encoding text
4. paddle sequences
5. Things I could have done: outliers and a type of SMOTE (balancing classes)

The first step when building a neural network model is getting your data into the proper form to feed into the network. 

In [5]:
def cleaning_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'@\w+', '', tweet)  # Tweets start with User
    tweet = re.sub(r'https?:\/\/\S+', '', tweet)  # no URL
    tweet = re.sub(r'[^\w\s]', '', tweet)  # no special characters
    return tweet

def build_vocab(texts):
    chars = set(''.join(texts))
    chars = sorted(chars)
    char_to_idx = {char: idx + 1 for idx, char in enumerate(chars)}
    char_to_idx['<PAD>'] = 0 
    return char_to_idx


In [6]:
char_to_idx = build_vocab(combined_text)
max_len = 100

In [7]:
char_to_idx

{' ': 1,
 '!': 2,
 '"': 3,
 '#': 4,
 '$': 5,
 '%': 6,
 '&': 7,
 "'": 8,
 '(': 9,
 ')': 10,
 '*': 11,
 '+': 12,
 ',': 13,
 '-': 14,
 '.': 15,
 '/': 16,
 '0': 17,
 '1': 18,
 '2': 19,
 '3': 20,
 '4': 21,
 '5': 22,
 '6': 23,
 '7': 24,
 '8': 25,
 '9': 26,
 ':': 27,
 ';': 28,
 '=': 29,
 '?': 30,
 '@': 31,
 'A': 32,
 'B': 33,
 'C': 34,
 'D': 35,
 'E': 36,
 'F': 37,
 'G': 38,
 'H': 39,
 'I': 40,
 'J': 41,
 'K': 42,
 'L': 43,
 'M': 44,
 'N': 45,
 'O': 46,
 'P': 47,
 'Q': 48,
 'R': 49,
 'S': 50,
 'T': 51,
 'U': 52,
 'V': 53,
 'W': 54,
 'X': 55,
 'Y': 56,
 'Z': 57,
 '[': 58,
 '\\': 59,
 ']': 60,
 '^': 61,
 '_': 62,
 'a': 63,
 'b': 64,
 'c': 65,
 'd': 66,
 'e': 67,
 'f': 68,
 'g': 69,
 'h': 70,
 'i': 71,
 'j': 72,
 'k': 73,
 'l': 74,
 'm': 75,
 'n': 76,
 'o': 77,
 'p': 78,
 'q': 79,
 'r': 80,
 's': 81,
 't': 82,
 'u': 83,
 'v': 84,
 'w': 85,
 'x': 86,
 'y': 87,
 'z': 88,
 '{': 89,
 '|': 90,
 '}': 91,
 '~': 92,
 '\x80': 93,
 '\x81': 94,
 '\x82': 95,
 '\x83': 96,
 '\x84': 97,
 '\x85': 98,
 '\x86': 9

In [8]:
len(char_to_idx)

158

## encoding text

In [9]:
def encode_text(text, char_to_idx):
    return [char_to_idx.get(c, 0) for c in text]

## Padding Sequence

In [10]:
def pad_sequences(sequences, max_len):
    # Initialize padded_sequences with zeros
    padded_sequences = np.zeros((len(sequences), max_len), dtype=int)
    
    for i, seq in enumerate(sequences):
        if len(seq) > 0:  # Only process non-empty sequences
            if len(seq) > max_len:
                padded_sequences[i, :max_len] = seq[:max_len]
            else:
                padded_sequences[i, -len(seq):] = seq
                
    return torch.tensor(padded_sequences)

## How is Managing Char

In [11]:
counter = 0
for tweet in texts:
    counter += 1
    cleaned_tweet = cleaning_tweet(tweet)
    encoded_tweet = encode_text(cleaned_tweet, char_to_idx)
    padded_tweet = pad_sequences([encoded_tweet], max_len)
    print(f'Original Tweet: {tweet}')
    print(f'Cleaned Tweet: {cleaned_tweet}')
    print(f'Encoded Tweet: {encoded_tweet}')
    print(f'Padded Tweet Shape: {padded_tweet.shape}')
    print(f'Padded Tweet: {padded_tweet}')
    print("COUNTER")
    print(counter)
    print('---')

Original Tweet: @VirginAmerica plus you've added commercials to the experience... tacky.
Cleaned Tweet:  plus youve added commercials to the experience tacky
Encoded Tweet: [1, 78, 74, 83, 81, 1, 87, 77, 83, 84, 67, 1, 63, 66, 66, 67, 66, 1, 65, 77, 75, 75, 67, 80, 65, 71, 63, 74, 81, 1, 82, 77, 1, 82, 70, 67, 1, 67, 86, 78, 67, 80, 71, 67, 76, 65, 67, 1, 82, 63, 65, 73, 87]
Padded Tweet Shape: torch.Size([1, 100])
Padded Tweet: tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 78, 74, 83, 81,  1, 87,
         77, 83, 84, 67,  1, 63, 66, 66, 67, 66,  1, 65, 77, 75, 75, 67, 80, 65,
         71, 63, 74, 81,  1, 82, 77,  1, 82, 70, 67,  1, 67, 86, 78, 67, 80, 71,
         67, 76, 65, 67,  1, 82, 63, 65, 73, 87]], dtype=torch.int32)
COUNTER
1
---
Original Tweet: @VirginAmerica yes, nearly every time I fly VX this âear wor

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Padded Tweet: tensor([[ 1, 76, 77,  1, 71, 82, 81,  1, 85, 67, 71, 80, 66,  1, 71,  1, 78, 71,
         65, 73, 67, 66,  1, 77, 82, 70, 67, 80,  1, 65, 71, 82, 71, 67, 81,  1,
         72, 83, 81, 82,  1, 82, 77,  1, 82, 67, 81, 82,  1, 82, 70, 77, 81, 67,
          1, 85, 77, 80, 73, 67, 66, 76, 77, 82,  1, 82, 70, 67,  1, 77, 76, 67,
          1, 71,  1, 85, 63, 76, 82,  1, 85, 77, 80, 73, 81,  1, 77, 76,  1, 78,
         70, 77, 76, 67,  1, 82, 70, 77, 83, 69]], dtype=torch.int32)
COUNTER
6000
---
Original Tweet: Listen, im not gonna deny this but... RT @JetBlue: Our fleet's on fleek. http://t.co/eNXV64RkbU
Cleaned Tweet: listen im not gonna deny this but rt  our fleets on fleek 
Encoded Tweet: [74, 71, 81, 82, 67, 76, 1, 71, 75, 1, 76, 77, 82, 1, 69, 77, 76, 76, 63, 1, 66, 67, 76, 87, 1, 82, 70, 71, 81, 1, 64, 83, 82, 1, 80, 82, 1, 1, 77, 83, 80, 1, 68, 74, 67, 67, 82, 81, 1, 77, 76, 1, 68, 74, 67, 67, 73, 1]
Padded Tweet Shape: torch.Size([1, 100])
Padded Tweet: tensor([[ 0,  0,  

## Now Really Proccess it

In [12]:
# Encode and pad texts
encoded_texts = []
processed_labels = []


for text, label in texts_and_labels:
    cleaned_text = cleaning_tweet(text)
    encoded_text = encode_text(cleaned_text, char_to_idx)
    padded_text = pad_sequences([encoded_text], max_len)
    
    encoded_texts.append(padded_text)
    processed_labels.append(label)

# Convert lists to numpy arrays
padded_texts = np.concatenate(encoded_texts, axis=0)
processed_labels = np.array(processed_labels)

print(f'Number of padded texts: {padded_texts.shape[0]}')
print(f'Number of processed labels: {len(processed_labels)}')

Number of padded texts: 6600
Number of processed labels: 6600


In [13]:
padded_texts

array([[ 0,  0,  0, ..., 65, 73, 87],
       [ 0,  0,  0, ..., 63, 87,  1],
       [ 0,  0,  0, ..., 77,  1, 66],
       ...,
       [ 0,  0,  0, ..., 82, 65, 70],
       [ 0,  0,  0, ..., 80, 82,  1],
       [ 0,  0,  0, ...,  1,  1,  1]])

In [14]:
processed_labels

array([1, 1, 1, ..., 2, 2, 2])

In [15]:
texts_and_labels

[("@VirginAmerica plus you've added commercials to the experience... tacky.",
  1),
 ('@VirginAmerica yes, nearly every time I fly VX this â\x80\x9cear wormâ\x80\x9d wonâ\x80\x99t go away :)',
  1),
 ("@virginamerica Well, I didn'tâ\x80¦but NOW I DO! :-D", 1),
 ("@VirginAmerica it was amazing, and arrived an hour early. You're too good to me.",
  1),
 ('@VirginAmerica I &lt;3 pretty graphics. so much better than minimal iconography. :D',
  1),
 ("@VirginAmerica This is such a great deal! Already thinking about my 2nd trip to @Australia &amp; I haven't even gone on my 1st trip yet! ;p",
  1),
 ("@VirginAmerica @virginmedia I'm flying your #fabulous #Seductive skies again! U take all the #stress away from travel http://t.co/ahlXHhKiyn",
  1),
 ('@VirginAmerica Thanks!', 1),
 ("@VirginAmerica So excited for my first cross country flight LAX to MCO I've heard nothing but great things about Virgin America. #29DaysToGo",
  1),
 ('I â\x9d¤ï¸\x8f flying @VirginAmerica. â\x98ºï¸\x8fð\x9f\x91\x8

## Training, Validation, Test

In [16]:
train_val_x, test_x, train_val_y, test_y = train_test_split(padded_texts, processed_labels, test_size=0.2, stratify=processed_labels, random_state=123)
train_x, val_x, train_y, val_y = train_test_split(train_val_x, train_val_y, test_size=0.1, stratify=train_val_y, random_state=123)

train_x = torch.tensor(train_x, dtype=torch.int64)
train_y = torch.tensor(train_y, dtype=torch.int64)
val_x = torch.tensor(val_x, dtype=torch.int64)
val_y = torch.tensor(val_y, dtype=torch.int64)
test_x = torch.tensor(test_x, dtype=torch.int64)
test_y = torch.tensor(test_y, dtype=torch.int64)

# Dataloaders :)
batch_size = 80
train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(val_x, val_y)
test_data = TensorDataset(test_x, test_y)

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)


print("Training set shape:", train_x.shape)
print("Validation set shape:", val_x.shape)
print("Test set shape:", test_x.shape)

Training set shape: torch.Size([4752, 100])
Validation set shape: torch.Size([528, 100])
Test set shape: torch.Size([1320, 100])


## Check Data Distribution 

In [17]:
def label_distribution(labels):
    return dict(Counter(labels.tolist()))

print("Training set label distribution:", label_distribution(train_y))
print("Validation set label distribution:", label_distribution(val_y))
print("Test set label distribution:", label_distribution(test_y))

Training set label distribution: {1: 1584, 2: 1584, 0: 1584}
Validation set label distribution: {1: 176, 0: 176, 2: 176}
Test set label distribution: {0: 440, 2: 440, 1: 440}


In [18]:
train_y

tensor([1, 1, 1,  ..., 1, 1, 1])

In [19]:
train_x

tensor([[ 0,  1, 82,  ..., 69, 70, 82],
        [71, 82,  1,  ...,  1, 65, 77],
        [ 0,  0,  0,  ..., 77, 75,  1],
        ...,
        [ 0,  0,  0,  ..., 82, 67, 80],
        [ 0,  0,  0,  ..., 63, 73,  1],
        [ 1, 73, 83,  ..., 81, 81, 83]])

## My Hybrid Model

#### 1. Self-Attention Mechanism (SelfAttention class):

Goal: helps to get the dependencies between characters in the input sequence so you can give more attention on different parts of a returning sequence when processing each character.
Structure:
1.1 Linear Layers: Mapping the input token embeddings to Que, Key, Val matrices.
1.2 Multi-Head Attention: It divides the word embeddings into multiple heads, and each head attends onto a different part of the sequence.
1.3 Scaled Dot-Product Attention: Calculates attention scores by scaling the dot product of queries and keys, followed by applying a softmax function.
1.4 Concatenation and Output (results of all heads are combined followed by the final linear layer.)

#### 2. Convolutional Layers:

Goal: Capture patterns in the character embeddings with convolutional operations to extract local features
Structure:
Three Convolutional Layers: These layers start with a small number of filters, which increases further downstream and is followed by MaxPooling to reduce the dimensional space while capturing more hierarchical features.

#### 3. LSTM Layer:

Use: Takes the sequence of features produced from conv layers and support temporal dependencies
Structure:
Bidirectional LSTM: It enables the model to capture context from both sides of sample.

#### 4. Fully Connected Layer:
Use: Maps the activations of the LSTM to a final classification space.
Architecture: Linear transformation for each class-producing logits as output.

#### 5. Dropout Layer:
Goal: Preventing overftting by randomly turn off a fraction of the input units at each time during training.

In [20]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, num_heads):
        super(SelfAttention, self).__init__()

        self.embed_size = embed_size
        self.num_heads = num_heads
        self.head_dim = embed_size // num_heads
        self.values = nn.Linear(embed_size, embed_size, bias=False)
        self.keys = nn.Linear(embed_size, embed_size, bias=False)
        self.queries = nn.Linear(embed_size, embed_size, bias=False)
        self.fc_out = nn.Linear(embed_size, embed_size)
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, x):
        N = x.shape[0]
        value_len, key_len, query_len = x.shape[1], x.shape[1], x.shape[1]
        values = self.values(x).view(N, value_len, self.num_heads, self.head_dim).to(device)
        keys = self.keys(x).view(N, key_len, self.num_heads, self.head_dim).to(device)
        queries = self.queries(x).view(N, query_len, self.num_heads, self.head_dim).to(device)
        values = values.permute(0, 2, 1, 3)
        keys = keys.permute(0, 2, 1, 3)
        queries = queries.permute(0, 2, 1, 3)
        energy = torch.matmul(queries, keys.permute(0, 1, 3, 2)) / self.scale
        attention = torch.nn.functional.softmax(energy, dim=3)
        out = torch.matmul(attention, values)
        out = out.permute(0, 2, 1, 3).contiguous().view(N, query_len, self.embed_size)
        return self.fc_out(out)

class TransformerLSTM(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, hidden_size, output_size, num_layers, dropout=0.5):
        super(TransformerLSTM, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_size).to(device)
        
        # Layers
        self.conv1 = nn.Conv1d(in_channels=embed_size, out_channels=128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2)
        
        self.self_attention = nn.ModuleList([SelfAttention(embed_size, num_heads) for _ in range(3)])  # Multiple Self-Attention Layers
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x).permute(0, 2, 1)
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        x = x.permute(0, 2, 1)
        
        for layer in self.self_attention:
            x = layer(x)
        
        x, _ = self.lstm(x)
        return self.fc(self.dropout(x[:, -1, :]))

In [21]:
vocab_size = len(char_to_idx)
embed_size = 512
num_heads = 8
hidden_size = 512
output_size = 3
num_layers = 2
dropout = 0.5

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model = TransformerLSTM(vocab_size, embed_size, num_heads, hidden_size, output_size, num_layers, dropout).to(device)

# Define loss function and optimizer
criterion = CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

In [23]:
# Training loop
num_epochs = 10
best_val_loss = float('inf')
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * texts.size(0)
    
    train_loss = train_loss / len(train_loader.dataset)
    
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for texts, labels in valid_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * texts.size(0)
    
    val_loss = val_loss / len(valid_loader.dataset)
    
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pt')
    
    scheduler.step()


Epoch 1/10, Train Loss: 1.0597, Validation Loss: 1.0272
Epoch 2/10, Train Loss: 1.0251, Validation Loss: 1.0213
Epoch 3/10, Train Loss: 0.9881, Validation Loss: 0.9960
Epoch 4/10, Train Loss: 0.8623, Validation Loss: 0.8436
Epoch 5/10, Train Loss: 0.7959, Validation Loss: 0.8429
Epoch 6/10, Train Loss: 0.6750, Validation Loss: 0.7975
Epoch 7/10, Train Loss: 0.5982, Validation Loss: 0.8053
Epoch 8/10, Train Loss: 0.5356, Validation Loss: 0.8312
Epoch 9/10, Train Loss: 0.4603, Validation Loss: 0.8931
Epoch 10/10, Train Loss: 0.3816, Validation Loss: 0.9666


In [24]:
for texts, labels in train_loader:
    print(labels)

tensor([0, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 0, 0, 2, 2, 0, 2, 1, 0, 2, 2, 1, 2, 1,
        0, 1, 0, 1, 1, 0, 2, 0, 0, 2, 1, 2, 2, 0, 0, 1, 0, 0, 0, 1, 1, 2, 0, 2,
        1, 0, 2, 1, 2, 2, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1,
        0, 1, 0, 2, 0, 0, 0, 2])
tensor([2, 1, 1, 0, 1, 0, 0, 2, 2, 2, 1, 2, 0, 1, 1, 1, 1, 1, 2, 1, 2, 1, 0, 1,
        2, 1, 2, 0, 2, 0, 0, 0, 0, 0, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0,
        2, 0, 2, 2, 0, 1, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 2, 2, 0, 2, 0, 0, 0, 0,
        0, 2, 2, 1, 2, 1, 1, 2])
tensor([0, 2, 2, 0, 0, 0, 1, 0, 2, 1, 2, 1, 2, 1, 0, 2, 0, 1, 2, 0, 2, 0, 1, 0,
        2, 1, 2, 1, 0, 1, 1, 0, 1, 1, 1, 2, 1, 2, 1, 0, 2, 1, 2, 0, 1, 2, 2, 2,
        0, 1, 2, 2, 2, 0, 1, 0, 2, 1, 0, 2, 0, 0, 1, 0, 2, 0, 2, 2, 2, 2, 0, 2,
        2, 1, 1, 1, 0, 2, 1, 1])
tensor([2, 2, 1, 0, 2, 2, 0, 2, 2, 1, 2, 2, 0, 2, 2, 0, 0, 1, 2, 2, 1, 2, 1, 2,
        0, 2, 1, 0, 2, 0, 0, 1, 0, 0, 2, 1, 2, 0, 0, 0, 1, 1, 1, 1, 2, 0, 0, 2,
        0, 0, 1, 1, 2

In [25]:
for texts, labels in train_loader:
    print(texts)

tensor([[ 0,  0,  0,  ..., 69, 70, 82],
        [ 0,  0,  0,  ..., 67, 87,  1],
        [ 0,  0,  0,  ..., 66, 71, 63],
        ...,
        [ 0,  0,  0,  ..., 67, 74, 78],
        [ 0,  0,  0,  ..., 87, 77, 83],
        [ 1, 63, 76,  ..., 63, 71, 75]])
tensor([[ 1, 71, 82,  ..., 83, 80,  1],
        [72, 83, 81,  ..., 77, 75, 67],
        [ 1, 71, 65,  ..., 80, 76, 71],
        ...,
        [ 0,  0,  0,  ..., 74, 83, 67],
        [ 1, 68, 71,  ..., 78, 80, 71],
        [ 0,  0,  0,  ..., 83, 76, 66]])
tensor([[ 1, 68, 74,  ..., 63, 87,  1],
        [ 0,  0,  0,  ..., 67, 76,  1],
        [ 0,  0,  0,  ..., 74, 63, 86],
        ...,
        [ 1, 74, 77,  ..., 81, 67, 66],
        [ 0,  0,  0,  ..., 82, 67, 66],
        [ 1, 71,  1,  ..., 70, 67, 80]])
tensor([[ 0,  0,  0,  ..., 76, 73, 81],
        [ 0,  0,  0,  ..., 67, 76, 82],
        [ 1,  1, 82,  ..., 82,  1, 68],
        ...,
        [ 0,  0,  0,  ..., 71, 78, 82],
        [ 0,  0,  0,  ..., 76, 82,  1],
        [ 1, 81, 82,  ...

## Testing

In [26]:
from sklearn.metrics import precision_recall_fscore_support

# Evaluation
model.load_state_dict(torch.load('best_model.pt'))
model.eval()

test_losses = []
num_correct = 0
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        output = model(inputs)
        test_loss = criterion(output, labels)
        test_losses.append(test_loss.item())
        
        pred = torch.argmax(output, dim=1)
        num_correct += (pred == labels).sum().item()
        
        all_preds.extend(pred.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

avg_test_loss = np.mean(test_losses)
test_acc = num_correct / len(test_loader.dataset)

# Calculate precision, recall, and F1-score
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Compute precision, recall, and F1 score
precision, recall, f1_score, _ = precision_recall_fscore_support(all_labels, all_preds, average=None, labels=[0, 1, 2])

# Calculate counts of each class
unique, counts = np.unique(all_preds, return_counts=True)
pred_counts = dict(zip(unique, counts))
total_preds = len(all_preds)

# Print out proportions and metrics
print(f'Test Loss: {avg_test_loss:.3f}')
print(f'Test Accuracy: {test_acc:.3f}')

for label in sorted(pred_counts.keys()):
    proportion = pred_counts[label] / total_preds
    print(f'Proportion of predictions for class {label}: {proportion:.3f}')

# Print precision, recall, and F1-score for each class
for i, label in enumerate([0, 1, 2]):
    print(f'Class {label}: Precision = {precision[i]:.3f}, Recall = {recall[i]:.3f}, F1-Score = {f1_score[i]:.3f}')

  model.load_state_dict(torch.load('best_model.pt'))


Test Loss: 0.796
Test Accuracy: 0.686
Proportion of predictions for class 0: 0.325
Proportion of predictions for class 1: 0.326
Proportion of predictions for class 2: 0.349
Class 0: Precision = 0.720, Recall = 0.702, F1-Score = 0.711
Class 1: Precision = 0.735, Recall = 0.718, F1-Score = 0.726
Class 2: Precision = 0.607, Recall = 0.636, F1-Score = 0.622


# Single Tweet testing

In [97]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TransformerLSTM(vocab_size, embed_size, num_heads, hidden_size, output_size, num_layers, dropout).to(device)

# Load the saved model state
model.load_state_dict(torch.load('ana-q-aylien_trained.pt', map_location=device))
model.eval()

  model.load_state_dict(torch.load('ana-q-aylien_trained.pt', map_location=device))


TransformerLSTM(
  (embedding): Embedding(158, 512)
  (conv1): Conv1d(512, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (self_attention): ModuleList(
    (0-2): 3 x SelfAttention(
      (values): Linear(in_features=512, out_features=512, bias=False)
      (keys): Linear(in_features=512, out_features=512, bias=False)
      (queries): Linear(in_features=512, out_features=512, bias=False)
      (fc_out): Linear(in_features=512, out_features=512, bias=True)
    )
  )
  (lstm): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=1024, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [100]:
# Example tweet
tweet = "@PascalTheRM I play with my toys all day long"

# Clean the tweet
cleaned_tweet = cleaning_tweet(tweet)

# Encode the tweet
encoded_tweet = encode_text(cleaned_tweet, char_to_idx)

# Pad the tweet
max_len = 100  # Ensure this matches the max_len used during training
padded_tweet = pad_sequences([encoded_tweet], max_len)

# Convert to a tensor and move to the correct device
padded_tweet_tensor = padded_tweet.to(device)

In [101]:
# Ensure the model is in evaluation mode
model.eval()

# Disable gradient calculation
with torch.no_grad():
    output = model(padded_tweet_tensor)

# Get the predicted class (index with the highest score)
predicted_class_idx = torch.argmax(output, dim=1).item()

# Map the index to the sentiment label
class_labels = {0: 'Negative', 1: 'Positive', 2: 'Neutral'}
predicted_label = class_labels[predicted_class_idx]

print(f'The predicted sentiment for the tweet "{tweet}" is: {predicted_label}')

The predicted sentiment for the tweet "@PascalTheRM I play with my toys all day long" is: Neutral
