**Install necessary packages and downgrade torchtext**

In [None]:
!pip install torchtext==0.6.0
!pip install spacy

In [None]:
# Optional installs, currently disabled:
#!pip install numpy==1.21.0
#!pip install pandas
# Download the small English spaCy model; later cells load it by the name 'en_core_web_sm'.
!python -m spacy download en_core_web_sm

**Load the dataset + Split dataset + Data Cleaning**

In [None]:
import pandas as pd
base_csv = './training.1600000.processed.noemoticon.csv'

# The file is read without a header row, so the column names are supplied here.
# Only the label and the tweet text are used downstream; `usecols` avoids
# loading the other four columns of this large file into memory at all.
df = pd.read_csv(
    base_csv,
    encoding='ISO-8859-1',
    header=None,
    names=['sentiment', 'id', 'date', 'flag', 'user', 'tweet'],
    usecols=['sentiment', 'tweet'],
)

import re

def clean_text(text):
    """Normalise a raw tweet for tokenisation.

    Steps, in order: lowercase, drop URLs, drop @handles, drop the '#' of
    hashtags (the tag word itself is kept), replace every character outside
    ``A-Za-z0-9()!?'`"`` with a space, and finally collapse runs of
    whitespace into a single space.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (e.g. NaN) is
            treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # URLs and handles go first, while their '/', ':' and '@' are still intact.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@[^\s]+', '', text)
    text = text.replace('#', '')
    # str.replace() matches literally, so regex patterns must go through re.sub.
    text = re.sub(r"[^A-Za-z0-9()!?'`\"]", ' ', text)
    # Collapse whitespace last: the removals above leave gaps behind.
    text = re.sub(r'\s{2,}', ' ', text)
    return text

# .assign() returns a new frame rather than writing into `df` in place, which
# avoids pandas' SettingWithCopyWarning when `df` is a column slice of
# another frame.
df = df.assign(tweet=df['tweet'].apply(clean_text))

# Specify the file path where you want to save the CSV file
csv_file_path = 'data.csv'

# Save the DataFrame to a CSV file with headers
df.to_csv(csv_file_path, index=False)

# Preview the cleaned data. A notebook cell only displays its last expression,
# so the preview has to come after the save.
df.head()

In [None]:
import spacy
spacy.load('en_core_web_sm')
import torch
import torchtext.data as data

SEED = 1234

# Seed PyTorch and request deterministic cuDNN kernels for repeatable runs.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# include_lengths=True makes each batch's `.text` a (tokens, lengths) pair,
# which the model's forward pass needs to pack the padded sequences.
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  include_lengths = True)

LABEL = data.LabelField(dtype = torch.float)

# Map each CSV header name to (attribute name on the example, Field).
# This is the single mapping used below; the training code reads the
# attributes as `batch.text` and `batch.label`.
fields = {'sentiment': ('label', LABEL), 'tweet': ('text', TEXT)}

# Load the cleaned CSV into a torchtext TabularDataset.
# skip_header stays False: with a dict of fields the header row is what the
# dict keys are matched against, so it must not be discarded.
all_data = data.TabularDataset(
    path='data.csv',
    format='csv',
    fields=fields,
    skip_header=False
)
print(all_data[:10])

[<torchtext.data.example.Example object at 0x0000012728DF2A50>, <torchtext.data.example.Example object at 0x000001270A181CD0>, <torchtext.data.example.Example object at 0x000001270FFC9C90>, <torchtext.data.example.Example object at 0x000001272C588550>, <torchtext.data.example.Example object at 0x000001272C58BC90>, <torchtext.data.example.Example object at 0x000001272C58BDD0>, <torchtext.data.example.Example object at 0x000001272C588450>, <torchtext.data.example.Example object at 0x0000012709104290>, <torchtext.data.example.Example object at 0x0000012709104150>, <torchtext.data.example.Example object at 0x000001272C58B750>]


In [None]:
# Upper bound on the number of distinct tokens kept in the vocabulary.
# (The embedding matrix printed further down has 25,002 rows — presumably
# these 25,000 tokens plus the unknown and padding tokens.)
MAX_VOCAB_SIZE = 25_000

# Build the token vocabulary and attach 100-dimensional GloVe vectors to it.
# unk_init is given torch.Tensor.normal_; presumably it initialises the vectors
# of tokens that have no GloVe entry — confirm against the torchtext docs.
# NOTE(review): the vocabulary is built from all_data, before the
# train/validation/test split performed in a later cell, so held-out tweets
# influence it. Consider splitting first and building from the training
# split only.
TEXT.build_vocab(all_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)

# Build the label vocabulary from the same examples.
LABEL.build_vocab(all_data)

.vector_cache\glove.6B.zip: 862MB [02:43, 5.28MB/s]                                                  
100%|████████████████████████████████████████████████████▉| 399999/400000 [00:24<00:00, 16381.68it/s]


In [None]:
import spacy
import random
nlp = spacy.load("en_core_web_sm")

# Split the data into training (60%), validation (20%), and test (20%).
# random.seed() returns None, so it cannot be passed as `random_state`
# directly: seed the generator first, then hand its state to split().
# NOTE(review): torchtext documents a three-element split_ratio as
# [train, test, valid] while the returned order is (train, valid, test).
# It makes no difference here because both held-out shares are 0.2.
random.seed(42)
train_data, valid_data, test_data = all_data.split(
    split_ratio=[0.6, 0.2, 0.2],
    random_state=random.getstate(),
)

# Create iterators
BATCH_SIZE = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# sort_within_batch=True orders each batch by length, which is what
# pack_padded_sequence in the model's forward pass expects by default.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda x: len(x.text),
    device=device
)

cuda


**C) Model using Bi-LSTM**

In [None]:
import torch.nn as nn

class ImprovedRNN(nn.Module):
    """Embedding -> (bi)LSTM -> linear classifier over packed sequences.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output logits per example.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied to the embeddings and to the
            final hidden state, and between LSTM layers when ``n_layers > 1``.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # nn.LSTM applies its own dropout only between stacked layers; with a
        # single layer a non-zero value does nothing except raise a
        # UserWarning, so it is only passed when layers are actually stacked.
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout if n_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of logits per sequence in the batch.

        Args:
            text: Token-index tensor, sequence-first (the LSTM is created
                without ``batch_first``).
            text_lengths: Length of each sequence in the batch; must be in
                decreasing order because packing uses its default
                ``enforce_sorted=True``.
        """
        embedded = self.dropout(self.embedding(text))
        # Lengths are moved to the CPU before being handed to pack_padded_sequence.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        output, (hidden, _) = self.rnn(packed_embedded)
        if self.rnn.bidirectional:
            # Last layer's forward (-2) and backward (-1) final hidden states.
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        return self.fc(hidden)

In [None]:
# Hyper-parameters of the classifier.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100  # must match the width of the pretrained vectors copied in below
HIDDEN_DIM = 256
OUTPUT_DIM = 1       # a single logit per tweet
N_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.5
# Only referenced by the commented-out embedding reset further down.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# model
model = ImprovedRNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
# Module.to() without an argument moves nothing; place the model on the
# device selected when the iterators were created.
model = model.to(device)


def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

# Report the model size using count_parameters().
print(f'The model has {count_parameters(model):,} trainable parameters')

# Vectors stored on the vocabulary (TEXT.build_vocab was called with vectors=...).
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

# Overwrite the embedding layer's weights in place with the pretrained vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)

# Disabled: zeroing the embedding rows of the unknown and padding tokens.
#UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

#model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
#model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# Show the embedding matrix after the copy.
print(model.embedding.weight.data)



The model has 3,233,897 trainable parameters
torch.Size([25002, 100])
tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [ 0.4298,  0.8205, -1.4562,  ...,  1.4802,  0.2942,  1.3924],
        ...,
        [-0.3882, -0.1119, -1.0825,  ...,  0.0722, -0.3288,  2.0514],
        [-1.6867,  0.4126, -1.3478,  ..., -0.1019, -0.2058, -1.6541],
        [-0.5421,  0.2786,  0.4252,  ...,  0.4125, -0.0876, -0.2877]])


In [None]:
import torch.optim as optim

def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 right out of 10 gives 0.8, not 8).

    `preds` are raw logits: they are passed through a sigmoid and rounded to
    0/1 before being compared element-wise with the targets `y`.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = (hard_labels == y).float()  # float so the division below is not integer division
    return hits.sum() / len(hits)

def train(model, iterator, optimizer, criterion):
    """Run one training pass over `iterator`, updating the model after every batch.

    Each batch must expose `.text` as a (tokens, lengths) pair and `.label`
    as the targets.

    Returns:
        (mean loss per batch, mean accuracy per batch) for the pass.
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:
        # Gradients accumulate by default, so clear them for every batch.
        optimizer.zero_grad()

        tokens, lengths = batch.text
        logits = model(tokens, lengths).squeeze(1)

        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches


def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating any weights.

    Each batch must expose `.text` as a (tokens, lengths) pair and `.label`
    as the targets.

    Returns:
        (mean loss per batch, mean accuracy per batch) over the iterator.
    """
    model.eval()

    running_loss, running_acc = 0, 0

    # No gradients are needed when only scoring.
    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)

            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)

            running_loss += batch_loss.item()
            running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches


import time

def epoch_time(start_time, end_time):
    """Split the duration between two timestamps (in seconds) into whole (minutes, seconds)."""
    duration = end_time - start_time
    minutes = int(duration / 60)
    # Whatever is left after the whole minutes, truncated to whole seconds.
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

**Training**

In [None]:
# Move the model and the loss to the device first, then create the optimizer
# over the parameters as they will be trained.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters())

TARGET_VALID_ACC = 0.85  # stop once validation accuracy reaches this value
MAX_EPOCHS = 20          # safety cap so the loop ends even if the target is never reached

epoch = 1
valid_acc = 0
best_valid_loss = float('inf')

# The stopping rule uses the validation split only; the test split is left
# untouched until the final evaluation.
while valid_acc < TARGET_VALID_ACC and epoch <= MAX_EPOCHS:

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'birnnmodel.pt')

    print(f'Epoch: {epoch:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    epoch = epoch + 1

Epoch: 01 | Epoch Time: 1m 2s
	Train Loss: 0.370 | Train Acc: 83.56%
	 Val. Loss: 0.345 |  Val. Acc: 84.88%
Epoch: 02 | Epoch Time: 1m 2s
	Train Loss: 0.349 | Train Acc: 84.67%
	 Val. Loss: 0.339 |  Val. Acc: 85.26%


**d) 85% in test set**

In [None]:
# Restore the checkpoint with the lowest validation loss. map_location remaps
# the saved tensors onto the current device, so a checkpoint written on a GPU
# also loads on a CPU-only machine.
model.load_state_dict(torch.load('birnnmodel.pt', map_location=device))

# The test split is evaluated once, with the restored weights.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.340 | Test Acc: 85.31%
