# RNN

In [None]:
import fastai
import tweepy
import torch
import re
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertTokenizerFast, GPT2Tokenizer
from sklearn.model_selection import train_test_split

# Loading the Dataset

In [None]:
# Column names for the CSV, which is read without a header row.
columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
dataset_path = 'encoded-training.1600000.processed.noemoticon.csv'

# Column 0 (the sentiment label) is read as str here; it is cast to int in
# the label-conversion step.
df = pd.read_csv(
    dataset_path,
    names=columns,
    header=None,
    dtype={0: str},
    encoding='utf-8',
    low_memory=False,
)
print(df.head())

# Preprocessing the Dataset

In [None]:
# Compiled once at import time because the function is applied to every tweet.
# The dot after "www" is escaped so that only a literal "www." prefix counts as
# a link; unescaped, it also swallowed ordinary words such as "wwwow".
_URL_PATTERN = re.compile(r"http\S+|www\.\S+")
_MARKER_PATTERN = re.compile(r"[@#]")


def preprocess_tweets(text):
    """Strip links and mention/hashtag markers from a tweet.

    Args:
        text: The raw tweet as a string.

    Returns:
        The tweet with every run of non-whitespace characters starting at
        "http" or "www." removed, and with all "@" and "#" characters dropped
        (the word following the marker is kept).
    """
    text = _URL_PATTERN.sub('', text)
    return _MARKER_PATTERN.sub('', text)

# Clean every tweet element-wise and store the result back in the same column.
df['text'] = df['text'].map(preprocess_tweets)

In [None]:
# Show the first five rows to eyeball the cleaned text.
print(df.head(n=5))

# Tokenization

In [None]:
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')

# Tokenizing the tweets
#df['text'] = df['text'].apply(word_tokenize)


# Converting Labels

In [None]:
# Convert sentiment labels, assuming 0 is negative and 4 is positive.
# The column was read as str, so cast it first; then remap 4 -> 1 so the
# labels are the 0/1 targets used with the binary loss. 0 already has its
# final value and needs no replacement.
df['sentiment'] = df['sentiment'].astype(int).replace({4: 1})

print(df['sentiment'].value_counts())
print(df['sentiment'].dtype)


# Splitting Data into Train, Test and Validation

In [None]:

# Splitting the dataset into training and test sets (80% train, 20% test)
# Hold out 20% of all rows as the test set.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Take 20% of the remaining rows as the validation set, which leaves roughly
# 64% train / 16% validation / 20% test of the full dataset.
train_df, val_df = train_test_split(train_df, random_state=42, test_size=0.2)


# Implementing Datasets and Dataloaders

In [None]:
from rnn_twitter_dataset import TwitterDataset
      
# Forwarded to the dataset as max_len (presumably the padded/truncated token
# length per tweet -- confirm in TwitterDataset).
MAX_LEN = 250
# Tokenizer loaded from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                                              

In [None]:
from torch.utils.data import DataLoader

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=8):
    """Wrap a dataframe of tweets in a TwitterDataset and return a DataLoader.

    Args:
        df: DataFrame providing a 'text' column and a 'sentiment' column.
        tokenizer: Forwarded unchanged to TwitterDataset.
        max_len: Forwarded unchanged to TwitterDataset as ``max_len``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples on every pass.
            Defaults to False, which keeps the previous behaviour.
        num_workers: Number of worker processes used for loading.

    Returns:
        A ``torch.utils.data.DataLoader`` over the constructed dataset.
    """
    ds = TwitterDataset(
        tweets=df['text'].to_numpy(),
        labels=df['sentiment'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )


BATCH_SIZE = 16000

# Only the training loader is shuffled, so each epoch sees the samples in a
# different order; validation and test keep a fixed order so their metrics
# are computed over identical batches every time.
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
val_data_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

# Creating the Model

In [None]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """Embedding -> stacked (optionally bidirectional) LSTM -> linear head.

    The final hidden state of the top LSTM layer is passed through dropout and
    a linear layer to produce one row of raw scores (logits) per input sample.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of scores produced per sample.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability, used between LSTM layers and before the
            linear head.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           # Inter-layer dropout is meaningless for a single
                           # layer and only triggers a warning there.
                           dropout=dropout if n_layers > 1 else 0.0,
                           batch_first=True)
        # The head consumes one final state per direction, so its input width
        # depends on whether the LSTM is bidirectional.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, lengths, attention_mask):
        """Compute logits for a padded batch of token ids.

        Args:
            text: Integer tensor of token ids, shape (batch, seq_len).
            lengths: 1-D integer tensor with the unpadded length of each row.
            attention_mask: Accepted for interface compatibility; not used.

        Returns:
            Tensor of shape (batch, output_dim), rows in the same order as the
            input batch.
        """
        embedded = self.embedding(text)

        # enforce_sorted=False lets PyTorch sort by length internally and
        # restore the original batch order in the returned hidden state, so
        # the logits stay aligned with the caller's labels.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # Last layer: forward-direction state followed by backward one.
            final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            # Last layer only; hidden[-2] would belong to the layer below.
            final_state = hidden[-1]

        # Keep the batch dimension even for a batch of one.
        return self.fc(self.dropout(final_state))

# Model hyperparameters (starting values; tune as needed).
VOCAB_SIZE = len(tokenizer)  # embedding table size follows the tokenizer
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 12, 8, 1
N_LAYERS, BIDIRECTIONAL, DROPOUT = 2, False, 0.5

model = SentimentRNN(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)


# Creating Training and Evaluating Functions

In [None]:
def train(model, data_loader, optimizer, criterion):
    """Run one optimisation pass over ``data_loader``.

    Each batch is a mapping with the keys 'input_ids', 'attention_mask',
    'length' and 'labels'. For every batch the gradients are reset, the model
    is evaluated, the loss is back-propagated and the optimizer takes a step.

    Returns:
        A ``(loss, accuracy)`` tuple, each being the mean of the per-batch
        values over the number of batches.
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in data_loader:
        optimizer.zero_grad()

        logits = model(
            batch['input_ids'], batch['length'], batch['attention_mask']
        ).squeeze(1)
        # The loss works on floating-point targets.
        targets = batch['labels'].float()

        loss = criterion(logits, targets)
        acc = binary_accuracy(logits, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    num_batches = len(data_loader)
    return running_loss / num_batches, running_acc / num_batches

def evaluate(model, data_loader, criterion):
    """Score the model on ``data_loader`` without updating any weights.

    The model is switched to eval mode and gradient tracking is disabled for
    the whole pass. Batches use the same keys as in training: 'input_ids',
    'attention_mask', 'length' and 'labels'.

    Returns:
        A ``(loss, accuracy)`` tuple, each being the mean of the per-batch
        values over the number of batches.
    """
    model.eval()

    running_loss, running_acc = 0, 0

    with torch.no_grad():
        for batch in data_loader:
            logits = model(
                batch['input_ids'], batch['length'], batch['attention_mask']
            ).squeeze(1)
            targets = batch['labels'].float()

            running_loss += criterion(logits, targets).item()
            running_acc += binary_accuracy(logits, targets).item()

    num_batches = len(data_loader)
    return running_loss / num_batches, running_acc / num_batches

def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the targets.

    ``preds`` holds raw scores (logits); they are passed through a sigmoid
    and rounded to 0 or 1 before being compared element-wise with ``y``.
    The result is a scalar tensor.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = (hard_labels == y).float()
    return hits.sum() / len(hits)


# Training Model

In [None]:
import torch.optim as optim

# A step size of 0.5 is orders of magnitude above what Adam is normally run
# with and typically prevents the loss from converging; 1e-3 is the
# optimizer's documented default and a sensible starting point.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Works on raw logits: the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()

# Training loop
num_epochs = 5  # Number of epochs

for epoch in range(num_epochs):
    # One optimisation pass over the training split, then a gradient-free
    # pass over the validation split to monitor generalisation.
    train_loss, train_acc = train(model, train_data_loader, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_data_loader, criterion)

    print(f'Epoch: {epoch+1}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')


# Final Test Accuracy

In [None]:
# Score the trained model on the held-out test split.
test_loss, test_acc = evaluate(
    model=model,
    data_loader=test_data_loader,
    criterion=criterion,
)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
