# BERT Transformer

In [1]:
import fastai
import tweepy
import torch
import re
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertTokenizerFast, GPT2Tokenizer, DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from torch.profiler import profile, record_function, ProfilerActivity


# Loading Dataset

In [None]:
import pandas as pd

# Trimming configuration: only the middle slice of the source CSV is kept.
n = 799000  # rows discarded from EACH end of the file
csv_path = 'encoded-training.1600000.processed.noemoticon.csv'  # input CSV
output_path = 'reduced-encoded-training.1600000.processed.noemoticon.csv'  # trimmed output CSV

# Load the complete dataset into memory.
# NOTE(review): no header=None is passed here, so the first line of the file is
# consumed as column names, while the later load of the reduced file uses
# header=None -- confirm whether the source CSV really has a header row.
df = pd.read_csv(csv_path)

# Refuse to trim when cutting n rows off both ends would leave nothing behind.
if not len(df) > 2 * n:
    raise ValueError("The DataFrame is too small to remove that many rows.")

# Cut n rows from the head and n from the tail, put the remaining rows in
# random order (frac=1 samples every row) and renumber the index from zero.
df_modified = (
    df.iloc[n:-n]
    .sample(frac=1)
    .reset_index(drop=True)
)

# Persist the reduced dataset without writing the index column.
df_modified.to_csv(output_path, index=False)


In [None]:
# Load the reduced CSV, naming the six columns explicitly because the file is
# read with header=None (every line is treated as data).
dataset_path = 'reduced-encoded-training.1600000.processed.noemoticon.csv'
columns = ['sentiment','id','date','query','user','text']
df = pd.read_csv(
    dataset_path,
    names=columns,
    header=None,
    dtype={0: str},  # read the first column as strings; it is cast to int later
    encoding='utf-8',
    low_memory=False,
)
# Quick visual check of the first rows.
print(df.head())

# Preprocessing the Dataset

In [None]:
def preprocess_tweets(text):
    """Strip URLs and the '@' / '#' marker characters from a tweet.

    Args:
        text: Raw tweet text.

    Returns:
        The text with every ``http...`` / ``www.`` URL removed and every
        ``@`` and ``#`` character deleted. The mention and hashtag words
        themselves are kept; only the marker characters are dropped.
    """
    # The dot after "www" must be escaped: an unescaped "." also matches a
    # space, so "awwww so cute" used to lose "www so" as if it were a URL.
    # "http\S+" already covers "https...", so no separate alternative is needed.
    text = re.sub(r"http\S+|www\.\S+", '', text)
    text = re.sub(r'[@#]', '', text)
    return text

# Clean every tweet with preprocess_tweets and write the result back into the
# 'text' column, then preview the first rows.
cleaned_text = df['text'].apply(preprocess_tweets)
df['text'] = cleaned_text
print(df.head(5))

In [None]:
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')

# Tokenizing the tweets
#df['text'] = df['text'].apply(word_tokenize)


In [None]:
# Convert the raw sentiment codes to binary labels, assuming 0 is negative and
# 4 is positive: cast the column to int, then map 4 -> 1. Zero is already the
# desired negative label, so no second replacement is needed.
# NOTE(review): any value other than 4 is left unchanged -- confirm the column
# only ever contains 0 and 4.
df['sentiment'] = df['sentiment'].astype(int).replace(4, 1)

# Sanity check: show the class balance and the resulting dtype.
print(df['sentiment'].value_counts())
print(df['sentiment'].dtype)


# Splitting Data into Train, Test and Validation

In [None]:

# Both splits use the same settings: hold out 20% and fix the seed so the
# partition is reproducible.
split_options = dict(test_size=0.2, random_state=42)

# First cut: 80% for training, 20% held out as the test set.
train_df, test_df = train_test_split(df, **split_options)

# Second cut: 20% of the remaining training rows become the validation set,
# giving roughly 64% train / 16% validation / 20% test overall.
train_df, val_df = train_test_split(train_df, **split_options)


# Implementing Datasets and Dataloaders

In [None]:
from bert_twitter_dataset import SentimentDataset
      
# Value handed to the dataset as `max_len` when the loaders are built
# (presumably the maximum token sequence length -- confirm in SentimentDataset).
MAX_LEN = 100
# Tokenizer for the 'distilbert-base-uncased' checkpoint.
# NOTE(review): it is not passed to SentimentDataset anywhere in this file.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

#dataset = TwitterDataset(df['text'].to_numpy(), df['sentiment'].to_numpy(), tokenizer, MAX_LEN)
        
                                              

In [None]:
from torch.utils.data import DataLoader

def create_data_loader(df, max_len, batch_size, num_workers=8, shuffle=False):
    """Wrap a DataFrame of tweets in a ``SentimentDataset`` and a ``DataLoader``.

    Args:
        df: DataFrame providing a 'text' column and a 'sentiment' column.
        max_len: Forwarded unchanged to ``SentimentDataset`` as ``max_len``.
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes. Defaults to 8, the
            value that used to be hard-coded; pass 0 to load in the main
            process (useful when debugging or on machines with few cores).
        shuffle: Whether the loader reshuffles the data every epoch. Defaults
            to False, which matches the previous behaviour.

    Returns:
        A ``DataLoader`` over the constructed dataset.
    """
    # No tokenizer is passed in; presumably SentimentDataset builds its own --
    # verify against bert_twitter_dataset.
    dataset = SentimentDataset(
        texts=df['text'].to_numpy(),
        labels=df['sentiment'].to_numpy(),
        max_len=max_len,
    )

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

BATCH_SIZE = 900  # samples per batch for every split

# Build one loader per split (train, validation, test, in that order) with the
# same sequence-length and batch-size settings.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split_df, MAX_LEN, BATCH_SIZE)
    for split_df in (train_df, val_df, test_df)
)

# Creating the Model

In [None]:
from transformers import BertModel
import torch
import torch.nn as nn

class BertForSentimentAnalysis(nn.Module):
    """DistilBERT encoder followed by a small binary-classification head.

    The head maps the encoder's hidden size to 8 units, applies batch
    normalisation, a LeakyReLU with negative slope 0.5 and dropout (p=0.5),
    and projects down to a single output unit.
    """

    def __init__(self, freeze_bert=True):
        """Load the pretrained encoder and build the classification head.

        Args:
            freeze_bert: When True (default), ``requires_grad`` is switched
                off for every encoder parameter so only the head can be
                trained.
        """
        super().__init__()
        # Pretrained 'distilbert-base-uncased' encoder.
        self.bert_layer = DistilBertModel.from_pretrained('distilbert-base-uncased')

        # Optionally freeze the encoder weights.
        if freeze_bert:
            for param in self.bert_layer.parameters():
                param.requires_grad = False

        # Classification head; the input width is read from the encoder config
        # instead of being hard-coded.
        self.classifier = nn.Sequential(
            nn.Linear(self.bert_layer.config.hidden_size, 8),
            nn.BatchNorm1d(8),  # Batch Normalization
            nn.LeakyReLU(0.5),
            nn.Dropout(0.5),
            nn.Linear(8, 1),
        )

    def forward(self, input_ids, attention_mask):
        """Run the encoder and classify from the first token's hidden state.

        Args:
            input_ids: Token ids, forwarded to the encoder as ``input_ids``.
            attention_mask: Mask forwarded to the encoder as ``attention_mask``.

        Returns:
            The head's output for each sample: one raw logit per row (the
            final layer has a single unit and no sigmoid is applied here).
        """
        # Pass inputs through the encoder.
        outputs = self.bert_layer(input_ids=input_ids, attention_mask=attention_mask)

        # Keep only position 0 along the sequence axis of the last hidden state.
        first_token_state = outputs.last_hidden_state[:, 0, :]

        # Apply the classification layers.
        logits = self.classifier(first_token_state)
        return logits


In [None]:
model = BertForSentimentAnalysis()
from transformers import BertTokenizer

# Load the tokenizer that belongs to the 'distilbert-base-uncased' checkpoint.
# This must be a tokenizer class: DistilBertModel.from_pretrained would bind a
# second copy of the network to the name `tokenizer` instead.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Tokenize your data here...



# Creating Training and Evaluating Functions

In [None]:
accumulation_steps = 4  # Example: Accumulate gradients over 4 forward passes
def train(model, data_loader, criterion, optimizer, accumulation_steps=4):
    """Train ``model`` for one pass over ``data_loader`` with gradient accumulation.

    Args:
        model: Callable as ``model(input_ids, attention_mask)``; its output is
            squeezed along dim 1 (assumes it returns (batch, 1) logits --
            confirm for other models).
        data_loader: Sized iterable of batches, each a mapping with the keys
            'input_ids', 'attention_mask' and 'labels'.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer that is stepped once per accumulation group.
        accumulation_steps: Number of batches whose gradients are summed
            before each optimizer step. Must be at least 1.

    Returns:
        Tuple ``(mean loss per batch, mean accuracy per batch)``.

    Raises:
        ValueError: If ``accumulation_steps`` is smaller than 1.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be at least 1.")

    model.train()
    total_loss = 0
    epoch_acc = 0
    steps_accumulated = 0

    # Start from clean gradients so nothing left over from earlier work is
    # mixed into the first update of this pass.
    optimizer.zero_grad()

    # NOTE(review): the whole pass runs under the CPU profiler and its summary
    # is printed below; this looks like debugging instrumentation -- confirm
    # it should stay enabled for real training runs.
    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:

        for batch in data_loader:
            # Batch tensors are used as delivered by the loader; nothing is
            # moved to another device here.
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels'].float()

            # Squeeze only dim 1 so the batch dimension survives even for a
            # single-sample batch (a further squeeze would yield a 0-d tensor
            # that no longer matches the labels' shape).
            outputs = model(input_ids, attention_mask).squeeze(1)

            # Scale the loss so the accumulated gradient is an average over
            # the group; undo the scaling for the reported total.
            loss = criterion(outputs, labels) / accumulation_steps
            total_loss += loss.item() * accumulation_steps

            # Backward pass: gradients add up until the next optimizer step.
            loss.backward()
            steps_accumulated += 1

            if steps_accumulated == accumulation_steps:
                optimizer.step()
                optimizer.zero_grad()
                steps_accumulated = 0

            acc = binary_accuracy(outputs, labels)
            epoch_acc += acc.item()

        # Apply the gradients of a trailing, incomplete accumulation group.
        # Without this they would never be used for an update and would leak
        # into the first step of the next call.
        if steps_accumulated > 0:
            optimizer.step()
            optimizer.zero_grad()

    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

    return total_loss / len(data_loader), epoch_acc / len(data_loader)

def evaluate(model, data_loader, criterion):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Callable as ``model(input_ids, attention_mask)``; its output is
            squeezed along dim 1 (assumes it returns (batch, 1) logits --
            confirm for other models).
        data_loader: Sized iterable of batches, each a mapping with the keys
            'input_ids', 'attention_mask' and 'labels'.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        Tuple ``(mean loss per batch, mean accuracy per batch)``.
    """
    model.eval()
    total_loss = 0
    epoch_acc = 0

    with torch.no_grad():
        for batch in data_loader:
            # Batch tensors are used as delivered by the loader; nothing is
            # moved to another device here.
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels'].float()

            # Squeeze only dim 1 so the batch dimension survives even when
            # the final batch holds a single sample (a further squeeze would
            # yield a 0-d tensor that no longer matches the labels' shape).
            outputs = model(input_ids, attention_mask).squeeze(1)

            # Accumulate the per-batch loss and accuracy.
            loss = criterion(outputs, labels)
            acc = binary_accuracy(outputs, labels)
            total_loss += loss.item()
            epoch_acc += acc.item()

    return total_loss / len(data_loader), epoch_acc / len(data_loader)

def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the targets.

    Args:
        preds: Raw logits; they are passed through a sigmoid and rounded to
            obtain 0/1 predictions.
        y: Target tensor compared element-wise against the rounded
            predictions.

    Returns:
        A tensor holding the number of matches divided by ``len`` of the
        comparison result.
    """
    probabilities = torch.sigmoid(preds)
    hits = torch.eq(torch.round(probabilities), y).float()
    return hits.sum() / len(hits)





# Training Model

In [None]:
# Adam updates only the classification head's parameters.
# NOTE(review): lr=0.5 is far above Adam's documented default of 1e-3 --
# confirm this value is intentional.
optimizer = torch.optim.Adam(params=model.classifier.parameters(), lr=0.5)
# Binary cross-entropy computed directly on raw logits (sigmoid is applied
# inside the loss).
criterion = nn.BCEWithLogitsLoss()


In [None]:
num_epochs = 2  # how many full passes over the training loader to run

for epoch in range(num_epochs):
    print(epoch)
    # One training pass followed by one validation pass.
    train_metrics = train(
        model, train_data_loader, criterion, optimizer, accumulation_steps
    )
    val_metrics = evaluate(model, val_data_loader, criterion)
    train_loss, train_acc = train_metrics
    val_loss, val_acc = val_metrics
    # Report loss and accuracy (as a percentage) for both splits.
    print(f'Epoch {epoch + 1}/{num_epochs} - Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}, Val Loss: {val_loss:.3f}|  Val. Acc: {val_acc*100:.2f}')

# Final Test Accuracy

In [None]:
# Score the trained model once on the held-out test loader and report the
# loss together with the accuracy as a percentage.
test_metrics = evaluate(model, test_data_loader, criterion)
test_loss, test_acc = test_metrics
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
