# Training the BERT model

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.cuda.amp import autocast, GradScaler


def _read_lines(path):
    """Return the lines of *path* with trailing newline characters removed."""
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as file:
        return [line.rstrip('\n') for line in file]


def load_data(folder_path):
    """Load the labelled training tweets and the unlabelled test tweets.

    Reads ``train_neg_full.txt``, ``train_pos_full.txt`` and ``test_data.txt``
    from the ``twitter-datasets`` directory located under *folder_path*.

    Args:
        folder_path: Directory that contains the ``twitter-datasets`` folder.

    Returns:
        A ``(combined_df, test_df)`` tuple. ``combined_df`` contains every
        training tweet in shuffled order, with a ``Tweets`` column and a
        ``Sentiment`` column (-1 for lines from the negative file, 1 for lines
        from the positive file). ``test_df`` has a single ``Tweets`` column
        holding the text that follows the first comma of each test line.

    Raises:
        OSError: If one of the three data files cannot be opened.
        IndexError: If a line of the test file contains no comma.
    """
    # Join path components individually: a hard-coded "\\" separator only
    # resolves on Windows.
    data_dir = os.path.join(folder_path, "twitter-datasets")
    neg_path = os.path.join(data_dir, "train_neg_full.txt")
    pos_path = os.path.join(data_dir, "train_pos_full.txt")
    test_path = os.path.join(data_dir, "test_data.txt")

    lines_neg = _read_lines(neg_path)
    lines_pos = _read_lines(pos_path)
    # Each test line is "<prefix>,<tweet>"; split once so commas inside the
    # tweet itself are preserved.
    lines_test = [line.split(',', 1)[1] for line in _read_lines(test_path)]

    neg_df = pd.DataFrame({'Tweets': lines_neg, 'Sentiment': -1})
    pos_df = pd.DataFrame({'Tweets': lines_pos, 'Sentiment': 1})
    test_df = pd.DataFrame({'Tweets': lines_test})

    # Shuffle so that the two classes are interleaved.
    combined_df = (
        pd.concat([neg_df, pos_df], ignore_index=True)
        .sample(frac=1)
        .reset_index(drop=True)
    )
    return combined_df, test_df


def tokenize_tweets(tokenizer, tweet):
    """Encode one tweet into fixed-length model inputs.

    Args:
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in
            this file).
        tweet: The text to encode.

    Returns:
        An ``(input_ids, attention_mask)`` tuple of tensors with the leading
        batch dimension removed.
    """
    tokens = tokenizer.encode_plus(
        tweet,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        # Without truncation an over-long tweet yields more than max_length
        # tokens, and samples of unequal length cannot be stacked into a batch.
        truncation=True,
        return_tensors='pt',
    )
    # Drop only the batch dimension added by return_tensors='pt'.
    return tokens['input_ids'].squeeze(0), tokens['attention_mask'].squeeze(0)

class TweetsDataset(Dataset):
    """Map-style dataset that tokenizes a tweet each time it is indexed."""

    def __init__(self, tweets, sentiments, tokenizer):
        # Only references are stored; tokenization happens in __getitem__.
        self.tokenizer = tokenizer
        self.tweets = tweets
        self.sentiments = sentiments

    def __len__(self):
        """Return the number of tweets."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Return the encoded tweet at *idx* together with its 0/1 label."""
        text = self.tweets[idx]
        raw_label = self.sentiments[idx]
        ids, mask = tokenize_tweets(self.tokenizer, text)

        # A stored sentiment of 1 becomes class 1; anything else is class 0.
        label = 0
        if raw_label == 1:
            label = 1

        return {
            'input_ids': ids,
            'attention_mask': mask,
            'sentiment': label,
        }

def train_model(model, train_loader, optimizer, criterion, device):
    """Run one training pass over every batch of ``train_loader``.

    The loss is taken from the model's own output (``outputs.loss``), which is
    why ``criterion`` is accepted but never used.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels`` whose output exposes ``.loss``.
        train_loader: Iterable of batches keyed by ``input_ids``,
            ``attention_mask`` and ``sentiment``.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Unused.
        device: Device the batch tensors are moved to.
    """
    model.train()
    for batch in train_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['sentiment'].to(device)

        # Clear the previous batch's gradients before the new backward pass.
        optimizer.zero_grad()
        batch_loss = model(ids, attention_mask=mask, labels=labels).loss
        batch_loss.backward()
        optimizer.step()

def evaluate_model(model, test_loader, device):
    """Collect labels and argmax predictions for every batch of ``test_loader``.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask`` whose
            output exposes ``.logits``.
        test_loader: Iterable of batches keyed by ``input_ids``,
            ``attention_mask`` and ``sentiment``.
        device: Device the batch tensors are moved to.

    Returns:
        A ``(true_labels, predictions)`` tuple of lists, in batch order.
    """
    model.eval()
    gold, predicted = [], []

    # Gradients are not needed for inference.
    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['sentiment'].to(device)

            batch_logits = model(ids, attention_mask=mask).logits
            batch_preds = torch.argmax(batch_logits, dim=1)

            predicted.extend(batch_preds.cpu().numpy())
            gold.extend(labels.cpu().numpy())

    return gold, predicted


def predict_sentiment(model, tokenizer, tweet, device):
    """Predict the class index of a single tweet.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask`` whose
            output exposes ``.logits``.
        tokenizer: Object exposing ``encode_plus``.
        tweet: The text to classify.
        device: Device the encoded inputs are moved to.

    Returns:
        The index of the largest logit, as a Python ``int``.
    """
    model.eval()
    tokens = tokenizer.encode_plus(
        tweet,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        # Cap the input at max_length; without this an over-long tweet is
        # passed to the model at its full length.
        truncation=True,
        return_tensors='pt',
    )
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    # Gradients are not needed for inference.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    return torch.argmax(logits, dim=1).item()

def main():
    """Fine-tune ``bert-base-uncased`` on the tweet data, evaluate it, save it.

    Loads the labelled tweets, holds out 20% of them for evaluation, trains
    for a fixed number of epochs using autocast, gradient scaling and gradient
    accumulation, prints the held-out accuracy, and writes the model and
    tokenizer to ``models/saved_model`` under the parent of the current
    working directory.
    """
    # Resolve the device locally so main() does not depend on a module-level
    # global having been set by the caller.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data and outputs live under the parent of the current working directory.
    folder_path = os.path.dirname(os.getcwd())

    # Load pre-trained BERT model and tokenizer
    model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name).to(device)

    # Only the labelled tweets are used here; the unlabelled test tweets
    # returned alongside them are not needed for training.
    combined_df, _ = load_data(folder_path)

    # Hold out part of the labelled data for evaluation
    train_df, val_df = train_test_split(combined_df, test_size=0.2, random_state=2)

    # Create DataLoader for training set
    train_dataset = TweetsDataset(train_df['Tweets'].values, train_df['Sentiment'].values, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Create DataLoader for the held-out set
    val_dataset = TweetsDataset(val_df['Tweets'].values, val_df['Sentiment'].values, tokenizer)
    val_loader = DataLoader(val_dataset, batch_size=40, shuffle=False)

    # Training loop with optimizations
    scaler = GradScaler()

    epochs = 5
    accumulation_steps = 5  # Adjust this based on GPU memory

    # The loop below steps the optimizer once per full accumulation group and
    # once more for a trailing partial group, i.e. ceil(len / accumulation)
    # times per epoch. T_max must match that count or the cosine schedule
    # overruns its half-period and the learning rate starts rising again.
    steps_per_epoch = (len(train_loader) + accumulation_steps - 1) // accumulation_steps
    total_steps = max(1, steps_per_epoch * epochs)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, total_steps)

    last_batch_index = len(train_loader) - 1
    for epoch in range(epochs):
        model.train()
        print(epoch)
        for i, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            sentiment = batch['sentiment'].to(device)

            with autocast():
                outputs = model(input_ids, attention_mask=attention_mask, labels=sentiment)
                # Scale the loss so the accumulated gradient is an average
                # over the group rather than a sum.
                loss = outputs.loss / accumulation_steps

            scaler.scale(loss).backward()

            # Step after every full group, and also at the end of the epoch so
            # a trailing partial group is not carried into the next epoch.
            if (i + 1) % accumulation_steps == 0 or i == last_batch_index:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

    # Evaluation
    true_labels, predictions = evaluate_model(model, val_loader, device)

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {accuracy}")

    # Save under models/saved_model, the location the prediction code in this
    # file loads the model from.
    model_save_path = os.path.join(folder_path, 'models', 'saved_model')
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

if __name__ == "__main__":
    # Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
    # `device` is assigned at module level so other code in this file can
    # read it.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    main()

# Loading and evaluating the BERT model

In [None]:
# Load the fine-tuned model, predict the test tweets and write the submission.

# Resolve the device here so this cell does not depend on a variable left
# behind by the training cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

folder_path = os.path.dirname(os.getcwd())
combined_df, test_df = load_data(folder_path)

# Join path components individually: a hard-coded "\\" separator only
# resolves on Windows.
model_save_path = os.path.join(folder_path, 'models', 'saved_model')
loaded_model = BertForSequenceClassification.from_pretrained(model_save_path).to(device)
loaded_tokenizer = BertTokenizer.from_pretrained(model_save_path)

# Classify every test tweet (class index 0 or 1).
test_df['Predicted_Sentiment'] = test_df['Tweets'].apply(lambda tweet: predict_sentiment(loaded_model, loaded_tokenizer, tweet, device))

# Map class indices back to the -1 / 1 labels used in the data files.
sentiment_mapping = {0: -1, 1: 1}
test_df['Predicted_Sentiment'] = test_df['Predicted_Sentiment'].map(sentiment_mapping)
y_pred = test_df['Predicted_Sentiment'].values

# Ids are assigned by position, starting at 1.
ids = np.arange(1, len(y_pred) + 1)

# Create the output folder first; to_csv does not create missing directories.
predictions_dir = os.path.join(folder_path, "predictions")
os.makedirs(predictions_dir, exist_ok=True)
submission_path = os.path.join(predictions_dir, "BerSubmission.csv")
submission_df = pd.DataFrame({'Id': ids, 'Prediction': y_pred})
submission_df.to_csv(submission_path, index=False)