In [None]:
import torch
import os
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, classification_report, precision_score

In [None]:
# Load the labelled comments and normalise the columns used downstream.
#
# Rows without a sentiment score or without a `why_sentiment` value are dropped
# *before* any dtype conversion, so a missing value in a row that is discarded
# anyway cannot make the integer casts fail.
df = (
    pd.read_excel('/content/drive/MyDrive/Master/sentiment1.xlsx')
    .dropna(subset=['sentiment', 'why_sentiment'])
    .astype({
        # Text columns: astype(str) renders any remaining missing cell as the
        # literal string 'nan' instead of a float NaN, which keeps the string
        # concatenation below from producing NaN for the whole row.
        'textOriginal': str,
        'title': str,
        'why_sentiment': str,
        'channel_name': str,
        'video_or_channel': int,
    })
)

# Binarise the label: strictly positive scores become 1, everything else 0.
df['sentiment'] = (df['sentiment'] > 0).astype(int)

# First version of the model input. A later cell reassigns `df['comment']`
# using the SPECIAL_TOKENS mapping (with the fields in a different order).
df['comment'] = "[CHANNEL]" + df['channel_name'] + "[WHY]" + df['why_sentiment'] + "[COMMENT]" + df['textOriginal'] + "[TITLE]" + df['title']

In [None]:
# Marker strings placed in front of each field of the concatenated model input.
# Insertion order matters: it is the order in which the markers are handed to
# the tokenizer below.
SPECIAL_TOKENS = dict(
    channel='[CHANNEL]',
    comment='[COMMENT]',
    title='[TITLE]',
    why='[WHY]',
)

tokenizer = RobertaTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")

# Register the markers as additional special tokens of the tokenizer.
tokenizer.add_special_tokens({'additional_special_tokens': [*SPECIAL_TOKENS.values()]})

In [None]:
# Build the model input: every field is prefixed with its marker from
# SPECIAL_TOKENS, in the order channel -> comment -> title -> why.
#
# `channel_name` is cast to str here so that a missing or non-string channel
# name yields the text 'nan' rather than turning the whole concatenation into
# NaN (which the tokenizer cannot encode).
#
# NOTE(review): `why_sentiment` is part of the input text. If that column
# explains the label, it leaks the target into the features — confirm it is
# also available at inference time.
df['comment'] = (
    SPECIAL_TOKENS['channel'] + df['channel_name'].astype(str)
    + SPECIAL_TOKENS['comment'] + df['textOriginal']
    + SPECIAL_TOKENS['title'] + df['title']
    + SPECIAL_TOKENS['why'] + df['why_sentiment']
)

# Passed as `max_length` to the tokenizer by TextDataset.
MAX_SEQ_LEN = 512
# Number of examples per DataLoader batch.
BATCH_SIZE = 16

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        dataframe: Frame with a ``comment`` column (text to encode) and a
            ``sentiment`` column (integer class label). Rows are addressed by
            position, so the frame's index labels are irrelevant.
        tokenizer: Callable tokenizer; it is invoked as ``tokenizer(text, ...)``
            and must return a mapping with ``input_ids`` and ``attention_mask``.
        max_seq_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_seq_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer's
            tensors flattened to 1-D) and ``label`` (a ``torch.long`` scalar).
        """
        # Select the column first, then the position: this avoids building a
        # full object-dtype row Series twice for every item.
        text = self.data['comment'].iloc[index]
        label = self.data['sentiment'].iloc[index]

        # Calling the tokenizer directly replaces the deprecated `encode_plus`;
        # the arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_seq_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }

In [None]:
# Fixed seed so the train/validation/test membership — and therefore the
# reported test metrics — are the same on every Restart & Run All.
SEED = 42

# 70 % train, then the remaining 30 % is halved into validation and test.
# Both splits are stratified on the binary label.
train_df, temp_df = train_test_split(
    df, test_size=0.3, stratify=df.sentiment, random_state=SEED
)
valid_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df.sentiment, random_state=SEED
)

train_dataset = TextDataset(train_df, tokenizer, MAX_SEQ_LEN)
valid_dataset = TextDataset(valid_df, tokenizer, MAX_SEQ_LEN)
test_dataset = TextDataset(test_df, tokenizer, MAX_SEQ_LEN)

# Only the training loader shuffles; its own seeded generator makes the batch
# order repeatable without touching the global torch RNG.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = RobertaForSequenceClassification.from_pretrained('siebert/sentiment-roberta-large-english', num_labels=2)
# The tokenizer was extended with extra special tokens, so the embedding
# matrix has to be resized to the new vocabulary size before training.
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# weight_decay=0.0 is passed explicitly because torch's default (0.01) differs
# from the transformers implementation's default of no weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

# The training loop reads `outputs.loss` from the model, so `criterion` is not
# used there. It is kept for manual loss computation and is a cross-entropy
# loss because the head emits two logits and the labels are integer class ids
# (BCEWithLogitsLoss would reject that shape/dtype combination).
criterion = torch.nn.CrossEntropyLoss()

# NOTE(review): this is set after the model has already been moved to the
# device; CUDA presumably reads the variable at initialisation, so it may have
# no effect here — confirm, then either move it to the top of the notebook
# (debugging) or remove it (it slows training when active).
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
# Number of full passes over the training data.
EPOCHS = 4


def _batch_loss(batch):
    """Move one batch to `device`, run the model with labels and return its loss."""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    return model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss


for epoch in range(EPOCHS):
    # ---- training pass: one optimizer step per batch ----
    model.train()
    train_loss_sum = 0
    for batch in train_loader:
        optimizer.zero_grad()
        loss = _batch_loss(batch)
        train_loss_sum += loss.item()
        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses.
    avg_train_loss = train_loss_sum / len(train_loader)
    print(f'Epoch {epoch+1}/{EPOCHS} - Training loss: {avg_train_loss}')

    # ---- validation pass: forward only, no gradients ----
    model.eval()
    with torch.no_grad():
        val_loss_sum = sum(_batch_loss(batch).item() for batch in valid_loader)

    avg_val_loss = val_loss_sum / len(valid_loader)
    print(f'Epoch {epoch+1}/{EPOCHS} - Validation loss: {avg_val_loss}')

print("Training complete!")

In [None]:
# Score the held-out test set: collect the arg-max class for every example,
# then report accuracy, recall, F1, the confusion matrix and a per-class report.
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        labels = batch['label'].to(device)
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        # Predicted class = index of the larger of the two logits.
        preds = logits.argmax(dim=1).flatten()

        predictions += list(preds.cpu().numpy())
        true_labels += list(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
cm = confusion_matrix(true_labels, predictions)

print(f'Accuracy: {accuracy}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print('Confusion Matrix:', cm, sep='\n')

report = classification_report(true_labels, predictions, target_names=['Class 0', 'Class 1'])
print(report)

In [None]:
# Persist the fine-tuned model together with its tokenizer. The tokenizer was
# extended with extra special tokens and the model's embedding matrix was
# resized to match, so the two must be saved (and later reloaded) as a pair —
# a stock tokenizer would not produce ids for the added tokens.
MODEL_DIR = '/content/drive/MyDrive/Master/model_SiEBERT'
tokenizer.save_pretrained(MODEL_DIR)
model.save_pretrained(MODEL_DIR)