### 1.  FINBERT Finetuned

Fine-tuned on the labelled evaluation set.

In [18]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

# Read the labelled set and keep only the text column ('data') and the
# label column ('sentiment').
data = pd.read_csv('../data/eval_set_labelled.csv')[['data', 'sentiment']]

# String label -> class id. replace() leaves any value that is not a key of
# the mapping untouched, so labels that are already numeric pass through.
label_dict = {'negative': 0, 'neutral': 1, 'positive': 2}
data['sentiment'] = data['sentiment'].replace(label_dict)

# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train_data, val_data = train_test_split(
    data,
    test_size=0.1,
    random_state=42,
)


## Define Dataset Class

In [19]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per lookup.

    Args:
        data: Table with a 'data' column (text) and a 'sentiment' column
            (values convertible with ``int``). Both columns are copied into
            plain lists at construction time.
        tokenizer: Callable tokenizer; invoked as ``tokenizer(text, ...)``
            and expected to return a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_len: Length every sample is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = data['data'].tolist()
        self.labels = data['sentiment'].tolist()
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return it as a dict of tensors.

        Returns:
            dict with keys 'text' (the raw string), 'input_ids' and
            'attention_mask' (flattened to 1-D), and 'labels' (0-dim long
            tensor).
        """
        # str() guards against non-string cells (e.g. numbers) in the column.
        text = str(self.text[item])
        label = int(self.labels[item])

        # Call the tokenizer directly: encode_plus is the deprecated spelling
        # of the same single-text encoding.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'text': text,
            # return_tensors='pt' yields a leading batch axis; flatten drops
            # it so the DataLoader can stack samples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }


## Initialize Tokenizer and Model

In [20]:
# Fine-tuning hyperparameters
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

# Pretrained FinBERT-tone tokenizer and classifier; the classification head
# is sized to the number of entries in label_dict.
# NOTE(review): the checkpoint ships its own id-to-label order; confirm it
# matches label_dict (negative=0, neutral=1, positive=2), otherwise
# fine-tuning starts from a permuted head.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=len(label_dict),
)

# Wrap both splits as datasets, then build their loaders. Only the training
# loader shuffles.
train_dataset = SentimentDataset(train_data, tokenizer, MAX_LEN)
val_dataset = SentimentDataset(val_data, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)


## Setup Optimizer and Scheduler

In [21]:
# Use PyTorch's AdamW: the transformers.AdamW class is deprecated in favour of
# it. eps and weight_decay are passed explicitly to keep the values the
# transformers implementation used by default (torch's own defaults differ).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)

# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch; the learning rate decays linearly with no warm-up phase.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Module.to() moves the parameters in place, so the optimizer built above
# still refers to the same parameter objects.
model.to(device)




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Train

In [22]:
def train_epoch(model, data_loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.logits`` and ``.loss``.
        data_loader: Iterable of batches (dicts with 'input_ids',
            'attention_mask' and 'labels' tensors) that also exposes
            ``.dataset`` for the sample count.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 tensor (correct
        predictions divided by ``len(data_loader.dataset)``) and mean_loss is
        the average of the per-batch losses. Both are NaN when the loader
        yields no batches.
    """
    model = model.train()
    losses = []
    # A tensor accumulator (rather than the int 0) keeps .double() valid even
    # when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        # Clear gradients before the backward pass so anything left over from
        # earlier work (e.g. an interrupted run) cannot leak into this step.
        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        preds = outputs.logits.argmax(dim=1)
        correct_predictions += (preds == labels).sum()
        losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / len(data_loader.dataset), mean_loss


## Evaluation Function 

In [23]:
def eval_model(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.logits`` and ``.loss``.
        data_loader: Iterable of batches (dicts with 'input_ids',
            'attention_mask' and 'labels' tensors) that also exposes
            ``.dataset`` for the sample count.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 tensor (correct
        predictions divided by ``len(data_loader.dataset)``) and mean_loss is
        the average of the per-batch losses. Both are NaN when the loader
        yields no batches.
    """
    model = model.eval()
    losses = []
    # A tensor accumulator (rather than the int 0) keeps .double() valid even
    # when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            preds = outputs.logits.argmax(dim=1)

            correct_predictions += (preds == labels).sum()
            losses.append(outputs.loss.item())

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / len(data_loader.dataset), mean_loss



In [24]:
# Alternate one training pass and one validation pass per epoch, printing the
# loss and accuracy of each as soon as it finishes.
for epoch in range(EPOCHS):
    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
    )
    print(f'Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss}, Train Accuracy: {train_acc}')

    val_acc, val_loss = eval_model(
        model=model,
        data_loader=val_loader,
        device=device,
    )
    print(f'Epoch {epoch + 1}/{EPOCHS}, Validation Loss: {val_loss}, Validation Accuracy: {val_acc}')

Epoch 1/3, Train Loss: 1.5531390236135114, Train Accuracy: 0.5210643015521065
Epoch 1/3, Validation Loss: 0.9341669508389064, Validation Accuracy: 0.5841584158415841
Epoch 2/3, Train Loss: 0.7753509366721437, Train Accuracy: 0.6607538802660754
Epoch 2/3, Validation Loss: 0.9367065259388515, Validation Accuracy: 0.5841584158415841
Epoch 3/3, Train Loss: 0.6672359380805701, Train Accuracy: 0.7305986696230599
Epoch 3/3, Validation Loss: 0.9526035104479108, Validation Accuracy: 0.5445544554455446


In [39]:
# The model files and the tokenizer files are written to the same directory.
model_path = r'C:\Users\Neel\OneDrive\Desktop\University\Y4S2\IR\Information-Retrieval\classification'
tokenizer_path = r'C:\Users\Neel\OneDrive\Desktop\University\Y4S2\IR\Information-Retrieval\classification'

# Persist the fine-tuned model
model.save_pretrained(model_path)

# Persist the tokenizer; kept as the final expression so the cell displays the
# paths of the files it wrote.
tokenizer.save_pretrained(tokenizer_path)

('C:\\Users\\Neel\\OneDrive\\Desktop\\University\\Y4S2\\IR\\Information-Retrieval\\classification\\tokenizer_config.json',
 'C:\\Users\\Neel\\OneDrive\\Desktop\\University\\Y4S2\\IR\\Information-Retrieval\\classification\\special_tokens_map.json',
 'C:\\Users\\Neel\\OneDrive\\Desktop\\University\\Y4S2\\IR\\Information-Retrieval\\classification\\vocab.txt',
 'C:\\Users\\Neel\\OneDrive\\Desktop\\University\\Y4S2\\IR\\Information-Retrieval\\classification\\added_tokens.json')

### Evaluating the model on original data

In [40]:
from transformers import BertForSequenceClassification, BertTokenizer

# Restore the tokenizer from the directory it was saved to
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

# Restore the fine-tuned classifier from its saved directory
model = BertForSequenceClassification.from_pretrained(model_path)

In [42]:
import pandas as pd

# Load the new dataset
new_data_path = '../data/original_data.csv'
new_data = pd.read_csv(new_data_path)

# Report how many rows have no body text
print(new_data['body'].isnull().sum())

# Replace missing bodies with an empty string so every row is kept and the
# predictions stay aligned with new_data. (A dropna() after this fill would
# never remove anything, so it is not applied.)
new_data['body'] = new_data['body'].fillna('').astype(str)

# The text to classify is in the 'body' column
texts = new_data['body'].tolist()

3009


In [44]:
from torch.utils.data import DataLoader, TensorDataset

# Tokenize every text in one call. Truncate at MAX_LEN, the same cap used when
# fine-tuning, so long inputs are cut at the same point the model was trained
# on; padding=True pads only up to the longest sequence in `texts`.
new_encodings = tokenizer(
    texts,
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="pt",
)

# Pair input ids with their attention masks and serve them in order
new_dataset = TensorDataset(new_encodings['input_ids'], new_encodings['attention_mask'])
new_loader = DataLoader(new_dataset, batch_size=32)  # Adjust batch size if needed


In [45]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the reloaded model to that device and switch it to evaluation mode
model.to(device)
model.eval()

# Predict function
def predict(loader):
    """Return the predicted class id for every row yielded by ``loader``.

    Each batch is indexed positionally: element 0 is passed as ``input_ids``
    and element 1 as ``attention_mask``. Uses the module-level ``model`` and
    ``device``.

    Returns:
        list of ints, in the order the loader yields the rows.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in loader:
            ids = batch[0].to(device)
            mask = batch[1].to(device)
            logits = model(input_ids=ids, attention_mask=mask).logits
            # Index of the largest logit per row is the predicted class.
            batch_preds = torch.max(logits, dim=1).indices
            predictions += batch_preds.tolist()

    return predictions

# Run inference over every batch of the new data
sentiment_predictions = predict(new_loader)

# Translate class ids back into sentiment names
sentiment_labels = {0: 'negative', 1: 'neutral', 2: 'positive'}
predicted_labels = list(map(sentiment_labels.__getitem__, sentiment_predictions))

# Store the predicted name next to each row
new_data['predicted_sentiment'] = predicted_labels


In [None]:
# Write the frame to a new CSV, leaving out the index column
new_data.to_csv(
    'predicted_sentiments_original.csv',
    index=False,
)


NameError: name 'predicted_sentiments' is not defined