In [3]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("mwong/fever-evidence-related")
train_dataset = dataset['train']
train_dataset
df = pd.DataFrame(train_dataset)
df = df.drop(["input_ids", "attention_mask"], axis = 1)
df = df.rename(columns={'labels': 'label', 'claim': 'Headline', 'evidence': 'articleBody'})
df = df.head(30000)
df['label'].value_counts()

label
0    21257
1     8743
Name: count, dtype: int64

In [11]:
import pandas as pd
from sklearn.utils import resample

# Assuming your dataframe is named df and the class label column is named 'label'

# Separate the classes
df_class_0 = df[df.label == 0]
df_class_1 = df[df.label == 1]

# Resample each class to desired number of samples
df_class_0_resampled = resample(df_class_0, 
                                replace=True,     # sample with replacement
                                n_samples=5000,   # to match majority class
                                random_state=123) # reproducible results

df_class_1_resampled = resample(df_class_1, 
                                replace=True,     # sample with replacement
                                n_samples=5000,   # to match majority class
                                random_state=123) # reproducible results

# Combine the resampled classes back into one DataFrame
balanced_df = pd.concat([df_class_0_resampled, df_class_1_resampled])

# Shuffle the dataset (optional but recommended)
df = balanced_df.sample(frac=1, random_state=123).reset_index(drop=True)

# Now balanced_df is your balanced dataset
df.head()
df['label'].value_counts()

label
0    5000
1    5000
Name: count, dtype: int64

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

In [13]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
class StanceDataset(Dataset):
    def __init__(self, headlines, bodies, labels, tokenizer, max_len):
        self.headlines = headlines
        self.bodies = bodies
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, item):
        headline = str(self.headlines[item])
        body = str(self.bodies[item])
        label = self.labels[item]

        # Combine headline and body for tokenization
        combined_text = headline + " " + body

        encoding = self.tokenizer.encode_plus(
            combined_text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [15]:
# Split into train and validation sets
train_df, temp_df = train_test_split(df, test_size=0.2)  # Adjust the test_size as needed
val_df, test_df = train_test_split(temp_df, test_size=0.5)  # Split the temp_df equally into validation and test

print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

# Create training dataset
train_dataset = StanceDataset(
    headlines = train_df.Headline.to_numpy(),
    bodies = train_df.articleBody.to_numpy(),
    labels = train_df.label.to_numpy(),
    tokenizer = tokenizer,
    max_len = 350
)

# Create validation dataset
val_dataset = StanceDataset(
    headlines = val_df.Headline.to_numpy(),
    bodies = val_df.articleBody.to_numpy(),
    labels = val_df.label.to_numpy(),
    tokenizer = tokenizer,
    max_len = 350
)

# Create test dataset
test_dataset = StanceDataset(
    headlines = test_df.Headline.to_numpy(),
    bodies = test_df.articleBody.to_numpy(),
    labels = test_df.label.to_numpy(),
    tokenizer = tokenizer,
    max_len = 350
)


(8000, 3)
(1000, 3)
(1000, 3)


In [16]:
batch_size = 16

train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=batch_size)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size)

In [17]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 16

for epoch in range(epochs):
    # Training
    model.train()
    total_loss = 0  # Initialize total loss for the epoch
    train_loop = tqdm(train_data_loader, desc=f"Epoch {epoch+1}/{epochs} [Training]")

    for batch in train_loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()  # Accumulate the loss
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_data_loader)  # Calculate average loss
    print(f"Epoch {epoch+1}/{epochs} - Average Training Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    total_eval_loss = 0

    # Initialize lists to store predictions and true labels
    all_predictions = []
    all_true_labels = []

    val_loop = tqdm(val_data_loader, desc=f"Epoch {epoch+1}/{epochs} [Validation]", leave=False)
    for batch in val_loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        total_eval_loss += loss.item()

        logits = outputs.logits
        # Convert logits to probabilities and then to predicted labels
        predictions = torch.argmax(logits, dim=-1)
        all_predictions.extend(predictions.detach().cpu().numpy())
        all_true_labels.extend(labels.detach().cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_true_labels, all_predictions)
    precision = precision_score(all_true_labels, all_predictions)  # use average='binary' for binary classification
    recall = recall_score(all_true_labels, all_predictions)  # use average='binary' for binary classification

    # Print metrics
    print(f"Epoch {epoch+1}/{epochs} - Accuracy: {accuracy:.4f}")
    print(f"Epoch {epoch+1}/{epochs} - Precision: {precision:.4f}")
    print(f"Epoch {epoch+1}/{epochs} - Recall: {recall:.4f}")



Epoch 1/16 [Training]: 100%|██████████| 500/500 [06:11<00:00,  1.34it/s]


Epoch 1/16 - Average Training Loss: 0.1011


                                                                        

Epoch 1/16 - Accuracy: 0.9960
Epoch 1/16 - Precision: 0.9959
Epoch 1/16 - Recall: 0.9959


Epoch 2/16 [Training]: 100%|██████████| 500/500 [06:11<00:00,  1.35it/s]


Epoch 2/16 - Average Training Loss: 0.0174


                                                                        

Epoch 2/16 - Accuracy: 0.9970
Epoch 2/16 - Precision: 0.9959
Epoch 2/16 - Recall: 0.9979


Epoch 3/16 [Training]: 100%|██████████| 500/500 [06:11<00:00,  1.35it/s]


Epoch 3/16 - Average Training Loss: 0.0091


                                                                        

Epoch 3/16 - Accuracy: 0.9960
Epoch 3/16 - Precision: 0.9979
Epoch 3/16 - Recall: 0.9938


Epoch 4/16 [Training]: 100%|██████████| 500/500 [06:12<00:00,  1.34it/s]


Epoch 4/16 - Average Training Loss: 0.0078


                                                                        

Epoch 4/16 - Accuracy: 0.9980
Epoch 4/16 - Precision: 0.9979
Epoch 4/16 - Recall: 0.9979


Epoch 5/16 [Training]: 100%|██████████| 500/500 [06:11<00:00,  1.34it/s]


Epoch 5/16 - Average Training Loss: 0.0047


                                                                        

Epoch 5/16 - Accuracy: 0.9970
Epoch 5/16 - Precision: 0.9939
Epoch 5/16 - Recall: 1.0000


Epoch 6/16 [Training]: 100%|██████████| 500/500 [06:12<00:00,  1.34it/s]


Epoch 6/16 - Average Training Loss: 0.0070


                                                                        

Epoch 6/16 - Accuracy: 0.9970
Epoch 6/16 - Precision: 0.9959
Epoch 6/16 - Recall: 0.9979


Epoch 7/16 [Training]: 100%|██████████| 500/500 [06:11<00:00,  1.34it/s]


Epoch 7/16 - Average Training Loss: 0.0023


                                                                        

Epoch 7/16 - Accuracy: 0.9850
Epoch 7/16 - Precision: 0.9701
Epoch 7/16 - Recall: 1.0000


Epoch 8/16 [Training]: 100%|██████████| 500/500 [06:11<00:00,  1.34it/s]


Epoch 8/16 - Average Training Loss: 0.0084


                                                                        

Epoch 8/16 - Accuracy: 0.9990
Epoch 8/16 - Precision: 0.9980
Epoch 8/16 - Recall: 1.0000


Epoch 9/16 [Training]: 100%|██████████| 500/500 [06:12<00:00,  1.34it/s]


Epoch 9/16 - Average Training Loss: 0.0001


                                                                        

Epoch 9/16 - Accuracy: 0.9980
Epoch 9/16 - Precision: 0.9959
Epoch 9/16 - Recall: 1.0000


Epoch 10/16 [Training]: 100%|██████████| 500/500 [06:13<00:00,  1.34it/s]


Epoch 10/16 - Average Training Loss: 0.0087


                                                                         

Epoch 10/16 - Accuracy: 0.9990
Epoch 10/16 - Precision: 0.9980
Epoch 10/16 - Recall: 1.0000


Epoch 11/16 [Training]: 100%|██████████| 500/500 [06:11<00:00,  1.34it/s]


Epoch 11/16 - Average Training Loss: 0.0007


                                                                         

Epoch 11/16 - Accuracy: 0.9990
Epoch 11/16 - Precision: 0.9980
Epoch 11/16 - Recall: 1.0000


Epoch 12/16 [Training]: 100%|██████████| 500/500 [06:11<00:00,  1.35it/s]


Epoch 12/16 - Average Training Loss: 0.0085


                                                                         

Epoch 12/16 - Accuracy: 0.9990
Epoch 12/16 - Precision: 0.9980
Epoch 12/16 - Recall: 1.0000


Epoch 13/16 [Training]: 100%|██████████| 500/500 [06:11<00:00,  1.34it/s]


Epoch 13/16 - Average Training Loss: 0.0021


                                                                         

Epoch 13/16 - Accuracy: 0.9910
Epoch 13/16 - Precision: 0.9979
Epoch 13/16 - Recall: 0.9836


Epoch 14/16 [Training]: 100%|██████████| 500/500 [06:11<00:00,  1.35it/s]


Epoch 14/16 - Average Training Loss: 0.0143


                                                                         

Epoch 14/16 - Accuracy: 0.9970
Epoch 14/16 - Precision: 0.9959
Epoch 14/16 - Recall: 0.9979


Epoch 15/16 [Training]: 100%|██████████| 500/500 [06:11<00:00,  1.34it/s]


Epoch 15/16 - Average Training Loss: 0.0060


                                                                         

Epoch 15/16 - Accuracy: 0.9980
Epoch 15/16 - Precision: 0.9959
Epoch 15/16 - Recall: 1.0000


Epoch 16/16 [Training]: 100%|██████████| 500/500 [04:57<00:00,  1.68it/s]


Epoch 16/16 - Average Training Loss: 0.0036


                                                                         

Epoch 16/16 - Accuracy: 0.9990
Epoch 16/16 - Precision: 1.0000
Epoch 16/16 - Recall: 0.9979




In [18]:
model_path = "/home/george.ibrahim/Downloads/AI701/Project/RoBERTa_Fever_7(Balanced)"

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


('/home/george.ibrahim/Downloads/AI701/Project/RoBERTa_Fever_7(Balanced)/tokenizer_config.json',
 '/home/george.ibrahim/Downloads/AI701/Project/RoBERTa_Fever_7(Balanced)/special_tokens_map.json',
 '/home/george.ibrahim/Downloads/AI701/Project/RoBERTa_Fever_7(Balanced)/vocab.json',
 '/home/george.ibrahim/Downloads/AI701/Project/RoBERTa_Fever_7(Balanced)/merges.txt',
 '/home/george.ibrahim/Downloads/AI701/Project/RoBERTa_Fever_7(Balanced)/added_tokens.json')

In [19]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Path to your saved model
model_path = "/home/george.ibrahim/Downloads/AI701/Project/RoBERTa_Fever_7(Balanced)"

# Load the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)


In [20]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

import numpy as np

# Initialize lists to store all predictions and labels
all_predictions = []
all_true_labels = []

# Evaluation
model.eval()

for batch in test_data_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    # Flatten the outputs and labels and store them
    all_predictions.extend(np.argmax(logits, axis=1).flatten())
    all_true_labels.extend(label_ids.flatten())

# Compute metrics
accuracy = accuracy_score(all_true_labels, all_predictions)
precision = precision_score(all_true_labels, all_predictions)
recall = recall_score(all_true_labels, all_predictions)
f1_score = f1_score(all_true_labels, all_predictions)

# Print metrics
print("Test Accuracy: {0:.2f}".format(accuracy))
print("Test Precision: {0:.2f}".format(precision))
print("Test Recall: {0:.2f}".format(recall))
print("Test f1_score: {0:.2f}".format(f1_score))





Test Accuracy: 1.00
Test Precision: 1.00
Test Recall: 1.00
Test f1_score: 1.00
