README

*   Change the file paths to point to the directory where you store the datasets and models.
*   To only view the results, skip the training cell (`# run the entire pipeline`) and go straight to the results cell (`# results showing`).
*   Two best models are uploaded together with this script: `best_model_state_0.82.bin`, trained on the original pap dataset, and `best_model_state_0.84.bin`, trained on the augmented dataset (the original data plus the pep3k data). In the results cell, load the one whose results you wish to see.








In [None]:
# load google drive in colab for reading data and model save
# The '/content/drive' mount point is the prefix of every dataset and
# checkpoint path used further down in this notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# install library requirements
!pip install transformers torch pandas

In [14]:
# imports
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.metrics import precision_score, recall_score, accuracy_score


In [15]:
# dataset class
class SemanticDataset(Dataset):
    """Torch dataset that pairs raw texts with integer class labels.

    Items are tokenized lazily, on access, with the tokenizer handed to the
    constructor; every encoding is padded/truncated to ``max_len`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the flattened ``input_ids``/``attention_mask`` and the label tensor for one sample."""
        raw_text = str(self.texts[item])
        target = self.labels[item]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # The tokenizer output is flattened to drop the leading batch axis
        # requested via return_tensors='pt'.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample


In [16]:
# parameter configuration

# Tokenizer handed to every create_data_loader call in this notebook.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenization / batching
MAX_LEN, BATCH_SIZE = 128, 8

# Optimisation
EPOCHS, LEARNING_RATE = 1, 2e-5


In [17]:
#data loader function
def create_data_loader(df, tokenizer, max_len, batch_size, mode=1, shuffle=False):
    """Wrap a DataFrame in a ``SemanticDataset`` and return a ``DataLoader`` over it.

    Args:
        df: DataFrame with a ``text`` column and, when ``mode == 1``, an
            ``original_label`` column holding ``'plausible'`` / ``'implausible'``.
            For any other ``mode`` a numeric ``label_num`` column is read instead.
        tokenizer: tokenizer forwarded to ``SemanticDataset``.
        max_len: maximum token length forwarded to ``SemanticDataset``.
        batch_size: batch size of the returned loader.
        mode: ``1`` to derive numeric labels from ``original_label``.
        shuffle: forwarded to ``DataLoader``; defaults to ``False`` so existing
            callers keep the previous sequential order.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding the dataset's dict batches.

    Raises:
        ValueError: if ``mode == 1`` and some ``original_label`` values are not
            ``'plausible'`` or ``'implausible'``.
    """
    if mode == 1:
        label_mapping = {'plausible': 0, 'implausible': 1}
        # Map into a local Series so the caller's DataFrame is left untouched.
        labels = df['original_label'].map(label_mapping)
        unmapped = labels.isna().to_numpy()
        if unmapped.any():
            # Unmapped values would otherwise reach torch.tensor(..., dtype=long) as NaN.
            bad_values = sorted(df['original_label'][unmapped].astype(str).unique())
            raise ValueError(
                f"original_label contains values outside {sorted(label_mapping)}: {bad_values}"
            )
        labels = labels.astype('int64')
    else:
        labels = df['label_num']

    ds = SemanticDataset(
        texts=df['text'].to_numpy(),
        labels=labels.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, num_workers=4, shuffle=shuffle)


In [18]:
# model evaluate function
def eval_model(model, data_loader, device, n_examples):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: callable accepting ``input_ids``, ``attention_mask`` and ``labels``
            keyword arguments and returning an object with ``loss`` and ``logits``.
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: device the batch tensors are moved to.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: number of correct predictions divided by
        ``n_examples`` (a double tensor), and the plain average of the
        per-batch losses (a float).
    """
    model.eval()
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            result = model(input_ids=ids, attention_mask=mask, labels=targets)

            # Predicted class = index of the largest logit in each row.
            predicted = torch.max(result.logits, dim=1).indices
            n_correct += (predicted == targets).sum()
            batch_losses.append(result.loss.item())

    accuracy = n_correct.double() / n_examples
    mean_loss = sum(batch_losses) / len(batch_losses)
    return accuracy, mean_loss


In [19]:
# training function
def train_epoch(model, data_loader, optimizer, device, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: module returning an object with ``loss`` and ``logits`` when
            called with ``input_ids``, ``attention_mask`` and ``labels``.
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        n_examples: denominator used for the returned accuracy.

    Returns:
        ``(accuracy, mean_loss)``: number of correct predictions divided by
        ``n_examples`` (a double tensor), and the plain average of the
        per-batch losses (a float).
    """
    model.train()
    losses = []
    correct_predictions = 0
    trained_sample_count = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        # Count the samples actually in this batch: the final batch can be
        # shorter, and the loader's batch size need not match any global setting.
        trained_sample_count += labels.size(0)

        loss = outputs.loss
        correct_predictions += torch.sum(outputs.logits.argmax(1) == labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Progress line, printed only when the running count is a multiple of 50.
        if trained_sample_count % 50 == 0:
            print("finished:", trained_sample_count, "accuracy:", correct_predictions.double() / trained_sample_count)

    return correct_predictions.double() / n_examples, sum(losses) / len(losses)


In [20]:
# main training loop
def train_model(model, train_data_loader, val_data_loader, device, n_epochs,
                learning_rate=None,
                save_path='/content/drive/MyDrive/pap/best_model_state.bin'):
    """Train for ``n_epochs`` and checkpoint whenever validation accuracy improves.

    Args:
        model: model to optimise in place.
        train_data_loader: ``DataLoader`` over the training set.
        val_data_loader: ``DataLoader`` over the validation set.
        device: device forwarded to ``train_epoch`` / ``eval_model``.
        n_epochs: number of passes over the training data.
        learning_rate: optimizer learning rate; ``None`` (default) falls back
            to the notebook-level ``LEARNING_RATE``.
        save_path: file the best ``state_dict`` is written to.

    Returns:
        None. Progress is printed; the best weights are saved to ``save_path``.
    """
    best_accuracy = 0
    if learning_rate is None:
        learning_rate = LEARNING_RATE
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Take sample counts from the loaders that are actually iterated, not from
    # notebook globals that may describe different data.
    n_train_examples = len(train_data_loader.dataset)
    n_val_examples = len(val_data_loader.dataset)

    for epoch in range(n_epochs):
        print(f'Epoch {epoch + 1}/{n_epochs}')
        train_acc, train_loss = train_epoch(
            model,
            train_data_loader,
            optimizer,
            device,
            n_train_examples
        )
        print(f'Train loss {train_loss} accuracy {train_acc}')

        val_acc, val_loss = eval_model(
            model,
            val_data_loader,
            device,
            n_val_examples
        )
        print(f'Validation loss {val_loss} accuracy {val_acc}')

        # Keep only the weights of the best-scoring epoch on disk.
        if val_acc > best_accuracy:
            torch.save(model.state_dict(), save_path)
            best_accuracy = val_acc
            print('Saved Best Model')

    print ('model training finished')

In [21]:
# evaluate model
def evaluate(model, test_data_loader, device):
    """Print loss, accuracy, precision and recall of ``model`` on ``test_data_loader``.

    Args:
        model: module returning an object with ``loss`` and ``logits``.
        test_data_loader: sized iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: device the batch tensors are moved to.

    Returns:
        None. The four metrics are written to stdout.
    """
    model.eval()  # evaluation mode
    predicted_classes = []
    gold_classes = []
    running_loss = 0

    with torch.no_grad():
        for batch in test_data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            result = model(ids, attention_mask=mask, labels=gold)
            running_loss += result.loss.item()

            # Highest-logit class per row, moved to host memory for sklearn.
            predicted_classes.extend(torch.argmax(result.logits, dim=1).cpu().numpy())
            gold_classes.extend(gold.cpu().numpy())

    # Mean of the per-batch losses.
    mean_loss = running_loss / len(test_data_loader)

    prec = precision_score(gold_classes, predicted_classes)
    rec = recall_score(gold_classes, predicted_classes)
    acc = accuracy_score(gold_classes, predicted_classes)

    print(f'Test Loss: {mean_loss}')
    print(f'Accuracy: {acc}')
    print(f'Precision: {prec}')
    print(f'Recall: {rec}')


In [22]:
# load data


# 1 original training data: with pap dataset
#train_df = pd.read_csv('/content/drive/MyDrive/pap/train.csv')
#val_df = pd.read_csv('/content/drive/MyDrive/pap/dev.csv')
#test_df = pd.read_csv('/content/drive/MyDrive/pap/test.csv')

# 2 augmented training data: add pep3k data into original pap dataset
train_df1 = pd.read_csv('/content/drive/MyDrive/pap/train.csv')
train_df2 = pd.read_csv('/content/drive/MyDrive/pep3k/train.csv')
# pep3k stores numeric labels; convert them to the string labels pap uses.
label_mapping = {1:'plausible',0:'implausible'}
train_df2['original_label'] = train_df2['label'].map(label_mapping)
train_df2 = train_df2[['text', 'original_label']]
train_df1 = train_df1[['text', 'original_label']]

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
train_df = pd.concat([train_df1, train_df2], ignore_index=True)
# sample() returns a new frame, so the result must be assigned for the shuffle
# to take effect. A fixed seed keeps the order reproducible.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
val_df = pd.read_csv('/content/drive/MyDrive/pap/dev.csv')
test_df = pd.read_csv('/content/drive/MyDrive/pap/test.csv')


In [None]:
# run the entire pipeline
try:
    train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
    val_data_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)
    test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = RobertaForSequenceClassification.from_pretrained('roberta-base')
    # NOTE(review): training starts from the saved checkpoint loaded here, not
    # from plain roberta-base; drop this line to fine-tune from scratch.
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load('/content/drive/MyDrive/pap/best_model_state_0.84.bin', map_location=device))
    # model.load_state_dict(torch.load('/content/drive/MyDrive/pap/best_model_state_0.82.bin', map_location=device))
    model = model.to(device)

    train_model(model, train_data_loader, val_data_loader, device, EPOCHS)
    evaluate(model, test_data_loader, device)

except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise so the traceback is shown and later cells do not run against a
    # half-initialised state.
    raise


In [25]:
# results showing
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)
model_temp = RobertaForSequenceClassification.from_pretrained('roberta-base')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
model_temp.load_state_dict(torch.load('/content/drive/MyDrive/pap/best_model_state_0.84.bin', map_location=device)) # to see best result with pap and pep3k dataset
# model_temp.load_state_dict(torch.load('/content/drive/MyDrive/pap/best_model_state_0.82.bin', map_location=device)) # to see best result with original pap dataset
model_temp = model_temp.to(device)

evaluate(model_temp, test_data_loader, device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Loss: 0.4608197945424102
Accuracy: 0.8390804597701149
Precision: 0.8072289156626506
Recall: 0.8481012658227848
