In [2]:
from google.colab import drive
import zipfile
import os

# Mount Google Drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
# Define paths to zip file and extraction location
zip_file_path = '/content/drive/MyDrive/easy.zip'  # Replace with your zip file path
extract_path = '/content/input_folder'

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Files extracted to:", extract_path)

Files extracted to: /content/input_folder


In [6]:
validation_path = '/content/input_folder/easy/validation'
print(os.listdir(validation_path))

['problem-120.txt', 'truth-problem-678.json', 'truth-problem-713.json', 'problem-230.txt', 'problem-550.txt', 'truth-problem-779.json', 'problem-388.txt', 'problem-224.txt', 'problem-809.txt', 'problem-735.txt', 'truth-problem-706.json', 'problem-786.txt', 'truth-problem-815.json', 'problem-107.txt', 'truth-problem-477.json', 'problem-826.txt', 'problem-553.txt', 'truth-problem-374.json', 'truth-problem-16.json', 'truth-problem-87.json', 'truth-problem-268.json', 'problem-563.txt', 'problem-777.txt', 'problem-286.txt', 'problem-862.txt', 'truth-problem-128.json', 'truth-problem-250.json', 'truth-problem-843.json', 'truth-problem-693.json', 'truth-problem-497.json', 'truth-problem-327.json', 'truth-problem-401.json', 'truth-problem-33.json', 'problem-895.txt', 'truth-problem-39.json', 'problem-35.txt', 'problem-213.txt', 'truth-problem-827.json', 'problem-813.txt', 'problem-390.txt', 'problem-274.txt', 'problem-593.txt', 'problem-353.txt', 'truth-problem-734.json', 'problem-784.txt', 'p

In [7]:
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Define paths for training and validation datasets
val_text_path = '/content/input_folder/easy/validation'

def load_data(text_path):
    data = []
    truth_data = []

    for file in os.listdir(text_path):
        if file.endswith(".txt"):
            problem_number = file.split('-')[1].split('.')[0]  # Extract the problem number
            with open(os.path.join(text_path, file), 'r', encoding='utf-8') as f:
                paragraphs = f.read().split('\n\n')  # Assuming paragraphs are separated by a double newline

            with open(os.path.join(text_path, f'truth-problem-{problem_number}.json'), 'r', encoding='utf-8') as f:
                truth = json.load(f)

            data.append(paragraphs)
            truth_data.append(truth)

    return data, truth_data

# Load valing data
val_data, val_truths = load_data(val_text_path)

In [8]:
# Function to generate paragraph pairs and labels
def generate_paragraph_pairs(paragraphs, truth_changes):
    pairs = []
    labels = []
    # Check lengths of paragraphs and truth_changes
    if len(paragraphs) - 1 != len(truth_changes):
        truth_changes.append(0)
        truth_changes.append(0)
        truth_changes.append(0)

    # Generate pairs from consecutive paragraphs
    for i in range(len(paragraphs) - 1):
        p1 = paragraphs[i]
        p2 = paragraphs[i + 1]
        change = truth_changes[i]
        # Create paragraph pairs
        pairs.append((p1, p2))
        labels.append(change)

    return pairs, labels

all_pairs = []
all_labels = []
paragraphs = []
for i, (data, truth) in enumerate(zip(val_data, val_truths)):
    paragraphs = [para.split('\n') for para in data]
    pairs, labels = generate_paragraph_pairs(paragraphs[0], val_truths[i]["changes"])
    all_pairs.extend(pairs)
    all_labels.extend(labels)


print(len(all_pairs))
print(len(all_labels))


2471
2471


In [9]:
!pip install peft



In [12]:
import zipfile
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel

# Unzip the model
zip_path = "/content/lora_fine_tuned_model.zip"  # Path to your zip file
extract_path = "/content/lora_fine_tuned_model/lora_fine_tuned_model"  # Directory to extract the files

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(extract_path)
base_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model = PeftModel.from_pretrained(base_model, extract_path)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def tokenize_paragraph_pairs(paragraph_pairs, tokenizer):
    # Tokenize pairs of paragraphs
    tokenized_data = tokenizer(
        [f"{p1} [SEP] {p2}" for p1, p2 in paragraph_pairs],
        truncation=True,
        padding="max_length",  # Ensure fixed input size
        return_tensors="pt",   # Return PyTorch tensors
    )
    return tokenized_data

In [14]:
from torch.utils.data import Dataset
import torch # Import torch

class ParagraphPairDataset(Dataset):
    def __init__(self, paragraph_pairs, labels, tokenizer):
        # Tokenize the paragraph pairs
        tokenized_data = tokenize_paragraph_pairs(paragraph_pairs, tokenizer)
        self.input_ids = tokenized_data["input_ids"]
        self.attention_mask = tokenized_data.get("attention_mask", None)  # Optional
        # Convert labels to tensors
        self.labels = torch.tensor(labels, dtype=torch.long)  # For classification tasks

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        data = {
            "input_ids": self.input_ids[idx],
            "labels": self.labels[idx],
        }
        # Include attention_mask if it exists
        if self.attention_mask is not None:
            data["attention_mask"] = self.attention_mask[idx]
        return data


In [15]:
dataset = ParagraphPairDataset(all_pairs, all_labels, tokenizer)

In [24]:
from transformers import Trainer

predictions = Trainer(model=model).predict(dataset)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [25]:
import numpy as np

# Convert logits to predicted classes
predicted_classes = np.argmax(predictions.predictions, axis=1)

# Extract true labels
true_labels = predictions.label_ids


In [26]:
from sklearn.metrics import f1_score

# Calculate F1 score
f1 = f1_score(true_labels, predicted_classes, average="weighted")  # Use "micro" or "macro" as needed
print(f"F1 Score: {f1:.4f}")


F1 Score: 0.9302
