In [1]:
from google.colab import drive
import zipfile
import os

# Mount Google Drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Define paths to zip file and extraction location
zip_file_path = '/content/drive/MyDrive/NLPProject/medium.zip'  # Replace with your zip file path
extract_path = '/content/input_folder'

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Files extracted to:", extract_path)

Files extracted to: /content/input_folder


In [3]:
train_path = '/content/input_folder/medium/train'
validation_path = '/content/input_folder/medium/validation'

print(os.listdir(train_path))

['truth-problem-622.json', 'truth-problem-2267.json', 'truth-problem-607.json', 'problem-2264.txt', 'truth-problem-811.json', 'problem-3083.txt', 'truth-problem-2847.json', 'problem-2363.txt', 'problem-885.txt', 'problem-2508.txt', 'problem-410.txt', 'problem-224.txt', 'problem-2011.txt', 'problem-639.txt', 'problem-222.txt', 'truth-problem-2629.json', 'problem-3570.txt', 'truth-problem-3735.json', 'truth-problem-3281.json', 'problem-1315.txt', 'problem-2343.txt', 'problem-1407.txt', 'truth-problem-1126.json', 'problem-4053.txt', 'problem-1670.txt', 'problem-318.txt', 'truth-problem-112.json', 'problem-2847.txt', 'problem-4009.txt', 'truth-problem-781.json', 'truth-problem-3434.json', 'problem-3911.txt', 'truth-problem-704.json', 'problem-3507.txt', 'truth-problem-739.json', 'truth-problem-1459.json', 'problem-85.txt', 'problem-4093.txt', 'problem-1350.txt', 'problem-2860.txt', 'truth-problem-3014.json', 'truth-problem-3189.json', 'truth-problem-2783.json', 'truth-problem-1132.json', '

In [4]:
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Define paths for training and validation datasets
train_text_path = '/content/input_folder/medium/train'

def load_data(text_path):
    data = []
    truth_data = []

    for file in os.listdir(text_path):
        if file.endswith(".txt"):
            problem_number = file.split('-')[1].split('.')[0]  # Extract the problem number
            with open(os.path.join(text_path, file), 'r', encoding='utf-8') as f:
                paragraphs = f.read().split('\n\n')  # Assuming paragraphs are separated by a double newline

            with open(os.path.join(text_path, f'truth-problem-{problem_number}.json'), 'r', encoding='utf-8') as f:
                truth = json.load(f)

            data.append(paragraphs)
            truth_data.append(truth)

    return data, truth_data

# Load training data
train_data, train_truths = load_data(train_text_path)

In [6]:
# Function to generate paragraph pairs and labels
def generate_paragraph_pairs(paragraphs, truth_changes):
    pairs = []
    labels = []
    # Check lengths of paragraphs and truth_changes
    if len(paragraphs) - 1 != len(truth_changes):
        truth_changes.append(0)
        truth_changes.append(0)
        truth_changes.append(0)

    # Generate pairs from consecutive paragraphs
    for i in range(len(paragraphs) - 1):
        p1 = paragraphs[i]
        p2 = paragraphs[i + 1]
        change = truth_changes[i]
        # Create paragraph pairs
        pairs.append((p1, p2))
        labels.append(change)

    return pairs, labels

all_pairs = []
all_labels = []
paragraphs = []
for i, (data, truth) in enumerate(zip(train_data, train_truths)):
    paragraphs = [para.split('\n') for para in data]
    pairs, labels = generate_paragraph_pairs(paragraphs[0], train_truths[i]["changes"])
    all_pairs.extend(pairs)
    all_labels.extend(labels)


print(len(all_pairs))
print(len(all_labels))


21919
21919


In [7]:
!pip install peft



In [7]:
from transformers import AutoModelForSequenceClassification

base_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    task_type="SEQ_CLS",       # Task type (Sequence Classification in this case)
    r=16,                      # Rank of the LoRA matrices
    lora_alpha=32,             # Scaling factor for LoRA
    lora_dropout=0.1,          # Dropout applied to LoRA layers
    target_modules=["query", "key"]  # Apply LoRA to attention layers (e.g., query/key projections)
)

# Wrap the model with LoRA
model = get_peft_model(base_model, lora_config)


In [9]:
def tokenize_paragraph_pairs(paragraph_pairs, tokenizer):
    # Tokenize pairs of paragraphs
    tokenized_data = tokenizer(
        [f"{p1} [SEP] {p2}" for p1, p2 in paragraph_pairs],
        truncation=True,
        padding="max_length",  # Ensure fixed input size
        return_tensors="pt",   # Return PyTorch tensors
    )
    return tokenized_data

In [10]:
from torch.utils.data import Dataset
import torch # Import torch

class ParagraphPairDataset(Dataset):
    def __init__(self, paragraph_pairs, labels, tokenizer):
        # Tokenize the paragraph pairs
        tokenized_data = tokenize_paragraph_pairs(paragraph_pairs, tokenizer)
        self.input_ids = tokenized_data["input_ids"]
        self.attention_mask = tokenized_data.get("attention_mask", None)  # Optional
        # Convert labels to tensors
        self.labels = torch.tensor(labels, dtype=torch.long)  # For classification tasks

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        data = {
            "input_ids": self.input_ids[idx],
            "labels": self.labels[idx],
        }
        # Include attention_mask if it exists
        if self.attention_mask is not None:
            data["attention_mask"] = self.attention_mask[idx]
        return data


In [11]:
# Initialize tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Create the dataset
dataset = ParagraphPairDataset(all_pairs, all_labels, tokenizer)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [12]:
# Split into train and evaluation datasets
from sklearn.model_selection import train_test_split

train_pairs, eval_pairs, train_labels, eval_labels = train_test_split(all_pairs, all_labels, test_size=0.2)
train_dataset = ParagraphPairDataset(train_pairs, train_labels, tokenizer)
eval_dataset = ParagraphPairDataset(eval_pairs, eval_labels, tokenizer)

In [13]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-4,  # Higher learning rate is often used for LoRA
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.4039,0.376532
2,0.3783,0.363855
3,0.3572,0.375156


TrainOutput(global_step=6576, training_loss=0.3996161625623123, metrics={'train_runtime': 4015.1693, 'train_samples_per_second': 13.102, 'train_steps_per_second': 1.638, 'total_flos': 1.403196385941504e+16, 'train_loss': 0.3996161625623123, 'epoch': 3.0})

In [14]:
# Save the LoRA model
model.save_pretrained("./lora_fine_tuned_model_medium")
tokenizer.save_pretrained("./lora_fine_tuned_model_medium")

# Zip the model files
!zip -r lora_fine_tuned_model_medium.zip ./lora_fine_tuned_model_medium

# Download the zip file
from google.colab import files
files.download("lora_fine_tuned_model_medium.zip")

  adding: lora_fine_tuned_model_medium/ (stored 0%)
  adding: lora_fine_tuned_model_medium/adapter_config.json (deflated 52%)
  adding: lora_fine_tuned_model_medium/tokenizer_config.json (deflated 76%)
  adding: lora_fine_tuned_model_medium/vocab.json (deflated 59%)
  adding: lora_fine_tuned_model_medium/tokenizer.json (deflated 82%)
  adding: lora_fine_tuned_model_medium/special_tokens_map.json (deflated 52%)
  adding: lora_fine_tuned_model_medium/adapter_model.safetensors (deflated 7%)
  adding: lora_fine_tuned_model_medium/merges.txt (deflated 53%)
  adding: lora_fine_tuned_model_medium/README.md (deflated 66%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
test_text_path = '/content/input_folder/medium/validation'

test_data, test_truths = load_data(test_text_path)


In [16]:
test_pairs = []
test_labels = []
paragraphs = []
for i, (data, truth) in enumerate(zip(test_data, test_truths)):
    paragraphs = [para.split('\n') for para in data]
    pairs, labels = generate_paragraph_pairs(paragraphs[0], test_truths[i]["changes"])
    test_pairs.extend(pairs)
    test_labels.extend(labels)


print(len(test_pairs))
print(len(test_labels))

4592
4592


In [17]:
# Load the test dataset
test_dataset = ParagraphPairDataset(test_pairs, test_labels, tokenizer)

# Use the Trainer's predict method to get predictions
predictions = trainer.predict(test_dataset)



In [18]:
import numpy as np

# Convert logits to predicted classes
predicted_classes = np.argmax(predictions.predictions, axis=1)

# Extract true labels
true_labels = predictions.label_ids


In [19]:
from sklearn.metrics import f1_score

# Calculate F1 score
f1 = f1_score(true_labels, predicted_classes, average="weighted")  # Use "micro" or "macro" as needed
print(f"F1 Score: {f1:.4f}")


F1 Score: 0.8205
