In [1]:
!pip install torch transformers peft accelerate bitsandbytes datasets scikit-learn wandb --break-system-packages

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import torch
import random
import numpy as np
import wandb
from torch.utils.data import Dataset
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer,AutoModelForSequenceClassification,BitsAndBytesConfig,Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())
print(torch.cuda.device_count())


2.7.0+cu126
12.6
True
1


In [3]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33me274028[0m ([33me274028-metu-middle-east-technical-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [5]:
def parse_reviews_to_dataframe(dataset_dir):
    records = []

    for year_dir in os.listdir(dataset_dir):
        year_path = os.path.join(dataset_dir, year_dir)
        if not os.path.isdir(year_path):
            continue

        review_dir = os.path.join(year_path, f"{year_dir}_review")
        if not os.path.exists(review_dir):
            continue

        for fname in os.listdir(review_dir):
            if not (fname.endswith(".json") and "ICLR" in fname):
                continue

            file_path = os.path.join(review_dir, fname)
            with open(file_path, "r") as f:
                try:
                    data = json.load(f)
                except Exception as e:
                    print(f"Failed to parse {file_path}: {e}")
                    continue

            paper_id = data.get("id", fname.replace(".json", ""))
            meta_review = data.get("metaReview", "")
            reviews = data.get("reviews", [])

            review_texts = []
            rating_scores = []

            for review in reviews:
                review_text = review.get("review", "")
                rating_raw = review.get("rating", "")
                try:
                    rating_score = int(rating_raw.split(":")[0].strip())
                    rating_scores.append(rating_score)
                except Exception as e:
                    print(f"Invalid rating. Error is e: {e}")

                review_texts.append(review_text)

            full_text = " ".join(review_texts + [meta_review]).strip()
            avg_rating = sum(rating_scores) / len(rating_scores)
            label = 1 if avg_rating >= 6 else 0

            records.append({
                "paper_id": paper_id,
                "text": full_text,
                "avg_rating": avg_rating,
                "label": label
            })

    return pd.DataFrame(records)

# Example usage:
df = parse_reviews_to_dataframe("dataset")
print(df.head())

         paper_id                                               text  \
0   ICLR_2020_686  The paper studies scaling multi-hop QA to larg...   
1  ICLR_2020_1004  The paper presents an interesting connection b...   
2  ICLR_2020_1470  The authors propose a deep learning agent for ...   
3  ICLR_2020_1471  This paper proposes the variational hyper RNN ...   
4   ICLR_2020_228  The main claim of this paper is that a simple ...   

   avg_rating  label  
0    8.000000      1  
1    4.333333      0  
2    3.000000      0  
3    6.000000      1  
4    6.666667      1  


In [6]:
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

In [7]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, padding_side="right")
tokenizer.pad_token = tokenizer.eos_token 

In [8]:
class ReviewDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=1024):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        encoding = self.tokenizer(
            item["text"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(item["label"], dtype=torch.long)
        }


In [9]:
train_dataset = ReviewDataset(train_df, tokenizer)
val_dataset = ReviewDataset(val_df, tokenizer)
test_dataset = ReviewDataset(test_df, tokenizer)

In [10]:
# Load quantized model
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True,
                                bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    num_labels=2,
    device_map="auto"
)

model = prepare_model_for_kbit_training(model)


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    acc = accuracy_score(labels, predictions)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


sweep_config = {
    "method": "grid", 
    "metric": {
        "name": "eval/accuracy",
        "goal": "maximize"
    },
    "parameters": {
        "learning_rate": {"values": [2e-5, 3e-5]},
        "r": {"values": [1, 2]}
    }
}

def sweep_train():
    wandb.init()
    config = wandb.config

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=config.r,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        inference_mode=False
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        num_labels=2,
        device_map="auto"
    )
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        num_train_epochs=5,
        logging_dir="./logs",
        logging_steps=100,
        save_steps=500,
        eval_steps=500,
        save_total_limit=1,    
        save_strategy="steps",            
        metric_for_best_model="accuracy",  
        greater_is_better=True,     
        fp16=True,
        report_to="wandb",
        run_name="term-project"
    )


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    eval_results = trainer.evaluate(eval_dataset=test_dataset)
    print("Test Set Evaluation:", eval_results)
    wandb.log(eval_results)


In [None]:
sweep_id = wandb.sweep(sweep_config, project="tinyllama-lora")
wandb.agent(sweep_id, function=sweep_train)

Create sweep with ID: jotq009i
Sweep URL: https://wandb.ai/e274028-metu-middle-east-technical-university/tinyllama-lora/sweeps/jotq009i


[34m[1mwandb[0m: Agent Starting Run: pfqcd28o with config:
[34m[1mwandb[0m: 	learning_rate: 2e-05
[34m[1mwandb[0m: 	r: 1


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
100,1.216
200,1.6241
300,1.622
400,1.7574
500,1.5686
600,1.6877
700,1.7614
800,1.9501
900,1.7193
1000,2.3064


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
