In [1]:
"""
This notebook is depreciated. Accuracy results were below baselines.
"""

!pip install torch transformers peft accelerate bitsandbytes datasets scikit-learn wandb --break-system-packages

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import torch
import random
import numpy as np
import wandb
from torch.utils.data import Dataset
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,Trainer, TrainingArguments,EvalPrediction
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from transformers.modeling_outputs import SequenceClassifierOutput
from torch.utils.data import DataLoader
import re
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())
print(torch.cuda.device_count())

2.7.0+cu126
12.6
True
1


In [3]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [4]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33me274028[0m ([33me274028-metu-middle-east-technical-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [6]:
def clean_text_keep_english_and_punctuation(text):
    return re.sub(r"[^a-zA-Z0-9\s.,:;!?()\[\]\"'-]", "", text)

def parse_reviews_to_dataframe(dataset_dir, tokenizer=None, max_tokens=1900):
    records = []

    for year_dir in os.listdir(dataset_dir):
        year_path = os.path.join(dataset_dir, year_dir)
        if not os.path.isdir(year_path):
            continue

        review_dir = os.path.join(year_path, f"{year_dir}_review")
        if not os.path.exists(review_dir):
            continue

        for fname in os.listdir(review_dir):
            if not (fname.endswith(".json") and "ICLR" in fname):
                continue

            file_path = os.path.join(review_dir, fname)
            with open(file_path, "r") as f:
                try:
                    data = json.load(f)
                except Exception as e:
                    print(f"Failed to parse {file_path}: {e}")
                    continue

            paper_id = data.get("id", fname.replace(".json", ""))
            meta_review = data.get("metaReview", "")
            reviews = data.get("reviews", [])

            review_texts = []
            rating_scores = []

            for review in reviews:
                review_text = review.get("review", "")
                rating_raw = review.get("rating", "")
                try:
                    rating_score = int(rating_raw.split(":")[0].strip())
                    rating_scores.append(rating_score)
                except Exception as e:
                    print(f"Invalid rating. Error is e: {e}")

                review_texts.append(review_text)

            #full_text = " ".join(review_texts + [meta_review]).strip()
            #full_text = clean_text_keep_english_and_punctuation(full_text)
            #if tokenizer is not None:
            #    tokens = tokenizer(full_text, truncation=True, max_length=max_tokens, return_tensors="pt")
            #    full_text = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)

            if meta_review == '':
                continue
            if len(rating_scores) == 0:
                continue  # boş puanlı kayıtları atla

            avg_rating = sum(rating_scores) / len(rating_scores)
            label = 1 if avg_rating >= 6 else 0

            records.append({
                "paper_id": paper_id,
                "text": meta_review,
                "avg_rating": avg_rating,
                "label": label
            })

    return pd.DataFrame(records)

# Example usage:
df = parse_reviews_to_dataframe("dataset")
print(df.head())

         paper_id                                               text  \
0   ICLR_2020_686  This paper proposes a novel architecture for q...   
1  ICLR_2020_1004  This paper provides a novel approach for addre...   
2  ICLR_2020_1470  This paper proposes a new training method for ...   
3  ICLR_2020_1471  The paper proposes a neural network architectu...   
4   ICLR_2020_228  This paper provides a surprising result: that ...   

   avg_rating  label  
0    8.000000      1  
1    4.333333      0  
2    3.000000      0  
3    6.000000      1  
4    6.666667      1  


In [7]:
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

In [8]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, padding_side="right")
tokenizer.pad_token = tokenizer.eos_token 

In [9]:

def build_few_shot_prompt(support_examples, query_text):
    prompt_parts = []

    for support_text, support_label in support_examples:
        label_str = "accepted" if support_label == 1 else "rejected"
        prompt_parts.append(
            f"Meta Review:\n{support_text.strip()}\nPrediction: {label_str}"
        )

    prompt_parts.append(
        "Now read the following meta review and guess whether the paper was accepted or rejected for the conference."
    )
    prompt_parts.append(
        f"\nMeta Review:\n{query_text.strip()}\nYour Prediction in One Word Accepted or Rejected:"
    )

    return "\n\n".join(prompt_parts)

class FewShotReviewDataset(Dataset):
    def __init__(self, dataframe, tokenizer, support_examples):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.support_examples = support_examples

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        query = self.data.iloc[idx]["text"]

        # Prompt construction moved to external helper
        prompt = build_few_shot_prompt(self.support_examples, query)

        # Tokenize to get token count
        tokens = self.tokenizer(prompt, return_tensors="pt")["input_ids"][0]

        # Re-tokenize for padding and attention mask
        encoding = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": self.data.iloc[idx]["label"],
            "prompt": prompt  # optional, useful for logging/debugging
        }

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=0
    )
    acc = accuracy_score(labels, predictions)

    metrics = {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

    if logits.shape[1] == 2:
        try:
            probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)[:, 1].numpy()
            auc = roc_auc_score(labels, probs)
            metrics["auc"] = auc
        except Exception as e:
            print(f"Warning: AUC calculation failed: {e}")

    return metrics

def evaluate(dataset, model, tokenizer, name="Validation"):
    model.eval()
    loader = DataLoader(dataset, batch_size=1)

    preds = []
    labels = []

    for batch in loader:
        query_label = batch["label"].item()
        input_ids = batch["input_ids"].to(model.device)
        attention_mask = batch["attention_mask"].to(model.device)
        prompt = batch["prompt"][0]

        with torch.no_grad():
            output = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=5
            )

        decoded = tokenizer.decode(output[0], skip_special_tokens=True).lower()

        prediction_part = decoded.split("your prediction in one word accepted or rejected:", 3)[-1].strip()
        
        if "rejected" in prediction_part:
            pred = 0
        else:
            pred = 1
        
        #print(decoded)
        #rint(pred)
        preds.append(pred)
        labels.append(query_label)
        #print(query_label)
        
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary", zero_division=0)

    print(f"=== {name} ===")
    print(f"accuracy: {acc:.4f}")
    print(f"precision: {precision:.4f}")
    print(f"recall: {recall:.4f}")
    print(f"f1: {f1:.4f}")


    
def get_balanced_support_examples(df, n_shots):
    half = n_shots // 2
    accepted = df[df["label"] == 1].sample(n=half, random_state=42)
    rejected = df[df["label"] == 0].sample(n=half, random_state=42)
    return list(accepted[["text", "label"]].itertuples(index=False, name=None)) + \
           list(rejected[["text", "label"]].itertuples(index=False, name=None))

def sweep_prompt_eval():
    wandb.init()
    config = wandb.config

    # === Select support examples dynamically ===
    n_shots = config.n_shots
    support_examples = get_balanced_support_examples(train_df, config.n_shots)

    val_dataset = FewShotReviewDataset(val_df, tokenizer, support_examples)
    test_dataset = FewShotReviewDataset(test_df, tokenizer, support_examples)
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )
    
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16
    )
    
    base_model = prepare_model_for_kbit_training(base_model)
    
    peft_config = LoraConfig(
        r=config.r,
        lora_alpha=config.lora_alpha,
        lora_dropout=config.lora_dropout,
        target_modules=["q_proj", "v_proj"],
        task_type="CAUSAL_LM"
    )
    
    model = get_peft_model(base_model, peft_config)
    model.eval()
    evaluate(val_dataset, model, tokenizer, name="Validation")
    evaluate(test_dataset, model, tokenizer, name="Test")

    wandb.finish()

In [12]:
sweep_config = {
    "method": "grid",
    "metric": {"name": "test_accuracy", "goal": "maximize"},
    "parameters": {
        "n_shots": {"values": [0,2,4]},
        "r": {"values": [4, 8]},
        "lora_alpha": {"values": [16, 32]},
        "lora_dropout": {"values": [0.05, 0.1]}
    }
}


In [13]:
sweep_id = wandb.sweep(sweep_config, project="tinyllama-fewshot")
wandb.agent(sweep_id, function=sweep_prompt_eval, count=12)

Create sweep with ID: cy1gb8xv
Sweep URL: https://wandb.ai/e274028-metu-middle-east-technical-university/tinyllama-fewshot/sweeps/cy1gb8xv


[34m[1mwandb[0m: Agent Starting Run: r0kokxn2 with config:
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.05
[34m[1mwandb[0m: 	n_shots: 0
[34m[1mwandb[0m: 	r: 4


=== Validation ===
accuracy: 0.6578
precision: 0.4813
recall: 0.8897
f1: 0.6247
=== Test ===
accuracy: 0.6564
precision: 0.4818
recall: 0.9041
f1: 0.6286


[34m[1mwandb[0m: Agent Starting Run: uauq7n0v with config:
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.05
[34m[1mwandb[0m: 	n_shots: 0
[34m[1mwandb[0m: 	r: 8


=== Validation ===
accuracy: 0.6578
precision: 0.4813
recall: 0.8897
f1: 0.6247
=== Test ===
accuracy: 0.6564
precision: 0.4818
recall: 0.9041
f1: 0.6286


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: gsf253rc with config:
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.05
[34m[1mwandb[0m: 	n_shots: 2
[34m[1mwandb[0m: 	r: 4


=== Validation ===
accuracy: 0.4879
precision: 0.3795
recall: 0.9448
f1: 0.5415
=== Test ===
accuracy: 0.5286
precision: 0.4023
recall: 0.9589
f1: 0.5668


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0q82bhjn with config:
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.05
[34m[1mwandb[0m: 	n_shots: 2
[34m[1mwandb[0m: 	r: 8


=== Validation ===
accuracy: 0.4879
precision: 0.3795
recall: 0.9448
f1: 0.5415
=== Test ===
accuracy: 0.5286
precision: 0.4023
recall: 0.9589
f1: 0.5668


[34m[1mwandb[0m: Agent Starting Run: uk7maq5n with config:
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.05
[34m[1mwandb[0m: 	n_shots: 4
[34m[1mwandb[0m: 	r: 4


=== Validation ===
accuracy: 0.3620
precision: 0.3310
recall: 0.9724
f1: 0.4939
=== Test ===
accuracy: 0.3789
precision: 0.3404
recall: 0.9932
f1: 0.5070


[34m[1mwandb[0m: Agent Starting Run: veou8kgc with config:
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.05
[34m[1mwandb[0m: 	n_shots: 4
[34m[1mwandb[0m: 	r: 8


[34m[1mwandb[0m: [32m[41mERROR[0m Run veou8kgc errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/home/cagri/.local/lib/python3.12/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_960510/2086419514.py", line 105, in sweep_prompt_eval
[34m[1mwandb[0m: [32m[41mERROR[0m     base_model = prepare_model_for_kbit_training(base_model)
[34m[1mwandb[0m: [32m[41mERROR[0m                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/home/cagri/.local/lib/python3.12/site-packages/peft/utils/other.py", line 141, in prepare_model_for_kbit_training
[34m[1mwandb[0m: [32m[41mERROR[0m     param.data = param.data.to(torch.float32)
[34m[1mwandb[0m: [32m[41mERROR[0m                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34

[34m[1mwandb[0m: [32m[41mERROR[0m Run nzu81brh errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/home/cagri/.local/lib/python3.12/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_960510/2086419514.py", line 98, in sweep_prompt_eval
[34m[1mwandb[0m: [32m[41mERROR[0m     base_model = AutoModelForCausalLM.from_pretrained(
[34m[1mwandb[0m: [32m[41mERROR[0m                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/home/cagri/.local/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
[34m[1mwandb[0m: [32m[41mERROR[0m     return model_class.from_pretrained(
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32

=== Validation ===
accuracy: 0.6578
precision: 0.4813
recall: 0.8897
f1: 0.6247
=== Test ===
accuracy: 0.6564
precision: 0.4818
recall: 0.9041
f1: 0.6286


[34m[1mwandb[0m: Agent Starting Run: p57bqckn with config:
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	n_shots: 2
[34m[1mwandb[0m: 	r: 4


=== Validation ===
accuracy: 0.4879
precision: 0.3795
recall: 0.9448
f1: 0.5415
=== Test ===
accuracy: 0.5286
precision: 0.4023
recall: 0.9589
f1: 0.5668


[34m[1mwandb[0m: Agent Starting Run: lbuifmzt with config:
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	n_shots: 2
[34m[1mwandb[0m: 	r: 8


=== Validation ===
accuracy: 0.4879
precision: 0.3795
recall: 0.9448
f1: 0.5415
=== Test ===
accuracy: 0.5286
precision: 0.4023
recall: 0.9589
f1: 0.5668


[34m[1mwandb[0m: Agent Starting Run: okszxjr4 with config:
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	n_shots: 4
[34m[1mwandb[0m: 	r: 4


[34m[1mwandb[0m: [32m[41mERROR[0m Run okszxjr4 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/home/cagri/.local/lib/python3.12/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_960510/2086419514.py", line 105, in sweep_prompt_eval
[34m[1mwandb[0m: [32m[41mERROR[0m     base_model = prepare_model_for_kbit_training(base_model)
[34m[1mwandb[0m: [32m[41mERROR[0m                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/home/cagri/.local/lib/python3.12/site-packages/peft/utils/other.py", line 141, in prepare_model_for_kbit_training
[34m[1mwandb[0m: [32m[41mERROR[0m     param.data = param.data.to(torch.float32)
[34m[1mwandb[0m: [32m[41mERROR[0m                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34

[34m[1mwandb[0m: [32m[41mERROR[0m Run sbujqwvs errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/home/cagri/.local/lib/python3.12/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_960510/2086419514.py", line 98, in sweep_prompt_eval
[34m[1mwandb[0m: [32m[41mERROR[0m     base_model = AutoModelForCausalLM.from_pretrained(
[34m[1mwandb[0m: [32m[41mERROR[0m                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/home/cagri/.local/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
[34m[1mwandb[0m: [32m[41mERROR[0m     return model_class.from_pretrained(
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32