# LLM Authorship Attribution (JavaScript)

This Google Colab contains the code associated with the paper **The Hidden DNA of LLM-Generated JavaScript: Structural Patterns Enable High-Accuracy Authorship Attribution**, accepted at the 3rd International Workshop on Large Language Models for Code (LLM4Code ’26).

**Description:** This notebook trains the modified CodeT5 model used for LLM authorship attribution. It provides all the steps needed to fine-tune CodeT5 so that it can identify which LLM generated a given piece of JavaScript code.

## STEP 0: Installing dependencies

In [None]:
#STEP 0: Install dependencies, download the dataset, prepare helper functions, and import all required packages

'''
OPTIMIZATION for A100
The following optimizations were applied to the CodeT5 training pipeline to maximize efficiency on the A100 GPU.

  a. bf16: Uses 16-bit math instead of full 32-bit. Same accuracy in practice, but faster and uses less GPU memory.
  b. Dynamic padding: Instead of always padding to 512/1024 tokens, pads only to the longest sequence in a batch. Cuts waste, speeds things up.
  c. Fused AdamW: Optimizer with CUDA-fused kernels — same math as normal AdamW but runs faster and with less overhead.
  d. TF32: Special Ampere hardware math mode that keeps 32-bit range but uses 10-bit mantissa. Almost same accuracy as FP32, but much faster on matrix multiplies.
'''

import os
# Install dependencies
print("[*] Installing dependencies...")
!pip install --upgrade transformers > /dev/null 2>&1
print("[*] Installation: DONE")

# Load packages
import torch, json, tempfile, shutil, zipfile, datetime
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import numpy as np
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
print("[*] Loading: DONE")

# Download dataset
print("[*] Downloading LLM-NodeJS Medium dataset")
!wget -O LLM-NodeJS-medium.json.zip https://github.com/LLM-NodeJS-dataset/LLM-NodeJS-dataset/releases/download/LLM-NodeJS-medium/LLM-NodeJS-medium.json.zip > /dev/null 2>&1
!unzip -o LLM-NodeJS-medium.json.zip > /dev/null 2>&1
!rm LLM-NodeJS-medium.json.zip > /dev/null 2>&1
print("[*] Dataset downloaded: DONE")

# Load dataset
PATH = '/content/LLM-NodeJS-medium.json'
with open(PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)
rows = data if isinstance(data, list) else [data]
df = pd.json_normalize(rows, sep='.')
models = df['model_name'].dropna().astype(str).unique().tolist()
print("[*] Dataset processed: DONE")

# Check if CUDA (GPU) is available
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    device_count = torch.cuda.device_count()
    print(f"[*] GPU in available: {device_name}")
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
    if torch.cuda.is_available():
        torch.backends.cuda.matmul.fp32_precision = "tf32"
        torch.backends.cudnn.conv.fp32_precision = "tf32"
else:
    print("[*] CUDA is NOT available. Using CPU only.")

print(f"[*] Available models in dataset: [{', '.join(models)}]")


[*] Installing dependencies...
[*] Installation: DONE
[*] Loading: DONE
[*] Downloading LLM-NodeJS Medium dataset
[*] Dataset downloaded: DONE
[*] Dataset processed: DONE
[*] GPU in available: NVIDIA A100-SXM4-40GB
[*] Available models in dataset: [codestral-2508, deepseek-v3.1, gemini-2.0-flash, gemini-2.5-flash-lite, gemma-3-27b, gpt-4o, gpt-4o-mini, gpt-5-mini, gpt-5-nano, gpt-oss-120b, grok-3-mini, grok-code-fast-1, llama-3.1-8b, llama-3.3-70b, llama-4-scout, mixtral-8x7b, phi-4-reasoning-plus, qwen-2.5-7b, qwen-2.5-coder-32b, qwen3-coder]


In [None]:

# ------------------------------------------------------------------------------------------------------------------------------------
# CodeT5-specific helpers
# ------------------------------------------------------------------------------------------------------------------------------------
import transformers
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import SequenceClassifierOutput


def load_tokenizer():
    """Return the tokenizer that belongs to the CodeT5+ 770M checkpoint.

    The tokenizer is obtained through ``transformers.AutoTokenizer`` using the
    ``Salesforce/codet5p-770m`` identifier.
    """
    checkpoint_id = "Salesforce/codet5p-770m"
    tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint_id)
    return tokenizer

def load_model():
    """Load the encoder of the CodeT5+ 770M checkpoint in bfloat16.

    Only ``transformers.T5EncoderModel`` is instantiated, so the decoder layers
    stored in the checkpoint are not part of the returned model.

    Returns:
        The ``T5EncoderModel`` loaded from ``Salesforce/codet5p-770m`` with
        ``dtype=torch.bfloat16``.
    """
    print('Loading CodeT5 model without decocder layers...')
    # The checkpoint also contains decoder weights; mark them as expected leftovers
    # so loading an encoder-only model does not report them as unexpected keys.
    transformers.T5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"]
    encoder = transformers.T5EncoderModel.from_pretrained(
        "Salesforce/codet5p-770m", dtype=torch.bfloat16
    )
    # Report success only after from_pretrained has actually returned.
    print("Modael loaded.")
    return encoder

class CodeT5_Classifier(torch.nn.Module):
    """Sequence classifier built on top of a pretrained encoder.

    The hidden state at the first sequence position is passed through
    dropout -> linear -> tanh -> dropout -> linear to produce class logits.

    Args:
        pretrained_encoder: Encoder module. It must expose ``.config`` and be
            callable with ``input_ids`` / ``attention_mask``, returning either an
            object with ``last_hidden_state`` or a tuple whose first item is the
            hidden states.
        hidden_dim: Size of the encoder's hidden states (input of the head).
        num_labels: Number of target classes. Also written to
            ``pretrained_encoder.config.num_labels``.
        head_dim: Width of the intermediate projection layer (default 768).
        dropout: Dropout probability applied before and after the projection
            (default 0.2).
    """

    def __init__(self, pretrained_encoder, hidden_dim, num_labels=20, head_dim=768, dropout=0.2):
        super().__init__()
        self.encoder = pretrained_encoder
        # The encoder's config object is shared (not copied) and updated in place.
        self.config = pretrained_encoder.config
        self.config.num_labels = num_labels

        # The head is created in bfloat16, the dtype the notebook loads the encoder in.
        self.pre_classifier = torch.nn.Linear(hidden_dim, head_dim, dtype=torch.bfloat16)
        self.activation = torch.nn.Tanh()
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(head_dim, num_labels, dtype=torch.bfloat16)
        self.loss_fn = CrossEntropyLoss()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Run the encoder and the classification head.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional class indices; when given, a cross-entropy loss is
                computed on the logits.
            **kwargs: Accepted and ignored.

        Returns:
            ``SequenceClassifierOutput`` with ``logits`` and ``loss`` (``None``
            when no labels were passed).
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state if hasattr(outputs, "last_hidden_state") else outputs[0]

        # Pool by taking the hidden state of the first position of every sequence.
        cls_output = hidden_state[:, 0]
        cls_output = self.dropout(cls_output)
        pooled = self.pre_classifier(cls_output)
        pooled = self.activation(pooled)
        pooled = self.dropout(pooled)
        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)



# ------------------------------------------------------------------------------------------------------------------------------------
# Training helpers (save, metrics, visualization)
# ------------------------------------------------------------------------------------------------------------------------------------


def COMPUTE_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    precision, recall, f_score, _ = precision_recall_fscore_support(
        y_true, y_hat, average="weighted"
    )
    metrics = {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f_score,
        "precision": precision,
        "recall": recall,
    }
    return metrics

def SAVE_confusion_matrix(trainer, test_ds, label_encoder, out_path="confusion_matrix.pdf"):
    """Predict on ``test_ds``, plot the confusion matrix and save it to ``out_path``.

    Args:
        trainer: Object whose ``predict(test_ds)`` returns ``label_ids`` and
            ``predictions`` (per-class scores).
        test_ds: Dataset handed to ``trainer.predict``.
        label_encoder: Fitted encoder; ``classes_`` supplies the axis labels and
            class ``i`` is expected to be encoded as integer ``i``.
        out_path: File the figure is written to.
    """
    predictions = trainer.predict(test_ds)
    y_true = predictions.label_ids
    y_pred = predictions.predictions.argmax(axis=1)

    class_names = label_encoder.classes_
    n_classes = len(class_names)
    # Pin the label set: without it, a class that occurs neither in y_true nor in
    # y_pred is dropped from the matrix and the tick labels no longer line up.
    conf_matrix = confusion_matrix(y_true, y_pred, labels=list(range(n_classes)))

    # Keep the original 8x6 size for small problems, grow it when there are many classes.
    plt.figure(figsize=(max(8, 0.6 * n_classes), max(6, 0.5 * n_classes)))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names,
                yticklabels=class_names)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    # Save before show(): the figure is written while it is still the active one.
    plt.savefig(out_path)
    plt.show()
    plt.close()

    print(f"[*] Confusion matrix saved to: {out_path}")

def get_dataframes(df, *, sample_size=None, test_size=0, val_size=0, random_state=42):
    """Split ``df`` into train/validation/test frames grouped by the ``prompt`` column.

    All rows sharing a prompt end up in the same split, so no prompt is shared
    between train, validation and test.

    Args:
        df: DataFrame with a ``prompt`` column.
        sample_size: Optional approximate number of rows to keep. Whole prompt
            groups are sampled, so the resulting row count is only approximate;
            a positive value always keeps at least one group. ``None`` keeps
            everything.
        test_size: Fraction of prompt groups for the test split (0 disables it).
        val_size: Fraction of prompt groups (relative to the full data) for the
            validation split (0 disables it).
        random_state: Seed used for group sampling and for both splits.

    Returns:
        ``(train_df, val_df, test_df)``. Disabled splits are returned as empty
        DataFrames with the same columns as ``df``.

    Raises:
        ValueError: If ``test_size + val_size`` reaches or exceeds 1.0.
    """
    groups = df["prompt"].unique()

    current_df = df

    if sample_size is not None:
        if sample_size < len(df):
            avg_rows_per_group = len(df) / len(groups)
            n_groups_needed = int(sample_size / avg_rows_per_group)
            if sample_size > 0:
                # A request smaller than one average group would truncate to zero
                # groups and silently yield an empty frame.
                n_groups_needed = max(1, n_groups_needed)

            if n_groups_needed < len(groups):
                rng = np.random.RandomState(random_state)
                selected_groups = rng.choice(groups, size=n_groups_needed, replace=False)

                current_df = df[df["prompt"].isin(selected_groups)].copy()
        else:
            # Nothing to subsample: the request covers the whole frame.
            print(f"WARNING: The requested sample size ({sample_size}) is not smaller than "
                  f"the size of the Dataframe ({len(df)}); using all rows.")

    if test_size > 0:
        splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
        train_val_idx, test_idx = next(splitter.split(current_df, groups=current_df["prompt"]))

        train_val_df = current_df.iloc[train_val_idx]
        test_df = current_df.iloc[test_idx].copy()
    else:
        train_val_df = current_df
        test_df = pd.DataFrame(columns=df.columns)

    if val_size > 0:
        # val_size refers to the full data; rescale it to the part left after the test split.
        relative_val_size = val_size / (1 - test_size)
        if relative_val_size >= 1.0:
             raise ValueError("ERROR: The sum of test and val size reaches or exceeds 1.0!")

        splitter = GroupShuffleSplit(n_splits=1, test_size=relative_val_size, random_state=random_state)
        train_idx, val_idx = next(splitter.split(train_val_df, groups=train_val_df["prompt"]))

        train_df = train_val_df.iloc[train_idx].copy()
        val_df = train_val_df.iloc[val_idx].copy()
    else:
        train_df = train_val_df.copy()
        val_df = pd.DataFrame(columns=df.columns)

    return train_df, val_df, test_df


# ------------------------------------------------------------------------------------------------------------------------------------
# Fine-tuning pipeline for CodeT5
# ------------------------------------------------------------------------------------------------------------------------------------
def RUN_fine_tuning(INPUT_TEXT, TARGET_COLUMN, MAX_TOKEN, CLASSES, df, VAL_SIZE=0.0, SEED=42):
    """Fine-tune the CodeT5 encoder classifier on the selected classes.

    Args:
        INPUT_TEXT: Name of the column holding the source code to tokenize.
        TARGET_COLUMN: Name of the column holding the class names.
        MAX_TOKEN: Maximum sequence length; longer inputs are truncated.
        CLASSES: Values of ``TARGET_COLUMN`` to keep; all other rows are dropped.
        df: DataFrame with ``INPUT_TEXT``, ``TARGET_COLUMN`` and ``prompt`` columns.
        VAL_SIZE: Fraction of the data reserved as a validation split. With the
            default ``0.0`` the per-epoch evaluation and the best-checkpoint
            selection run on the test split (original behaviour), which makes
            the reported test metrics optimistic. A value > 0 moves the
            selection to a separate validation split.
        SEED: Seed set before the model is built and passed to the trainer.

    Returns:
        ``(trainer, encoder, tokenizer, test_ds)`` where ``encoder`` is the fitted
        ``LabelEncoder`` and ``test_ds`` the tokenized test dataset.

    Raises:
        ValueError: If no row of ``df`` belongs to ``CLASSES``.
    """
    # Seed before the classifier is created: its head is randomly initialised at
    # construction time, before the Trainer exists.
    transformers.set_seed(SEED)

    filtered_df = df[df[TARGET_COLUMN].isin(CLASSES)].copy()
    if filtered_df.empty:
        raise ValueError(f"No rows in column '{TARGET_COLUMN}' match the requested classes: {CLASSES}")
    encoder = LabelEncoder()
    filtered_df["labels"] = encoder.fit_transform(filtered_df[TARGET_COLUMN])
    num_labels = len(encoder.classes_)
    print("Number of labels:", num_labels)

    train_df, val_df, test_df = get_dataframes(filtered_df, val_size=VAL_SIZE, test_size=0.2)

    tokenizer = load_tokenizer()
    encoder_model = load_model()
    model = CodeT5_Classifier(encoder_model, hidden_dim=encoder_model.shared.embedding_dim, num_labels=num_labels)

    def tokenize_function(batch):
        return tokenizer(batch[INPUT_TEXT], truncation=True, max_length=MAX_TOKEN, add_special_tokens=True)

    def build_dataset(frame, desc):
        # Only the text and the label are needed downstream; leaving the other
        # dataframe columns out keeps the Arrow tables small.
        ds = Dataset.from_pandas(frame[[INPUT_TEXT, "labels"]], preserve_index=False)
        ds = ds.map(tokenize_function, batched=True, num_proc=4, desc=desc)
        ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
        return ds

    train_ds = build_dataset(train_df, "Tokenizing train set")
    test_ds = build_dataset(test_df, "Tokenizing test set")
    # Dataset used for per-epoch evaluation and best-checkpoint selection.
    eval_ds = build_dataset(val_df, "Tokenizing validation set") if VAL_SIZE > 0 else test_ds

    # Pads each batch only up to its longest sequence (rounded up to a multiple of 8).
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

    # NOTE(review): load_model() already loads the encoder weights in bfloat16 and
    # bf16=True is enabled here as well - confirm that updating bf16 parameters
    # directly (no fp32 master copy) is intended.
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=2,
        eval_accumulation_steps=16,
        bf16=True, fp16=False,
        optim="adamw_torch_fused",
        learning_rate=2e-5,
        warmup_ratio=0.06,
        lr_scheduler_type="cosine_with_restarts",
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        load_best_model_at_end=True,
        label_smoothing_factor=0.05,
        max_grad_norm=1.0,
        metric_for_best_model="accuracy",
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=100,
        report_to="none",
        dataloader_num_workers=2,
        dataloader_pin_memory=True,
        save_safetensors=False,
        seed=SEED,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        compute_metrics=COMPUTE_metrics,
    )

    trainer.train()
    # The final evaluation always runs on the test split, whichever split was
    # used for checkpoint selection.
    trainer.evaluate(eval_dataset=test_ds)

    return trainer, encoder, tokenizer, test_ds


In [None]:

# ------------------------------------------------------------------------------------------------------------------------------------
# Run training (CodeT5)
# ------------------------------------------------------------------------------------------------------------------------------------
# Column names come from the flattened dataset: nested JSON keys are joined with ".".
INPUT_TEXT = "js_original.js_code"   # column holding the JavaScript source
TARGET_COLUMN = "model_name"         # column holding the name of the generating LLM
MAX_TOKEN = 512                      # inputs longer than this are truncated


# 5-class
TARGET_MODELS = [
    "gpt-4o",
    "gpt-4o-mini",
    "gpt-5-mini",
    "gpt-5-nano",
    "gpt-oss-120b",
]

# ------- MODEL config parameters ----------------
CLASS_num_lables = f"{len(TARGET_MODELS)}-class_family_"

print("\n===== Training CodeT5 (Salesforce/codet5p-770m) =====")
trainer, encoder, tokenizer, test_ds = RUN_fine_tuning(
    INPUT_TEXT, TARGET_COLUMN, MAX_TOKEN, TARGET_MODELS, df
)

# Plot the confusion matrix on the test split and store it as a PDF.
cm_path = "CodeT5_confusion_matrix.pdf"
SAVE_confusion_matrix(trainer, test_ds, encoder, out_path=cm_path)


print("===== Finished CodeT5 Training =====")


===== Training CodeT5 (Salesforce/codet5p-770m) =====
Number of labels: 5
Loading CodeT5 model without decocder layers...
Modael loaded.


Tokenizing train set (num_proc=4):   0%|          | 0/9990 [00:00<?, ? examples/s]

Tokenizing test set (num_proc=4):   0%|          | 0/2510 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,2.4931,0.607992,0.847809,0.847926,0.849086,0.847809
2,1.0558,0.487133,0.888446,0.888766,0.890452,0.888446
3,0.886,0.448531,0.901992,0.902281,0.902788,0.901992
4,0.8143,0.436591,0.90757,0.907808,0.908236,0.90757
5,0.7832,0.437368,0.913147,0.913264,0.91428,0.913147
6,0.7599,0.43393,0.909562,0.909627,0.910439,0.909562
7,0.7516,0.434335,0.910359,0.910529,0.911312,0.910359
8,0.7517,0.433865,0.90996,0.910118,0.910875,0.90996
9,0.7503,0.433813,0.910359,0.910499,0.91114,0.910359
