In [None]:
#Installing libraries
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3  peft trl triton
!pip install --no-deps unsloth
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer


In [None]:
#Loading the unsloth base model
import os

from unsloth import FastLanguageModel
import torch
from huggingface_hub import login

# Authenticate with the Hugging Face Hub without pasting a secret into the notebook:
# the token is taken from the HF_TOKEN environment variable; when it is unset or
# empty, `login` is given None and is expected to ask for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

# Sequence length passed to the model loader below.
max_seq_length = 1024

# Load the base checkpoint in float16 with 4-bit loading disabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3.2-3b",
    max_seq_length=max_seq_length,
    dtype=torch.float16,
    load_in_4bit=False,
    low_cpu_mem_usage=True,
)

In [None]:

# Names of the projection modules handed to LoRA as adaptation targets.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded base model with LoRA adapters (rank 16, alpha 16, no dropout,
# no bias terms) using Unsloth's gradient-checkpointing mode.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    loftq_config=None,
)

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict


def load_balanced_split(path, frac):
    """Load a TSV file and draw a class-balanced random subset of its rows.

    The target subset size is ``int(len(file) * frac)`` rows, split evenly
    across the distinct values of the ``label`` column. A class with fewer
    rows than its share contributes all of its rows, so the result can be
    smaller than the target.

    Args:
        path: Path to a tab-separated file. Its first column is read as the
            row index (and dropped from the result); it must contain a
            ``label`` column.
        frac: Fraction of the file's rows to aim for in total.

    Returns:
        A ``datasets.Dataset`` built from the sampled rows, without the index.

    Raises:
        ValueError: If the file has no labelled rows to sample from.
    """
    df = pd.read_csv(path, sep="\t", index_col=0)

    n_classes = df["label"].nunique()
    if n_classes == 0:
        # Without this guard the per-class quota below divides by zero.
        raise ValueError(f"{path!r} contains no labelled rows to sample from")

    samples_per_class = int(len(df) * frac) // n_classes

    # Sample each class explicitly instead of via groupby(...).apply(...):
    # apply() on a frame that still holds the grouping column is deprecated and
    # newer pandas releases drop that column, which would lose 'label'.
    # The fixed random_state keeps the draw reproducible across runs.
    sampled_groups = [
        group.sample(n=min(samples_per_class, len(group)), random_state=42)
        for _, group in df.groupby("label")
    ]
    balanced = pd.concat(sampled_groups)

    return Dataset.from_pandas(balanced, preserve_index=False)

"Upload your files in the format tweet | label (Human/Machine) "
# Maps each split name to the TSV file it is loaded from.
data_splits = {
    "train": "train.tsv",
    "validation": "dev.tsv",
}

# Build one class-balanced subset per split, aiming for 60% of each file's rows.
raw_datasets = DatasetDict({
    split: load_balanced_split(path, frac=0.6)
    for split, path in data_splits.items()
})

print(raw_datasets)

In [None]:

# Integer id assigned to each class name.
label2id = {"Human": 0, "Machine": 1}

def remap_labels(example):
    """Replace the class name in ``example["label"]`` with its integer id.

    The example is updated in place and returned. A label that is already one
    of the integer ids is left unchanged, so applying the mapping a second
    time (for instance when the cell is re-run) does not fail.

    Args:
        example: Mapping with a ``"label"`` entry holding a class name from
            ``label2id`` or an id that was already mapped.

    Returns:
        The same ``example`` object with an integer ``"label"``.

    Raises:
        KeyError: If the label is neither a known class name nor a known id.
    """
    label = example["label"]
    # Already converted: the ids are the values of label2id.
    if label in label2id.values():
        return example
    example["label"] = label2id[label]
    return example

# Convert the label of every example in every split via remap_labels.
raw_datasets = raw_datasets.map(function=remap_labels)

In [None]:

def format_prompts(data):
    """Wrap every tweet of a batch in the classification instruction prompt.

    Args:
        data: Batch mapping whose ``"tweet"`` entry is an iterable of tweets.

    Returns:
        A dict with a single ``"text"`` key holding one prompt per tweet, in
        the same order. Each prompt ends right after ``Classification:`` and a
        newline.
    """

    def build_prompt(tweet):
        # The template is a raw multi-line literal, so it stays flush left.
        return f'''Below is a tweet. Classify it as either Human or Machine generated.
Tweet:
{tweet}

Classification:
'''

    return {"text": [build_prompt(tweet) for tweet in data["tweet"]]}

# Add a "text" column containing the prompt for each tweet, processed in batches.
formatted_datasets = raw_datasets.map(
    function=format_prompts,
    batched=True,
)

In [None]:
def tokenise_data(examples):
    """Tokenise a batch of prompts completed with their expected answer.

    Each prompt in ``examples["text"]`` is extended with the answer word
    ("Human" when the matching label is 0, "Machine" otherwise) followed by
    the tokenizer's end-of-sequence token, then tokenised with truncation at
    512 tokens and no padding.

    Args:
        examples: Batch mapping with parallel ``"text"`` and ``"label"`` lists.

    Returns:
        The tokenizer output with an added ``"labels"`` entry that is a copy
        of ``"input_ids"``.
    """
    eos = tokenizer.eos_token
    completed_prompts = [
        prompt + ("Human" if label_id == 0 else "Machine") + eos
        for prompt, label_id in zip(examples["text"], examples["label"])
    ]

    encoded = tokenizer(
        completed_prompts,
        truncation=True,
        max_length=512,
        padding=False,
    )

    # The training targets start out identical to the input ids.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Tokenise every split in batches and drop the source columns, leaving only
# what tokenise_data returns.
columns_to_drop = ["tweet", "label", "text"]
tokenised_datasets = formatted_datasets.map(
    function=tokenise_data,
    batched=True,
    remove_columns=columns_to_drop,
)


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForLanguageModeling


# Collator for causal language modelling (mlm=False disables masked-LM mode).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Supervised fine-tuning trainer over the tokenised splits.
# The datasets come from `tokenised_datasets`, the variable created by the
# tokenisation cell; the previous name `tokenized_dsets` was never defined.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenised_datasets["train"],
    eval_dataset=tokenised_datasets["validation"],
    dataset_text_field="text",
    max_seq_length=512,
    data_collator=data_collator,
    args=TrainingArguments(
        # Batch of 1 with 4 accumulation steps per optimizer update.
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        # Evaluate every 50 steps and checkpoint every 100 steps.
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=100,
        output_dir="./results",
        optim="adamw_8bit",
        dataloader_pin_memory=False,
        # The datasets are already tokenised; keep their columns as they are.
        remove_unused_columns=False,
        report_to="none",
    ),
)


In [None]:
# Run fine-tuning with the `trainer` built in the previous cell.
trainer.train()

In [None]:
from huggingface_hub import login

# Authenticate for the upload without hard-coding a secret: use HF_TOKEN from the
# environment, or let `login` ask for the token when it is unset or empty.
login(token=os.environ.get("HF_TOKEN") or None)

# Target repository on the Hugging Face Hub; replace the placeholder before running.
hub_repo_id = "your personal repository"

# Upload the fine-tuned model and the tokenizer to the same private repository.
# They are pushed separately: `push_to_hub` takes the repository id as its first
# argument and does not accept a tokenizer as a second positional argument.
model.push_to_hub(hub_repo_id, private=True)
tokenizer.push_to_hub(hub_repo_id, private=True)