In [1]:
!pip install -U transformers
!pip install -U bitsandbytes
!pip install evaluate
!pip install imbalanced-learn scikit-learn --upgrade
!pip install wandb

Collecting scikit-learn
  Using cached scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)


In [2]:

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
import pandas as pd
from datasets import Dataset
import os

os.environ["WANDB_SILENT"] = "True"

import wandb
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

2025-06-10 20:22:04.455263: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749586924.480965     824 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749586924.488794     824 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Model page: https://huggingface.co/meta-llama/Llama-3.1-8B

⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/meta-llama/Llama-3.1-8B)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [3]:
from google.colab import userdata
wandb_key = userdata.get('wandb')
hf = userdata.get('huggingface')



In [4]:
from huggingface_hub import login
login(token=hf)
# Use the imported wandb module and the correct variable for the key
import wandb
wandb.login(key=wandb_key)

True

## Tokenizing

In [5]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

In [6]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")


from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [7]:
import polars as pl

df = pl.read_parquet('hf://datasets/ahmadreza13/human-vs-Ai-generated-dataset/data/train-*.parquet')

### Litle EDA

In [8]:
df1 = df.clone()

In [9]:
df1

data,generated,model
str,i64,str
"""def add(a, b):  """"""This fun…",1,"""claude"""
"""Once upon a time in a faraway …",1,"""claude"""
"""I understand that video games …",1,"""claude"""
"""(x - 3)(x - 2) = 0 Therefore, …",1,"""claude"""
"""The Importance of Clear and Co…",1,"""claude"""
…,…,…
"""Stony Creek Mills, Pennsylvani…",0,"""wikipedia"""
"""Stone labyrinths of Bolshoi Za…",0,"""wikipedia"""
"""Scott Bailey (curler) Scott B…",0,"""wikipedia"""
"""Monmouth-Roseville High School…",0,"""wikipedia"""


In [10]:
df1['generated'].value_counts()

generated,count
i64,u32
0,2092358
1,1521889


Vamos separar 250 mil linhas de textos gerados por IA para pré-treinar o modelo

In [11]:
ai_generated = df1.filter(df1['generated'] == 1)

In [12]:
ai_generated

data,generated,model
str,i64,str
"""def add(a, b):  """"""This fun…",1,"""claude"""
"""Once upon a time in a faraway …",1,"""claude"""
"""I understand that video games …",1,"""claude"""
"""(x - 3)(x - 2) = 0 Therefore, …",1,"""claude"""
"""The Importance of Clear and Co…",1,"""claude"""
…,…,…
"""The error message is indicatin…",1,"""GPT4"""
"""There are a few things you can…",1,"""GPT4"""
"""You can use the <code>proc uni…",1,"""GPT4"""
"""As an AI language model, I don…",1,"""GPT4"""


In [13]:
lora_train = ai_generated[:20000]
lora_test = ai_generated[20000:25000]

Para evitar vazamento de dados, vamos no pos-treino retirar manualmente os dados filtrados e concatena-los

In [14]:
lora_train = pl.DataFrame(lora_train['data'])
lora_test = pl.DataFrame(lora_test['data'])

In [15]:
lora_train

data
str
"""def add(a, b):  """"""This fun…"
"""Once upon a time in a faraway …"
"""I understand that video games …"
"""(x - 3)(x - 2) = 0 Therefore, …"
"""The Importance of Clear and Co…"
…
"""The overarching themes in Roma…"
"""To thoroughly analyze this com…"
"""The warm summer breeze carried…"
"""The emergence of language and …"


## Processing

In [16]:
lora_train = Dataset.from_polars(lora_train)
lora_test = Dataset.from_polars(lora_test)

# Redefine preprocess_function to only tokenize and not add padding or labels
def preprocess_function(examples):
    # Tokenize the texts. The data collator will handle padding and labels.
    tokenized_inputs = tokenizer(examples['data'], truncation=True, padding=True)
    tokenized_inputs['labels'] = tokenized_inputs['input_ids'].copy()
    return tokenized_inputs

pre_train = lora_train.map(
    preprocess_function,
    batched=True,
    remove_columns=['data'] # Remove the original 'text' column

)

pre_test = lora_test.map(
    preprocess_function,
    batched=True,
    remove_columns=['data'] # Remove the original 'text' column
)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [17]:
total_tokens = sum(len(x) for x in pre_train["input_ids"])
print(f"Total de tokens: {total_tokens}")


Total de tokens: 16524000


## Pre-Training with LoRA

In [18]:
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Define the quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, # Specify 8-bit loading within the config
    bnb_4bit_compute_dtype=torch.float16 # Optional: specify compute dtype if using 4-bit, good practice to include
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    quantization_config=bnb_config, # Pass the BitsAndBytesConfig object
    device_map='auto',
)

In [19]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear8bitLt(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear8bitLt(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear8bitLt(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMS

In [20]:
model.config.pad_token_id = tokenizer.pad_token_id

In [21]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    task_type = "CAUSAL_LM",
    r=4,
    lora_alpha=16,
    target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    modules_to_save=["causal"],
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 2,818,048 || all params: 1,238,632,448 || trainable%: 0.2275


In [22]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): l

In [23]:
os.environ["WANDB_PROJECT"] = "SYA-AI"  # name your W&B project

training_args = TrainingArguments(
    output_dir="LoRA-PreTraining_2",
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.05,
    save_strategy="steps",
    eval_strategy="steps",
    eval_steps = 250,
    save_steps = 10000,
    load_best_model_at_end=False,
    eval_accumulation_steps=1,
    push_to_hub=False,
    report_to="wandb",
    fp16= False,
    logging_steps=10,
    label_names=['labels'],
    remove_unused_columns=False,
    max_grad_norm=1.0
)

In [24]:
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=pre_train,
    eval_dataset = pre_test,
    data_collator=data_collator)

trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

## Training for Classification

In [None]:
trainer.

# Fontes

LoRA: https://huggingface.co/docs/peft/v0.15.0/en/task_guides/lora_based_methods

Dataset de Pré-treino: https://huggingface.co/datasets/nvidia/OpenCodeReasoning?library=polars

Como funciona LoRA: https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora

Como fine-tunar um CLM (Causal Language Modeling) : https://huggingface.co/docs/transformers/tasks/language_modeling