In [1]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
%pip install -q transformers datasets accelerate peft bitsandbytes

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from datasets import load_dataset

dataset = load_dataset("vmal/jobs_dataset")

print(dataset['train'][0])

  from .autonotebook import tqdm as notebook_tqdm


{'source_text': 'Generative AI Specialist, Blackbelt, Google Cloud share link link Copy link email email Email a friend corporate_fare Google place Bengaluru, Karnataka, India bar_chart bar_chart Mid Mid Mid Experience driving progress, solving problems, and mentoring more junior team members; deeper expertise and applied knowledge within relevant area. Apply share link link Copy link email email Email a friend Minimum qualifications: Preferred qualifications: About the job As a Generative Artificial Intelligence (AI) Specialist, you will work with Product Development and Technical Sales teams as an Generative AI subject matter expert to bring Google Cloud AI products to customers and partners. In this role, you will help prospective customers and partners understand the power of Google AI, explain technical features, help customers design architectures, build AI powered applications, and problem-solve any potential roadblocks. You will also have the opportunity to help customers lever

In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['source_text', 'target_text'],
        num_rows: 5579
    })
})


In [5]:
dataset = dataset['train'].train_test_split(test_size=0.1)

In [6]:
def format(example):
  return {
        "input": f"Extract structured JSON from this job description page content:\n{example['source_text']}",
        "output": example["target_text"]  # Already a JSON string
    }

formatted = dataset.map(format)

Map: 100%|████████████████████████████████████████████████████████████████| 5021/5021 [00:00<00:00, 8683.58 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 558/558 [00:00<00:00, 6785.07 examples/s]


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

def tokenize(batch):
    input = tokenizer(batch["input"], padding="max_length", truncation=True, max_length=1024)
    output = tokenizer(batch["output"], padding="max_length", truncation=True, max_length=1024)
    input["labels"] = output["input_ids"]
    return input

tokenized = formatted.map(tokenize, batched=True)

Map: 100%|█████████████████████████████████████████████████████████████████| 5021/5021 [00:05<00:00, 928.10 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████| 558/558 [00:00<00:00, 952.15 examples/s]


In [8]:
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_model, LoraConfig, TaskType

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451


In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,
    gradient_accumulation_steps=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
)


In [None]:
trainer.train()

Step,Training Loss
500,0.0
1000,0.0


In [None]:
dataset['test'][1]['source_text']

In [None]:
input_text = dataset['test'][1]['source_text']

inputs = tokenizer(f"Extract structured JSON from this job description:\n{input_text}",
                   return_tensors="pt").input_ids.to(model.device)

outputs = model.generate(input_ids=inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
model.save_pretrained("flan-jd-json")
tokenizer.save_pretrained("flan-jd-json")