In [None]:
# !pip install bitsandbytes

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the data paths used in the visible cells live directly under
# /content/, not under /content/drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# !pip install datasets

In [None]:
import pandas as pd

In [None]:
# Fetch the Hugging Face access token stored under the key 'HF_TOKEN' via
# google.colab.userdata; it is passed to the from_pretrained calls below.
from google.colab import userdata
token = userdata.get('HF_TOKEN')
# Do not echo `token` as a cell output: the value would be saved in the notebook.

In [None]:
# Read the pre-processed CSV, discard its first four rows and renumber the
# remaining rows from 0 so later positional access starts at the kept data.
df = (
    pd.read_csv("/content/pre-processed_reasoning.csv")
    .iloc[4:]
    .reset_index(drop=True)
)
df

In [None]:
import pandas as pd
import json

# Collects the prompt/completion records produced by the row loop below.
formatted_data = list()

def format_output(skills):
    """Render skill entries as a plain-text bullet list.

    Args:
        skills: Iterable of mappings, each providing a "skill" and a "reason" key.

    Returns:
        One "- Skill: ..." line followed by an indented "Reason: ..." line per
        entry, entries separated by newlines; an empty string for no entries.
    """
    return "\n".join(
        f"- Skill: {entry['skill']}\n  Reason: {entry['reason']}"
        for entry in skills
    )

def _build_training_example(job_description, output_str):
    """Build one ``{"prompt", "completion"}`` record from a row's two fields.

    Args:
        job_description: Value of the row's "input" column.
        output_str: Value of the row's "output" column. Expected to be a JSON
            string that decodes to a non-empty list whose first element is an
            object; that object's optional "thinking" mapping may carry
            "step_2" and "step_3" mappings of skill -> reason.

    Returns:
        dict with the assembled "prompt" text and the formatted "completion".

    Raises:
        ValueError: The output is missing, is not valid JSON
            (json.JSONDecodeError is a ValueError subclass), or does not have
            the structure described above.
        TypeError: ``output_str`` is of a type json.loads cannot decode.
    """
    if pd.isna(output_str):
        raise ValueError("Empty output")

    record_list = json.loads(output_str)

    # Validate the decoded shape up front so a malformed row is reported as a
    # ValueError (and skipped by the caller) instead of escaping as an
    # IndexError / KeyError / AttributeError and aborting the whole conversion.
    if not isinstance(record_list, list) or not record_list:
        raise ValueError("Expected a non-empty JSON list of records")
    record = record_list[0]  # only the first record of each row is used
    if not isinstance(record, dict):
        raise ValueError("First record is not a JSON object")

    thinking = record.get("thinking", {})
    if not isinstance(thinking, dict):
        raise ValueError("'thinking' is not a JSON object")
    step_2 = thinking.get("step_2", {})
    step_3 = thinking.get("step_3", {})
    if not isinstance(step_2, dict) or not isinstance(step_3, dict):
        raise ValueError("'step_2' / 'step_3' must be JSON objects")

    # Include full step-by-step reasoning in the prompt
    thinking_str = json.dumps(thinking, indent=2)

    # step_2 entries come first, then step_3 entries, each as skill + reason.
    all_skills = [
        {"skill": skill, "reason": reason}
        for step in (step_2, step_3)
        for skill, reason in step.items()
    ]

    prompt = (
        "You are an AI assistant that extracts skills from job descriptions using chain-of-thought reasoning.\n"
        "Think step-by-step and provide both the skills and your reasoning.\n\n"
        f"Job Description:\n{job_description}\n\n"
        f"Thinking:\n{thinking_str}\n\n"
        "Answer:"
    )

    return {
        "prompt": prompt,
        "completion": format_output(all_skills).strip(),
    }


# Convert every row; rows that cannot be converted are logged and skipped so a
# single bad record does not stop the run.
for index, row in df.iterrows():
    try:
        formatted_data.append(_build_training_example(row["input"], row["output"]))
    except (json.JSONDecodeError, TypeError, ValueError) as e:
        print(f"Row {index} caused error: {e}")
        print(f"Raw output:\n{row['output']}\n{'='*60}")
        continue

# Write the collected records to disk, one JSON object per line (JSONL).
with open("formatted_data.jsonl", "w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in formatted_data)

print("JSONL conversion complete.")


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

# Hub id of the base checkpoint; the same name is used when the model is loaded.
model_name = "microsoft/phi-2"

# Load the tokenizer that matches the checkpoint, authenticating with `token`.
# NOTE(review): trust_remote_code=True allows Python code shipped in the
# checkpoint repository to run locally — confirm it is still required.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=token)

In [None]:
# Load the base causal LM with 4-bit weight quantization.
# The quantization settings are passed as a BitsAndBytesConfig through
# `quantization_config`; handing `load_in_4bit=True` straight to
# from_pretrained is a deprecated spelling of the same request.
from transformers import BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",  # let the loader decide where to place the weights
    trust_remote_code=True,
    token=token,
)

In [None]:
# LoRA adapter hyper-parameters for causal-LM fine-tuning.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Adapters are attached only to modules whose names match these entries.
    target_modules=["q_proj", "v_proj"],
    r=8,              # rank of the low-rank update matrices
    lora_alpha=16,    # scaling numerator applied to the adapter output
    lora_dropout=0.05,
    bias="none",      # leave bias terms untouched
)

In [None]:
# Prepare the quantized base model for training *before* attaching adapters,
# as the PEFT quantization guide prescribes for k-bit models, then wrap it so
# only the LoRA parameters described by `lora_config` are trained.
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [None]:
from datasets import Dataset

# Read back the JSONL written by the conversion cell. The path is the same
# relative one the writer used, so the two stay in sync regardless of the
# working directory (previously this read was pinned to /content/).
df = pd.read_json("formatted_data.jsonl", lines=True)

# Wrap the frame as a Hugging Face Dataset for the tokenization/training cells.
dataset = Dataset.from_pandas(df)

In [None]:
# Display the Dataset built in the previous cell as a quick sanity check.
dataset

In [None]:
from datasets import Dataset
import pandas as pd

# Load JSONL into pandas dataframe
df = pd.read_json("formatted_data.jsonl", lines=True)

# Convert to HuggingFace dataset
dataset = Dataset.from_pandas(df)

# Make sure the tokenizer can pad: prefer reusing EOS, otherwise register a
# dedicated [PAD] token.
if tokenizer.pad_token is None:
    if tokenizer.eos_token:
        tokenizer.pad_token = tokenizer.eos_token
    else:
        # add_special_tokens also sets tokenizer.pad_token, so no separate
        # assignment is needed afterwards.
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        # A newly added token gets an id past the old vocabulary; grow the
        # embedding matrix when it is too small so that id is a valid row.
        # The size check avoids shrinking a matrix that is already larger
        # than the tokenizer's vocabulary.
        if len(tokenizer) > model.get_input_embeddings().num_embeddings:
            model.resize_token_embeddings(len(tokenizer))

def tokenize_function(examples):
    """Tokenize a batch of prompt/completion pairs as single sequences.

    Args:
        examples: Batch mapping with parallel "prompt" and "completion" lists.

    Returns:
        The tokenizer's output for each prompt concatenated with its
        completion (truncated and padded to 512 tokens), with the raw
        "prompt" and "completion" lists stored on it under the same keys.
    """
    prompts = examples["prompt"]
    completions = examples["completion"]
    full_texts = [prompt + completion for prompt, completion in zip(prompts, completions)]

    encoded = tokenizer(
        full_texts,
        padding="max_length",
        max_length=512,
        truncation=True,
    )
    # Keep the source text next to the token ids.
    encoded["prompt"] = prompts
    encoded["completion"] = completions
    return encoded

# Tokenize the whole dataset in batches.
# NOTE(review): `tokenized_dataset` is not referenced by any later visible
# cell (training uses tokenized_train / tokenized_eval) — confirm it is needed.
tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
# tokenizer.pad_token = tokenizer.eos_token
# tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# tokenized_dataset[0]

In [None]:
from transformers import TrainingArguments

# Trainer hyper-parameters.
# No `save_steps` is given: checkpoint cadence is governed by
# save_strategy="epoch", under which a step interval is not consulted, so
# carrying one here only suggested a schedule that never applied.
training_args = TrainingArguments(
    output_dir="./phi2-lora-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,       # log the training loss at every optimizer step
    report_to="none",      # no external experiment tracker
    eval_strategy="epoch",  # evaluate once per epoch ...
    save_strategy="epoch",  # ... and checkpoint on the same schedule
    save_total_limit=2,     # keep at most two checkpoints on disk
)

In [None]:
# from transformers import TrainingArguments

# print(TrainingArguments.__module__)
# print(dir(TrainingArguments))

In [None]:
# Hold out 10% of the examples for evaluation; the fixed seed keeps the
# partition the same from run to run.
dataset_split = dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of prompt/completion pairs as single sequences.

    Sequences are truncated to ``max_length`` tokens but deliberately NOT
    padded here: the data collator handed to the Trainer pads each batch, so
    padding every example to the full length at this stage only adds tokens
    that carry no training signal.

    Args:
        examples: Batch mapping with parallel "prompt" and "completion" lists.
        max_length: Upper bound on tokens kept per sequence (default 512).

    Returns:
        The tokenizer's output for each prompt concatenated with its completion.
    """
    # NOTE(review): the completion is the tail of each text, so it is what gets
    # cut if truncation happens on the right — confirm prompt + completion fit
    # within max_length for this data.
    inputs = [p + c for p, c in zip(examples["prompt"], examples["completion"])]
    return tokenizer(
        inputs,
        truncation=True,
        max_length=max_length,
    )

from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the training split first, then the evaluation split, in batches.
tokenized_train, tokenized_eval = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

# mlm=False selects causal-LM collation rather than masked-LM.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Bundle model, hyper-parameters, data and collation into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
    eval_dataset=tokenized_eval,
    train_dataset=tokenized_train,
)


In [None]:
# Run fine-tuning with the Trainer configured in the preceding cell.
trainer.train()

In [None]:
import math
# Evaluate on the held-out split and report the loss together with its
# exponential (perplexity).
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
print(f"Validation loss: {eval_loss:.4f}")
print(f"Validation perplexity: {math.exp(eval_loss):.2f}")