In [6]:
import os
import numpy as np
from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)
from evaluate import load

# ✅ Settings
model_name = "t5-small"   # try "flan-t5-base" if outputs collapse
train_file = "C:/Users/WIN11/Intelligent-Resume-Feedback-System/data/train_T5.jsonl"
valid_file = "C:/Users/WIN11/Intelligent-Resume-Feedback-System/data/valid_T5.jsonl"
output_dir = "./t5_skill_feedback"

# ✅ Sequence-length limits (in tokens)
# preprocess() and compute_metrics() read these two names as globals, but the
# notebook never defined them, so a fresh "Restart & Run All" stopped with a
# NameError. They are declared here with the rest of the configuration.
# NOTE(review): the values used for the recorded run are not stored in the
# notebook. 128 for targets mirrors the max_length used by the inference
# cells; 256 for inputs is a guess for the short prompts — confirm both.
max_input_len = 256
max_target_len = 128

# ✅ Training hyperparameters
epochs = 5
batch_size = 4
grad_accum = 4
lr = 3e-4
weight_decay = 0.01
warmup_ratio = 0.06
patience = 3
seed = 42
fp16 = True
gradient_checkpointing = True
eval_every_steps = 500
save_total_limit = 2
logging_steps = 50


# 1) Tokenizer and model are both initialised from the `model_name` checkpoint
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Gradient checkpointing is switched on only when the settings flag asks for it
if gradient_checkpointing:
    model.gradient_checkpointing_enable()

# 2) Read both files with the "json" loader; they are exposed as the
#    "train" and "validation" splits used further down.
dataset = load_dataset("json", data_files=dict(train=train_file, validation=valid_file))

# 3) Preprocess
def preprocess(examples):
    """Tokenize one batch of examples into encoder inputs and target labels.

    Reads the "input" and "output" columns of ``examples`` and truncates them
    to ``max_input_len`` and ``max_target_len`` tokens respectively (both are
    module-level settings, as is ``tokenizer``).

    Returns:
        The tokenizer output for the inputs, extended with a "labels" entry
        that holds the ``input_ids`` of the tokenized targets.
    """
    source_texts = examples["input"]
    target_texts = examples["output"]

    features = tokenizer(source_texts, truncation=True, max_length=max_input_len)
    target_encoding = tokenizer(target_texts, truncation=True, max_length=max_target_len)

    # The target token ids become the labels the model is trained against
    features["labels"] = target_encoding["input_ids"]
    return features

# Run preprocess() over every split in batches; the original columns (listed
# from the train split) are removed so only what preprocess() returns is kept.
tokenized = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)

# 4) Data collator (handed to the Trainer below to assemble batches)
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# 5) Metrics
rouge = load("rouge")
bleu = load("bleu")

def compute_metrics(eval_pred):
    """Score one evaluation pass with ROUGE and BLEU and print a live sample.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be
            token ids, raw logits of shape (batch, seq, vocab), or a tuple
            whose first element is the logits; all three are reduced to
            token ids before decoding.

    Returns:
        dict with the keys "rouge1", "rougeL" and "bleu".

    Side effects:
        Prints one generated sample for a fixed prompt, using the
        module-level ``model``, ``tokenizer`` and ``max_target_len``.

    NOTE(review): the Trainer in this notebook is built without
    ``compute_metrics=``, so this function is not called during training
    (the recorded final metrics contain only ``eval_loss``). If it is wired
    in, be aware that a plain Trainer keeps the full logits for the whole
    eval set in memory unless ``preprocess_logits_for_metrics`` is used.
    """
    predictions, labels = eval_pred

    # Model outputs can arrive as (logits, extra_outputs...) — keep the logits
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    # Raw logits -> most likely token id per position
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)
    # -100 is a padding/ignore marker, not a vocabulary id; map it to the pad
    # token so batch_decode does not choke on it
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE + BLEU
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # The `evaluate` BLEU metric tokenizes internally: it expects plain strings
    # for predictions and a list of reference strings per prediction, not
    # pre-split token lists.
    has_pred_text = any(pred.strip() for pred in decoded_preds)
    has_ref_text = any(ref.strip() for ref in decoded_labels)
    if has_pred_text and has_ref_text:
        bleu_score = bleu.compute(
            predictions=decoded_preds,
            references=[[ref] for ref in decoded_labels],
        )["bleu"]
    else:
        # BLEU divides by the prediction/reference length; score empty output as 0
        bleu_score = 0.0

    # 🔎 Show a live sample after each eval
    print("\n--- Live Sample ---")
    sample_input = "Company: Google | Role: Data Scientist | Candidate Skills: Python, SQL, Pandas"
    inputs = tokenizer(sample_input, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=max_target_len)
    print("INPUT:", sample_input)
    print("OUTPUT:", tokenizer.decode(outputs[0], skip_special_tokens=True))
    print("--------------------\n")

    return {
        "rouge1": rouge_result["rouge1"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_score
    }

# 6) Training arguments
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# ✅ Optimized Training Arguments for 4GB GPU
# Tunable values are taken from the settings block instead of being repeated
# here as literals, so changing batch_size / grad_accum / fp16 /
# save_total_limit there actually changes the run.
training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",                    # evaluate once per epoch
    save_strategy="epoch",                    # write a checkpoint once per epoch
    save_total_limit=save_total_limit,        # cap on checkpoints kept on disk
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,   # small batch to fit VRAM
    per_device_eval_batch_size=1,             # eval one sample at a time
    gradient_accumulation_steps=grad_accum,   # effective batch = batch_size * grad_accum
    weight_decay=weight_decay,
    warmup_ratio=warmup_ratio,
    logging_steps=logging_steps,
    load_best_model_at_end=True,
    report_to=["none"],                       # or ["tensorboard"] if needed
    seed=seed,
    fp16=fp16,                                # mixed precision = less VRAM
    fp16_full_eval=fp16,                      # half precision for eval too, tied to the same flag
    dataloader_num_workers=0,                 # load data in the main process
)

# ✅ Early stopping (optional): registered only when `patience` is positive
callbacks = [EarlyStoppingCallback(early_stopping_patience=patience)] if patience > 0 else []

# ✅ Trainer
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in recent transformers releases and emits a FutureWarning that
# points at this call.
# NOTE(review): `processing_class` needs a transformers release that supports
# it (4.46+, presumably the version that produced the warning in this cell's
# output) — confirm against the pinned version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    callbacks=callbacks,
)

# 🚀 Train
# Runs the fine-tuning loop configured by `training_args` and `callbacks`.
trainer.train()

# ✅ After training, display validation results
# evaluate() is called without arguments, so it uses the eval_dataset the
# Trainer was constructed with.
metrics = trainer.evaluate()
print("\n📊 Final Evaluation Metrics:", metrics)

# Save model
# Model and tokenizer are written to the same directory so it can be reloaded
# with from_pretrained() by the inference cells. save_pretrained() is the last
# expression of the cell, so its return value (the written tokenizer file
# paths) is what the notebook displays as the cell output.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.6693,0.413191
2,0.3604,0.262749
3,0.293,0.219035
4,0.2555,0.1987
5,0.2428,0.19285


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



📊 Final Evaluation Metrics: {'eval_loss': 0.19284331798553467, 'eval_runtime': 23.5935, 'eval_samples_per_second': 17.717, 'eval_steps_per_second': 17.717, 'epoch': 5.0}


('./t5_skill_feedback\\tokenizer_config.json',
 './t5_skill_feedback\\special_tokens_map.json',
 './t5_skill_feedback\\tokenizer.json')

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# ✅ Reload the fine-tuned checkpoint (tokenizer + weights) from disk
model_path = "./t5_skill_feedback"   # change if saved elsewhere

# Prefer the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = T5TokenizerFast.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

# ✅ Function for inference
def generate_feedback(input_text, max_length=128):
    """Generate feedback text for a single prompt with the fine-tuned model.

    Args:
        input_text: Prompt string, tokenized as-is.
        max_length: Forwarded to ``model.generate`` as ``max_length``
            (default 128).

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed.
    """
    encoded_prompt = tokenizer(input_text, return_tensors="pt").to(device)

    # Beam search over 4 beams with early stopping
    decoding_options = dict(max_length=max_length, num_beams=4, early_stopping=True)
    generated = model.generate(**encoded_prompt, **decoding_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# ✅ Test samples
# Prompts follow the "Company: ... | Role: ... | Candidate Skills: ..." layout
# that the rest of the notebook uses for model input.
samples = [
    "Company: Google | Role: Data Scientist | Candidate Skills: Python, SQL, Pandas",
    "Company: Microsoft | Role: Backend Developer | Candidate Skills: Python, Django, SQL",
    "Company: Accenture | Role: Data Scientist | Candidate Skills: Numpy, TensorFlow, SQL, Pandas, Deep Learning"
]

# Print each prompt with the model's answer, separated by an 80-dash rule.
print("\n🔎 Testing Fine-Tuned Model\n")
for s in samples:
    print("Input:", s)
    print("Output:", generate_feedback(s))
    print("-" * 80)



🔎 Testing Fine-Tuned Model

Input: Company: Google | Role: Data Scientist | Candidate Skills: Python, SQL, Pandas
Output: Strong base with Python, SQL, Pandas. It would help to work on SQL, Machine Learning, Deep Learning, TensorFlow, PyTorch, Statistics, Data Visualization, Feature Engineering, Pandas, Numpy, Model Deployment, Spark, A/B Testing, Data Cleaning. Your profile aligns 13% with the role.
--------------------------------------------------------------------------------
Input: Company: Microsoft | Role: Backend Developer | Candidate Skills: Python, Django, SQL
Output: Strong base with Python, Django, SQL. It would help to work on Java, Spring Boot, SQL, Docker, Microservices, APIs, Python, System Design, NoSQL (MongoDB), GraphQL, Scalability, Kafka, Redis, CI/CD, Cloud Platforms. Your profile aligns 13% with the role.
--------------------------------------------------------------------------------
Input: Company: Accenture | Role: Data Scientist | Candidate Skills: Numpy, Te

In [12]:
import torch
from transformers import T5ForConditionalGeneration, T5TokenizerFast
import json

# ===============================
# 🔹 1. Load fine-tuned model
# ===============================
# NOTE(review): both paths are absolute and machine-specific; they must be
# edited before this cell can run anywhere else.
# NOTE(review): MODEL_DIR points under src/, while the training cell saves to
# the relative "./t5_skill_feedback" — presumably the same directory when the
# notebook is run from src/; confirm.
MODEL_DIR = r"C:/Users/WIN11/Intelligent-Resume-Feedback-System/src/t5_skill_feedback"
SKILL_DICT_FILE = r"C:/Users/WIN11/Intelligent-Resume-Feedback-System/data/skill_requirement_dataset.json"

# Use the GPU when available; tokenizer and model are loaded from MODEL_DIR and
# the model is moved to the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5TokenizerFast.from_pretrained(MODEL_DIR)
model = T5ForConditionalGeneration.from_pretrained(MODEL_DIR).to(device)

# ===============================
# 🔹 2. Load skill dictionary JSON
# ===============================
with open(SKILL_DICT_FILE, "r", encoding="utf-8") as f:
    skill_data = json.load(f)   # iterated below as a mapping: company -> {role -> skills}

# Flatten the nested mapping into a single "company|role" -> skills lookup.
# NOTE: the loop variables (company, roles, role, skills) stay bound as
# notebook globals after the loop finishes.
skill_dict = {}
for company, roles in skill_data.items():
    for role, skills in roles.items():
        skill_dict[f"{company}|{role}"] = skills

# ===============================
# 🔹 3. Generate raw T5 feedback
# ===============================
def generate_feedback(company, role, candidate_skills):
    """Ask the fine-tuned model for raw feedback text on a candidate profile.

    Args:
        company: Company name inserted into the prompt.
        role: Role name inserted into the prompt.
        candidate_skills: Iterable of skill strings, joined with ", ".

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed. Generation runs with ``do_sample=True``, so repeated calls
        may return different text.
    """
    input_text = f"Company: {company} | Role: {role} | Candidate Skills: {', '.join(candidate_skills)}"

    prompt_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
    prompt_ids = prompt_ids.to(device)

    generation_options = {
        "max_length": 128,
        "num_beams": 4,
        "do_sample": True,
        "temperature": 0.7,
    }
    generated = model.generate(prompt_ids, **generation_options)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# ===============================
# 🔹 4. Post-process feedback
# ===============================
def post_process_feedback(candidate_skills, role, raw_output, company=None):
    """Build rule-based feedback by comparing a candidate against the skill dictionary.

    The required skills are looked up in the module-level ``skill_dict`` under
    the key ``"<company>|<role>"``; comparison with the candidate's skills is
    case-insensitive.

    Args:
        candidate_skills: List of the candidate's skill names.
        role: Role name used in the lookup key and in the feedback text.
        raw_output: Raw model text, passed through unchanged in the result.
        company: Company name used in the lookup key. The function previously
            read ``company`` without receiving it, so it only worked when a
            notebook global of that name happened to exist. Pass it
            explicitly. If omitted, a module-level ``company`` is still used
            when present so existing three-argument calls keep working;
            otherwise no requirements are found.

    Returns:
        dict with "candidate_skills", "required_skills", "missing_skills",
        "match_percent", "raw_output" and "final_feedback". When the
        company/role pair is unknown, "required_skills" is empty and
        "match_percent" is 0.
    """
    if company is None:
        # Legacy fallback for callers that relied on the global; never raises
        company = globals().get("company")

    required_skills = skill_dict.get(f"{company}|{role}", [])
    cand_set = {s.lower() for s in candidate_skills}
    req_set = {s.lower() for s in required_skills}

    # Missing skills = required - candidate (original casing and order kept)
    missing = [s for s in required_skills if s.lower() not in cand_set]

    # Compute match %; guard against an empty requirement list
    match_percent = round(len(req_set & cand_set) / len(req_set) * 100, 2) if req_set else 0

    # Build clean feedback
    feedback = (
        f"✅ You already have {', '.join(candidate_skills)}.\n"
        f"📌 To improve your profile for {role}, focus on learning: {', '.join(missing) if missing else 'No extra skills needed!'}.\n"
        f"📊 Profile Match: {match_percent}%"
    )

    return {
        "candidate_skills": candidate_skills,
        "required_skills": required_skills,
        "missing_skills": missing,
        "match_percent": match_percent,
        "raw_output": raw_output,
        "final_feedback": feedback
    }

# ===============================
# 🔹 5. Test
# ===============================
if __name__ == "__main__":
    tests = [
        ("Google", "Data Scientist", ["Python", "SQL", "Pandas"]),
        ("Microsoft", "Backend Developer", ["Python", "Django", "SQL"]),
        ("Accenture", "Data Scientist", ["Numpy", "TensorFlow", "SQL", "Pandas", "Deep Learning"]),
    ]

    for company, role, skills in tests:
        raw = generate_feedback(company, role, skills)
        # Pass the company explicitly instead of relying on the loop variable
        # being visible to the function as a global.
        result = post_process_feedback(skills, role, raw, company=company)

        print("\n🔹 Input:", company, "|", role, "|", skills)
        print("📝 Raw Model Output:", result["raw_output"])
        print("✅ Final Feedback:\n", result["final_feedback"])
        print("-" * 80)



🔹 Input: Google | Data Scientist | ['Python', 'SQL', 'Pandas']
📝 Raw Model Output: You already have good expertise in Python, SQL, Pandas. It would help to work on SQL, Machine Learning, Deep Learning, TensorFlow, PyTorch, Statistics, Data Visualization, Feature Engineering, Pandas, Numpy, Model Deployment, Spark, A/B Testing, Data Cleaning. You meet around 26% of the requirements.
✅ Final Feedback:
 ✅ You already have Python, SQL, Pandas.
📌 To improve your profile for Data Scientist, focus on learning: Machine Learning, Deep Learning, TensorFlow, PyTorch, Statistics, Data Visualization, Feature Engineering, Numpy, Model Deployment, Spark, A/B Testing, Data Cleaning.
📊 Profile Match: 20.0%
--------------------------------------------------------------------------------

🔹 Input: Microsoft | Backend Developer | ['Python', 'Django', 'SQL']
📝 Raw Model Output: You already have good expertise in Python, Django, SQL. It would help to work on Java, Spring Boot, SQL, Docker, Microservices, A