In [None]:
import torch
from datasets import Dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer

# -------------------------------
# 1. Load your Phase 3 trained model
# -------------------------------
MODEL_PATH = r"C:\Users\WIN11\OneDrive\Desktop\resume proj\projwithml\models\third_phase_model"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)

# Check model's label2id mapping
print("Model labels:", model.config.label2id)

# -------------------------------
# 2. Build your error-corrected dataset
# -------------------------------
examples = [
    {"tokens": ["John", "knows", "Python", "and", "Java"],
     "ner_tags": ["O", "O", "SKILL", "O", "SKILL"]},
    {"tokens": ["She", "worked", "with", "FastAPI", "and", "MongoDB"],
     "ner_tags": ["O", "O", "O", "SKILL", "O", "SKILL"]},
    {"tokens": ["He", "is", "a", "Software", "Engineer", "at", "Google"],
     "ner_tags": ["O", "O", "O", "ROLE", "ROLE", "O", "COMPANY"]},
    {"tokens": ["They", "used", "NumPy", "and", "Pandas", "for", "ML"],
     "ner_tags": ["O", "O", "SKILL", "O", "SKILL", "O", "SKILL"]}
]

# -------------------------------
# 3. Map dataset labels to BIO format (matches Phase 3 model)
# -------------------------------
def map_labels_to_bio(example):
    bio_tags = []
    for tag in example["ner_tags"]:
        if tag == "O":
            bio_tags.append("O")
        else:
            # Single-token entities → use B-TAG
            bio_tags.append(f"B-{tag}")
    example["ner_tags"] = bio_tags
    return example

dataset = Dataset.from_dict({
    "tokens": [ex["tokens"] for ex in examples],
    "ner_tags": [ex["ner_tags"] for ex in examples]
})

# Apply BIO mapping
dataset = dataset.map(map_labels_to_bio)

# Split train/test
dataset = dataset.train_test_split(test_size=0.25, seed=42)

# -------------------------------
# 4. Tokenize and align labels
# -------------------------------
label_list = list(model.config.label2id.keys())
label2id = model.config.label2id
id2label = model.config.id2label

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )
    
    labels = []
    for i, doc_labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[doc_labels[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

encoded_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# -------------------------------
# 5. Data collator and metric
# -------------------------------
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

# -------------------------------
# 6. Training Arguments
# -------------------------------
training_args = TrainingArguments(
    output_dir="./phase4_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True
)

# -------------------------------
# 7. Trainer
# -------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# -------------------------------
# 8. Train
# -------------------------------
trainer.train()

# -------------------------------
# 9. Save final model
# -------------------------------
FINAL_MODEL_PATH = r"C:\Users\WIN11\OneDrive\Desktop\resume proj\projwithml\models\fourth_phase_model"
trainer.save_model(FINAL_MODEL_PATH)
tokenizer.save_pretrained(FINAL_MODEL_PATH)

print(f"✅ Phase 4 training completed. Model saved at {FINAL_MODEL_PATH}")


Map: 100%|██████████| 3/3 [00:00<00:00, 120.01 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 111.03 examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 22.22 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 100.01 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyError: np.int64(6)

In [1]:
import re
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# -------------------------------
# 1. Load trained model
# -------------------------------
MODEL_PATH = r"C:\Users\WIN11\OneDrive\Desktop\resume proj\projwithml\models\fourth_phase_model"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)

id2label = model.config.id2label


# -------------------------------
# 2. Entity Cleaner + BIO Merger
# -------------------------------
def clean_and_merge_entities(tokens, predictions, id2label):
    entities = []
    current_entity_tokens = []
    current_label = None

    def stitch_tokens(tok_list):
        """Join WordPiece tokens into clean text with proper spacing"""
        if not tok_list:
            return ""
        
        result = ""
        
        for i, tok in enumerate(tok_list):
            if tok.startswith("##"):
                # This is a WordPiece continuation - attach directly without space
                result += tok[2:]
            elif i == 0:
                # First token - no space needed
                result += tok
            elif tok in [",", ".", ":", ";", "'s"]:
                # Punctuation - attach without space
                result += tok
            else:
                # Regular token - add space before
                result += " " + tok
                
        return result.strip()

    def finalize_entity(tokens, label):
        if not tokens: 
            return None
        word = stitch_tokens(tokens).strip()
        # remove junk like only punctuation
        if re.fullmatch(r"^[,.\-;:]+$", word):
            return None
        # collapse weird spaces
        word = re.sub(r"\s+", " ", word)
        return {"word": word, "entity_group": label}

    def is_wordpiece_continuation(token):
        """Check if token is a WordPiece continuation"""
        return token.startswith("##")

    def should_continue_entity(token, current_label, predicted_label):
        """
        Decide if token should continue current entity based on context
        """
        if not current_label:
            return False
            
        # If it's a WordPiece continuation, it should continue the current entity
        if is_wordpiece_continuation(token):
            return True
            
        # Handle punctuation that should stay with entities
        if token in [",", ".", "'s", "'", "s"] and current_label:
            return True
            
        return False

    # Enhanced BIO merging with WordPiece awareness
    for i, (token, pred_id) in enumerate(zip(tokens, predictions)):
        label = id2label[pred_id]

        if token in tokenizer.all_special_tokens:
            continue

        # Parse label
        if "-" in label:
            prefix, entity_type = label.split("-", 1)
        else:
            prefix, entity_type = "B", label if label != "O" else None

        # Decide how to handle this token
        if label == "O" and not should_continue_entity(token, current_label, label):
            # End current entity and don't start a new one
            ent = finalize_entity(current_entity_tokens, current_label)
            if ent:
                entities.append(ent)
            current_entity_tokens, current_label = [], None
            
        elif should_continue_entity(token, current_label, label):
            # Continue current entity regardless of predicted label
            current_entity_tokens.append(token)
            
        elif prefix == "B" or (prefix == "I" and entity_type != current_label):
            # Start new entity
            ent = finalize_entity(current_entity_tokens, current_label)
            if ent:
                entities.append(ent)
            current_entity_tokens = [token]
            current_label = entity_type
            
        elif prefix == "I" and entity_type == current_label:
            # Continue current entity
            current_entity_tokens.append(token)
            
        else:
            # Fallback: start new entity
            ent = finalize_entity(current_entity_tokens, current_label)
            if ent:
                entities.append(ent)
            current_entity_tokens = [token] if label != "O" else []
            current_label = entity_type if label != "O" else None

    # Finalize last entity
    ent = finalize_entity(current_entity_tokens, current_label)
    if ent:
        entities.append(ent)

    return post_process_entities(entities)


def post_process_entities(entities):
    """
    Post-process entities to extract SKILLS, COMPANY, and ROLE
    """
    if not entities:
        return entities
    
    processed = []
    
    # Define patterns for better classification
    skill_keywords = {
        'python', 'java', 'tensorflow', 'docker', 'kubernetes', 'git', 
        'aws', 'react', 'nodejs', 'javascript', 'c++', 'c#', 'ruby',
        'angular', 'vue', 'mysql', 'postgresql', 'mongodb', 'redis',
        'html', 'css', 'sql', 'nosql', 'flask', 'django', 'spring',
        'pytorch', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
        'spark', 'hadoop', 'machine learning', 'data processing',
        'google cloud', 'azure', 'tableau', 'power bi'
    }
    
    role_keywords = {
        'scientist', 'engineer', 'developer', 'analyst', 'manager', 
        'director', 'lead', 'senior', 'junior', 'consultant', 
        'architect', 'specialist', 'coordinator', 'supervisor',
        'data scientist', 'software engineer', 'product manager',
        'business analyst', 'project manager', 'team lead'
    }
    
    company_keywords = {
        'netflix', 'google', 'microsoft', 'amazon', 'apple', 'meta',
        'facebook', 'tesla', 'uber', 'airbnb', 'spotify', 'adobe',
        'salesforce', 'oracle', 'ibm', 'intel', 'nvidia'
    }
    
    # Skip educational institutions
    educational_institutions = {
        'mit', 'stanford', 'harvard', 'berkeley', 'cmu', 'caltech',
        'university', 'college', 'institute', 'school'
    }
    
    for entity in entities:
        word = entity["word"].strip()
        entity_type = entity["entity_group"]
        
        # Skip empty or punctuation-only entities
        if not word or re.fullmatch(r"^[,.\-;:'\s]+$", word):
            continue
            
        # Clean up the word
        word = re.sub(r'\s+', ' ', word)  # Normalize spaces
        word = re.sub(r'\s*[,.:;]\s*$', '', word)  # Remove trailing punctuation
        word = re.sub(r'^\s*[,.:;]\s*', '', word)  # Remove leading punctuation
        
        # Skip if still empty after cleaning
        if not word.strip():
            continue
            
        word_lower = word.lower().strip()
        
        # Skip educational institutions
        if any(edu in word_lower for edu in educational_institutions):
            continue
        
        # Skip obviously wrong entities (like single words that don't make sense)
        if word_lower in ['big', 'sarah', 'she', 'holds', 'certifications', 'in']:
            continue
            
        # Reclassify based on content
        new_entity_type = entity_type
        
        # Check if it's a skill
        if any(skill in word_lower for skill in skill_keywords):
            new_entity_type = "SKILL"
        
        # Check if it's a role
        elif any(role in word_lower for role in role_keywords):
            new_entity_type = "ROLE"
            
        # Check if it's a company
        elif any(company in word_lower for company in company_keywords):
            new_entity_type = "COMPANY"
        
        # Additional role detection based on common patterns
        if re.search(r'\b(senior|junior|lead|principal|chief)\s+\w+', word_lower):
            new_entity_type = "ROLE"
        
        # Only keep relevant entities
        if new_entity_type in ["SKILL", "COMPANY", "ROLE"]:
            processed.append({
                "word": word,
                "entity_group": new_entity_type
            })
    
    return remove_duplicates(processed)


def remove_duplicates(entities):
    """Remove duplicate entities while preserving order"""
    seen = set()
    unique_entities = []
    
    for entity in entities:
        # Create a key based on normalized word and entity type
        key = (entity["word"].lower().strip(), entity["entity_group"])
        if key not in seen:
            seen.add(key)
            unique_entities.append(entity)
    
    return unique_entities


# -------------------------------
# 3. Predict Function
# -------------------------------
def extract_entities(text: str, debug=False):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    outputs = model(**inputs)
    logits = outputs.logits

    predictions = torch.argmax(logits, dim=-1).squeeze().tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())

    if debug:
        print("\nDEBUG: Token-Label pairs:")
        for i, (token, pred_id) in enumerate(zip(tokens, predictions)):
            if token not in tokenizer.all_special_tokens:
                print(f"{i:2d}: {token:15} -> {id2label[pred_id]:15}")
        print()

    return clean_and_merge_entities(tokens, predictions, id2label)


# -------------------------------
# 4. Enhanced Output Function
# -------------------------------
def display_extracted_entities(entities, text):
    """Display entities in a clean, organized format"""
    print("Input Text:\n", text)
    print("\nExtracted Entities:")
    print("=" * 50)
    
    # Group entities by type
    skills = [ent for ent in entities if ent["entity_group"] == "SKILL"]
    companies = [ent for ent in entities if ent["entity_group"] == "COMPANY"]
    roles = [ent for ent in entities if ent["entity_group"] == "ROLE"]
    
    print(f"\n🎯 ROLES ({len(roles)}):")
    if roles:
        for role in roles:
            print(f"  - {role['word']}")
    else:
        print("  - No roles detected")
    
    print(f"\n🏢 COMPANIES ({len(companies)}):")
    if companies:
        for company in companies:
            print(f"  - {company['word']}")
    else:
        print("  - No companies detected")
    
    print(f"\n💻 SKILLS ({len(skills)}):")
    if skills:
        for skill in skills:
            print(f"  - {skill['word']}")
    else:
        print("  - No skills detected")
    
    print(f"\n📋 All entities (cleaned and filtered):")
    for ent in entities:
        print(f"  {ent['entity_group']:8} | {ent['word']}")


# -------------------------------
# 5. Example Usage
# -------------------------------
if __name__ == "__main__":
    text = """Michael Rodriguez is employed as a DevOps Engineer at Amazon. He has strong skills in 
Kubernetes, Docker, Jenkins, and Terraform. Michael is experienced with cloud platforms 
like AWS and Azure, and is proficient in scripting languages including Python and Bash. 
He also has knowledge of monitoring tools like Prometheus and Grafana."""

    entities = extract_entities(text, debug=False)
    display_extracted_entities(entities, text)
    
    # Test with another example
    print("\n" + "="*70)
    print("SECOND EXAMPLE:")
    print("="*70)
    
    text2 = """John Smith is a Lead Software Engineer at Google. He specializes in 
JavaScript, React, Node.js, and Docker. Previously worked as a Software Developer 
at Microsoft with experience in C#, .NET, and Azure cloud services."""
    
    entities2 = extract_entities(text2, debug=False)
    display_extracted_entities(entities2, text2)

  return forward_call(*args, **kwargs)


Input Text:
 Michael Rodriguez is employed as a DevOps Engineer at Amazon. He has strong skills in 
Kubernetes, Docker, Jenkins, and Terraform. Michael is experienced with cloud platforms 
like AWS and Azure, and is proficient in scripting languages including Python and Bash. 
He also has knowledge of monitoring tools like Prometheus and Grafana.

Extracted Entities:

🎯 ROLES (2):
  - DevOps Engineer
  - monitoring tools

🏢 COMPANIES (6):
  - Amazon
  - Jenkins
  - cloud platforms
  - languages
  - Bash
  - and Grafana

💻 SKILLS (8):
  - Kubernetes
  - Docker
  - Terraform
  - AWS
  - and Azure
  - scripting
  - Python
  - Prometheus

📋 All entities (cleaned and filtered):
  ROLE     | DevOps Engineer
  COMPANY  | Amazon
  SKILL    | Kubernetes
  SKILL    | Docker
  COMPANY  | Jenkins
  SKILL    | Terraform
  COMPANY  | cloud platforms
  SKILL    | AWS
  SKILL    | and Azure
  SKILL    | scripting
  COMPANY  | languages
  SKILL    | Python
  COMPANY  | Bash
  ROLE     | monitoring tool