# Resume Screening & Ranking System
## Automated Resume Processing from Kaggle Datasets

This notebook automatically loads and processes resumes from:
- **Resume Dataset** (CSV format)
- **Job Description Dataset** (CSV format)
- **PDF/TXT files** (optional)

---

## Required Libraries

In [68]:
# Core libraries
import re
import os
import glob
import pandas as pd
import numpy as np
from pathlib import Path

# NLP libraries
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Scikit-learn for ML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# PDF processing
try:
    import PyPDF2
    PDF_AVAILABLE = True
    print("PyPDF2 available for PDF processing")
except ImportError:
    PDF_AVAILABLE = False
    print("PyPDF2 not installed. Install with: pip install PyPDF2")

import warnings
warnings.filterwarnings('ignore')

print("All core libraries imported successfully")

PyPDF2 available for PDF processing
All core libraries imported successfully


## Download NLTK Data and Load spaCy

In [69]:
# Download NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# Load spaCy model
try:
    nlp = spacy.load('en_core_web_sm')
    print("spaCy model loaded successfully")
except:
    print("spaCy model not found. Run: python -m spacy download en_core_web_sm")
    nlp = None

# Get stopwords
stop_words = set(stopwords.words('english'))
print(f"Loaded {len(stop_words)} stopwords")

spaCy model loaded successfully
Loaded 198 stopwords


## Configuration

Place the CSV files in a `data/` folder or update the paths below.

In [70]:
# Configuration
CONFIG = {
    # Dataset paths (update these based on your setup)
    'resume_csv': 'D:\Padhai\git\FUTURE_ML_03\Resume.csv',  # Resume dataset from Kaggle
    'job_csv': 'D:\Padhai\git\FUTURE_ML_03\monster_com-job_sample.csv',  # Job description dataset
    
    # Alternative: Use PDF/TXT files from directories
    'resume_pdf_dir': 'data/resumes_pdf/',
    'resume_txt_dir': 'data/resumes_txt/',
    
    # Scoring weights
    'text_similarity_weight': 0.3,
    'skill_match_weight': 0.7,
    
    # How many candidates to show in detail
    'top_n_candidates': 10,
    
    # Train/test split ratio
    'test_size': 0.2,  # 20% for testing, 80% for training
}

print("Configuration loaded:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

Configuration loaded:
  resume_csv: D:\Padhai\git\FUTURE_ML_03\Resume.csv
  job_csv: D:\Padhai\git\FUTURE_ML_03\monster_com-job_sample.csv
  resume_pdf_dir: data/resumes_pdf/
  resume_txt_dir: data/resumes_txt/
  text_similarity_weight: 0.3
  skill_match_weight: 0.7
  top_n_candidates: 10
  test_size: 0.2


## Define Comprehensive Skill Database

In [71]:
# Comprehensive skill database
SKILL_DATABASE = {
    'programming_languages': [
        'python', 'java', 'javascript', 'c++', 'c#', 'ruby', 'php', 'swift',
        'kotlin', 'go', 'rust', 'typescript', 'scala', 'r', 'matlab', 'sql',
        'html', 'css', 'bash', 'perl', 'c', 'objective-c', 'vba'
    ],
    'ml_frameworks': [
        'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'sklearn', 'xgboost',
        'lightgbm', 'catboost', 'hugging face', 'transformers', 'opencv'
    ],
    'data_science': [
        'pandas', 'numpy', 'scipy', 'matplotlib', 'seaborn', 'plotly',
        'data analysis', 'data visualization', 'statistics', 'machine learning',
        'deep learning', 'neural networks', 'nlp', 'computer vision',
        'natural language processing', 'feature engineering', 'model deployment',
        'data mining', 'predictive modeling', 'statistical analysis'
    ],
    'web_technologies': [
        'react', 'angular', 'vue', 'vue.js', 'node.js', 'nodejs', 'django', 'flask',
        'spring', 'express', 'fastapi', 'rest', 'api', 'graphql', 'jquery',
        'bootstrap', 'tailwind', 'asp.net', 'laravel', 'rails', 'webpack'
    ],
    'databases': [
        'mysql', 'postgresql', 'mongodb', 'redis', 'cassandra', 'oracle',
        'sql server', 'dynamodb', 'elasticsearch', 'sqlite', 'nosql',
        'mariadb', 'neo4j', 'couchdb'
    ],
    'cloud_devops': [
        'aws', 'azure', 'gcp', 'google cloud', 'docker', 'kubernetes',
        'jenkins', 'gitlab', 'terraform', 'ansible', 'ci/cd', 'devops',
        'linux', 'unix', 'git', 'github', 'nginx', 'apache', 'heroku'
    ],
    'big_data': [
        'hadoop', 'spark', 'kafka', 'airflow', 'hive', 'etl', 'data pipeline',
        'big data', 'mapreduce', 'pig', 'flink', 'storm'
    ],
    'business_tools': [
        'excel', 'powerpoint', 'word', 'tableau', 'power bi', 'sap', 'salesforce',
        'jira', 'confluence', 'microsoft office', 'google analytics', 'crm', 'erp'
    ],
    'soft_skills': [
        'leadership', 'communication', 'teamwork', 'problem solving',
        'analytical', 'critical thinking', 'presentation', 'collaboration',
        'agile', 'scrum', 'project management', 'time management',
        'creative', 'innovative', 'strategic thinking'
    ],
    'design': [
        'figma', 'sketch', 'photoshop', 'illustrator', 'ui', 'ux',
        'user experience', 'user interface', 'wireframing', 'prototyping'
    ],
    'testing': [
        'selenium', 'junit', 'pytest', 'testing', 'qa', 'quality assurance',
        'test automation', 'unit testing', 'integration testing'
    ]
}

# Flatten to single set
ALL_SKILLS = set()
for category, skills in SKILL_DATABASE.items():
    ALL_SKILLS.update(skills)

print(f"Skill database: {len(ALL_SKILLS)} unique skills across {len(SKILL_DATABASE)} categories")

Skill database: 167 unique skills across 11 categories


## File Processing

In [72]:
def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.
    """
    if not PDF_AVAILABLE:
        print("PyPDF2 not available. Install with: pip install PyPDF2")
        return ""
    
    try:
        text = ""
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
        return text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""


def extract_text_from_txt(txt_path):
    """
    Extract text from a TXT file.
    """
    try:
        with open(txt_path, 'r', encoding='utf-8', errors='ignore') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading {txt_path}: {e}")
        return ""


def load_resumes_from_directory(directory, file_type='txt'):
    """
    Load all resumes from a directory.
    
    Args:
        directory: Path to directory containing resume files
        file_type: 'txt' or 'pdf'
    
    Returns:
        Dictionary with filename: text content
    """
    resumes = {}
    
    if not os.path.exists(directory):
        print(f"Directory not found: {directory}")
        return resumes
    
    if file_type == 'pdf':
        pattern = os.path.join(directory, '*.pdf')
        extract_func = extract_text_from_pdf
    else:
        pattern = os.path.join(directory, '*.txt')
        extract_func = extract_text_from_txt
    
    files = glob.glob(pattern)
    print(f"Found {len(files)} {file_type.upper()} files in {directory}")
    
    for filepath in files:
        filename = os.path.basename(filepath)
        candidate_name = os.path.splitext(filename)[0]
        text = extract_func(filepath)
        
        if text.strip():
            resumes[candidate_name] = text
    
    return resumes


print("File processing functions defined")

File processing functions defined


## Load Resume Dataset from Kaggle CSV

In [73]:
def load_resume_dataset(csv_path):
    """
    Load resume dataset from Kaggle CSV.
    Expected columns: 'Resume' or 'Resume_str', 'Category'
    """
    if not os.path.exists(csv_path):
        print(f"Resume CSV not found at: {csv_path}")
        print("   Download from: https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset")
        return None
    
    try:
        df = pd.read_csv(csv_path)
        print(f"Loaded resume dataset with {len(df)} resumes")
        print(f"  Columns: {list(df.columns)}")
        
        # Check for resume text column (different datasets use different names)
        resume_col = None
        for col in ['Resume', 'Resume_str', 'resume', 'resume_text', 'text']:
            if col in df.columns:
                resume_col = col
                break
        
        if resume_col is None:
            print("Could not find resume text column. Using first text column.")
            resume_col = df.columns[0]
        
        print(f"  Using column '{resume_col}' for resume text")
        
        # Check for category column
        category_col = None
        for col in ['Category', 'category', 'job_category', 'role']:
            if col in df.columns:
                category_col = col
                break
        
        if category_col:
            print(f"  Found {df[category_col].nunique()} unique job categories")
            print(f"  Categories: {df[category_col].unique()[:10]}")
        
        return df, resume_col, category_col
    
    except Exception as e:
        print(f"Error loading resume dataset: {e}")
        return None


# Load the dataset
resume_data = load_resume_dataset(CONFIG['resume_csv'])

if resume_data:
    resume_df, resume_text_col, resume_category_col = resume_data
else:
    resume_df = None
    print("\nWill try to load from PDF/TXT directories...")

Loaded resume dataset with 2484 resumes
  Columns: ['ID', 'Resume_str', 'Resume_html', 'Category']
  Using column 'Resume_str' for resume text
  Found 24 unique job categories
  Categories: <StringArray>
[                    'HR',               'DESIGNER', 'INFORMATION-TECHNOLOGY',
                'TEACHER',               'ADVOCATE',   'BUSINESS-DEVELOPMENT',
             'HEALTHCARE',                'FITNESS',            'AGRICULTURE',
                    'BPO']
Length: 10, dtype: str


## Load Job Description Dataset

In [74]:
def load_job_dataset(csv_path):
    """
    Load job description dataset from Kaggle CSV.
    """
    if not os.path.exists(csv_path):
        print(f"Job description CSV not found at: {csv_path}")
        print("   Download from: https://www.kaggle.com/datasets/PromptCloudHQ/us-jobs-on-monstercom")
        return None
    
    try:
        df = pd.read_csv(csv_path)
        print(f"Loaded job dataset with {len(df)} job postings")
        print(f"  Columns: {list(df.columns)}")
        
        # Find relevant columns
        title_col = None
        desc_col = None
        
        for col in ['job_title', 'title', 'position', 'role']:
            if col in df.columns:
                title_col = col
                break
        
        for col in ['job_description', 'description', 'desc', 'job_details']:
            if col in df.columns:
                desc_col = col
                break
        
        print(f"  Using '{title_col}' for job titles")
        print(f"  Using '{desc_col}' for job descriptions")
        
        return df, title_col, desc_col
    
    except Exception as e:
        print(f"Error loading job dataset: {e}")
        return None


# Load job dataset
job_data = load_job_dataset(CONFIG['job_csv'])

if job_data:
    job_df, job_title_col, job_desc_col = job_data
else:
    job_df = None
    print("\nNo job dataset loaded. Will use sample job description.")

Loaded job dataset with 22000 job postings
  Columns: ['country', 'country_code', 'date_added', 'has_expired', 'job_board', 'job_description', 'job_title', 'job_type', 'location', 'organization', 'page_url', 'salary', 'sector', 'uniq_id']
  Using 'job_title' for job titles
  Using 'job_description' for job descriptions


## Alternative - Load from PDF/TXT Directories

In [75]:
# If CSV not available, try loading from directories
if resume_df is None:
    print("Attempting to load resumes from PDF/TXT directories...")
    
    # Try PDF directory
    pdf_resumes = load_resumes_from_directory(CONFIG['resume_pdf_dir'], 'pdf')
    
    # Try TXT directory
    txt_resumes = load_resumes_from_directory(CONFIG['resume_txt_dir'], 'txt')
    
    # Combine
    all_resumes = {**pdf_resumes, **txt_resumes}
    
    if all_resumes:
        # Convert to DataFrame
        resume_df = pd.DataFrame([
            {'candidate_name': name, 'resume_text': text, 'category': 'Unknown'}
            for name, text in all_resumes.items()
        ])
        resume_text_col = 'resume_text'
        resume_category_col = 'category'
        print(f"Created DataFrame with {len(resume_df)} resumes from files")
    else:
        print("\nNo resume data found!")
        print("   Please either:")
        print("   1. Download Kaggle resume dataset")
        print("   2. Place PDF/TXT resumes in data directories")
        print("   3. Update CONFIG paths")

## Text Preprocessing

In [76]:
def clean_text(text):
    """
    Clean and normalize text.
    """
    if pd.isna(text) or not isinstance(text, str):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # Remove emails
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove phone numbers
    text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '', text)
    
    # Remove special characters but keep important ones for skills
    text = re.sub(r'[^a-z0-9\s\+\#\.]', ' ', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()


def extract_skills(text, skill_set=ALL_SKILLS):
    """
    Extract skills from text using pattern matching.
    """
    if pd.isna(text) or not isinstance(text, str):
        return set()
    
    found_skills = set()
    text_lower = text.lower()
    
    # Direct matching
    for skill in skill_set:
        pattern = r'\b' + re.escape(skill) + r'\b'
        if re.search(pattern, text_lower):
            found_skills.add(skill)
    
    # Handle common variations
    if 'sklearn' in text_lower or 'scikit' in text_lower:
        found_skills.add('scikit-learn')
    if 'node.js' in text_lower or 'nodejs' in text_lower:
        found_skills.add('node.js')
    if 'ml' in text_lower.split():
        found_skills.add('machine learning')
    
    return found_skills


print("Text preprocessing functions ready")

Text preprocessing functions ready


## Process Resume Dataset

In [77]:
if resume_df is not None:
    print(f"Processing {len(resume_df)} resumes...")
    
    # Add candidate ID if not present
    if 'candidate_name' not in resume_df.columns:
        resume_df['candidate_name'] = [f"Candidate_{i+1}" for i in range(len(resume_df))]
    
    # Clean text
    print("  → Cleaning text...")
    resume_df['cleaned_text'] = resume_df[resume_text_col].apply(clean_text)
    
    # Extract skills
    print("  → Extracting skills...")
    resume_df['extracted_skills'] = resume_df[resume_text_col].apply(extract_skills)
    resume_df['num_skills'] = resume_df['extracted_skills'].apply(len)
    
    # Remove empty resumes
    resume_df = resume_df[resume_df['cleaned_text'].str.len() > 50].reset_index(drop=True)
    
    print(f"\nProcessed {len(resume_df)} resumes")
    print(f"  Average skills per resume: {resume_df['num_skills'].mean():.1f}")
    print(f"  Min skills: {resume_df['num_skills'].min()}")
    print(f"  Max skills: {resume_df['num_skills'].max()}")
    
    # Show sample
    print("\nSample processed resume:")
    sample_idx = 0
    print(f"  Candidate: {resume_df.iloc[sample_idx]['candidate_name']}")
    if resume_category_col:
        print(f"  Category: {resume_df.iloc[sample_idx][resume_category_col]}")
    print(f"  Skills found: {resume_df.iloc[sample_idx]['num_skills']}")
    print(f"  Skills: {list(resume_df.iloc[sample_idx]['extracted_skills'])[:10]}...")
else:
    print("No resume data to process!")

Processing 2484 resumes...
  → Cleaning text...
  → Extracting skills...

Processed 2483 resumes
  Average skills per resume: 5.3
  Min skills: 0
  Max skills: 41

Sample processed resume:
  Candidate: Candidate_1
  Category: HR
  Skills found: 6
  Skills: ['statistics', 'data analysis', 'time management', 'leadership', 'swift', 'analytical']...


## Select or Create Job Description

In [78]:
# Option 1: Use job from dataset
if job_df is not None and len(job_df) > 0:
    # Filter for ML/Data Science related jobs
    ml_keywords = ['machine learning', 'data scientist', 'ml engineer', 'ai', 'data science']
    
    job_df['is_ml_job'] = job_df[job_title_col].str.lower().apply(
        lambda x: any(keyword in str(x).lower() for keyword in ml_keywords) if pd.notna(x) else False
    )
    
    ml_jobs = job_df[job_df['is_ml_job']]
    
    if len(ml_jobs) > 0:
        # Select first ML job
        selected_job = ml_jobs.iloc[0]
        job_title = selected_job[job_title_col]
        job_description = selected_job[job_desc_col]
        print(f"Selected job from dataset: {job_title}")
    else:
        # Select any job
        selected_job = job_df.iloc[0]
        job_title = selected_job[job_title_col]
        job_description = selected_job[job_desc_col]
        print(f"Selected job from dataset: {job_title}")
else:
    # Option 2: Use sample job description
    print("Using sample job description...")
    job_title = "Senior Machine Learning Engineer"
    job_description = """
    SENIOR MACHINE LEARNING ENGINEER
    
    We are seeking an experienced Senior Machine Learning Engineer to join our AI team.
    
    RESPONSIBILITIES:
    - Design and implement ML models using TensorFlow and PyTorch
    - Build deep learning architectures for various applications
    - Develop MLOps pipelines for model deployment
    - Work with large datasets using Pandas and NumPy
    - Deploy models on AWS using Docker and Kubernetes
    - Collaborate with cross-functional teams
    - Conduct experiments and statistical analysis
    - Mentor junior team members
    
    REQUIRED:
    - 5+ years experience in machine learning or data science
    - Strong Python programming skills
    - Expert in TensorFlow, PyTorch, or Keras
    - Experience with deep learning and neural networks
    - Proficiency in Pandas, NumPy, scikit-learn
    - Experience deploying ML models to production
    - Knowledge of AWS, Docker, Kubernetes
    - Strong understanding of statistics
    - Git version control
    - Excellent problem-solving skills
    
    PREFERRED:
    - Experience with Spark, Kafka, or Airflow
    - NLP or computer vision experience
    - Master's or Ph.D. in Computer Science
    - Strong communication and leadership skills
    """

# Process job description
job_cleaned = clean_text(job_description)
job_skills = extract_skills(job_description)

print(f"\nJob: {job_title}")
print(f"Required skills found: {len(job_skills)}")
print(f"Skills: {sorted(job_skills)}")

Selected job from dataset: Johnson & Johnson Family of Companies Job Application for Senior Training Leader | Monster.com var MONS_LOG_VARS = {"JobID":

Job: Johnson & Johnson Family of Companies Job Application for Senior Training Leader | Monster.com var MONS_LOG_VARS = {"JobID":
Required skills found: 11
Skills: ['agile', 'collaboration', 'communication', 'excel', 'innovative', 'leadership', 'microsoft office', 'powerpoint', 'project management', 'r', 'word']


## Train/Test Split

Split resumes into training and testing sets.

In [79]:
if resume_df is not None and len(resume_df) > 0:
    # Split data
    train_df, test_df = train_test_split(
        resume_df, 
        test_size=CONFIG['test_size'],
        random_state=42
    )
    
    print(f"Dataset Split:")
    print(f"  Training set: {len(train_df)} resumes ({(1-CONFIG['test_size'])*100:.0f}%)")
    print(f"  Testing set: {len(test_df)} resumes ({CONFIG['test_size']*100:.0f}%)")
    
    # We'll use the test set for ranking (simulates real screening)
    working_df = test_df.copy()
    print(f"\nWill screen {len(working_df)} candidates from test set")
else:
    print("No data for train/test split!")
    working_df = None

Dataset Split:
  Training set: 1986 resumes (80%)
  Testing set: 497 resumes (20%)

Will screen 497 candidates from test set


## Calculate Similarity Scores Using TF-IDF

In [80]:
if working_df is not None and len(working_df) > 0:
    print("Calculating text similarity scores...")
    
    # Prepare texts
    all_texts = [job_cleaned] + working_df['cleaned_text'].tolist()
    
    # TF-IDF Vectorization
    tfidf = TfidfVectorizer(
        max_features=1000,
        ngram_range=(1, 2),
        stop_words='english',
        min_df=1
    )
    
    tfidf_matrix = tfidf.fit_transform(all_texts)
    
    # Calculate cosine similarity
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    
    working_df['text_similarity'] = similarities
    
    print(f"Calculated text similarity for {len(working_df)} resumes")
    print(f"  Mean similarity: {similarities.mean():.4f}")
    print(f"  Max similarity: {similarities.max():.4f}")
    print(f"  Min similarity: {similarities.min():.4f}")
else:
    print(" No data for similarity calculation!")

Calculating text similarity scores...
Calculated text similarity for 497 resumes
  Mean similarity: 0.1406
  Max similarity: 0.3228
  Min similarity: 0.0298


## Calculate Skill Match Scores

In [81]:
if working_df is not None and len(working_df) > 0:
    print("Calculating skill match scores...")
    
    def calculate_skill_match(resume_skills):
        """
        Calculate what % of required skills the candidate has.
        """
        if len(job_skills) == 0:
            return 0.0, set(), job_skills
        
        matching = resume_skills.intersection(job_skills)
        missing = job_skills - resume_skills
        score = len(matching) / len(job_skills)
        
        return score, matching, missing
    
    # Apply to all resumes
    results = working_df['extracted_skills'].apply(calculate_skill_match)
    
    working_df['skill_match_score'] = results.apply(lambda x: x[0])
    working_df['matching_skills'] = results.apply(lambda x: x[1])
    working_df['missing_skills'] = results.apply(lambda x: x[2])
    working_df['num_matching'] = working_df['matching_skills'].apply(len)
    working_df['num_missing'] = working_df['missing_skills'].apply(len)
    
    print(f" Calculated skill matches for {len(working_df)} resumes")
    print(f"  Mean skill match: {working_df['skill_match_score'].mean():.2%}")
    print(f"  Best skill match: {working_df['skill_match_score'].max():.2%}")
else:
    print("No data for skill matching!")

Calculating skill match scores...
 Calculated skill matches for 497 resumes
  Mean skill match: 23.21%
  Best skill match: 72.73%


## Calculate Final Weighted Scores

In [82]:
if working_df is not None and len(working_df) > 0:
    print("Calculating final scores...")
    
    # Weighted combination
    working_df['final_score'] = (
        CONFIG['text_similarity_weight'] * working_df['text_similarity'] +
        CONFIG['skill_match_weight'] * working_df['skill_match_score']
    )
    
    print(f"\nScoring Formula:")
    print(f"  Final Score = ({CONFIG['text_similarity_weight']:.0%} × Text Similarity) + "
          f"({CONFIG['skill_match_weight']:.0%} × Skill Match)")
    
    print(f"\nScore Statistics:")
    print(f"  Mean: {working_df['final_score'].mean():.2%}")
    print(f"  Std Dev: {working_df['final_score'].std():.2%}")
    print(f"  Max: {working_df['final_score'].max():.2%}")
    print(f"  Min: {working_df['final_score'].min():.2%}")
else:
    print(" No data for final scoring!")

Calculating final scores...

Scoring Formula:
  Final Score = (30% × Text Similarity) + (70% × Skill Match)

Score Statistics:
  Mean: 20.47%
  Std Dev: 10.93%
  Max: 58.08%
  Min: 0.90%


## Rank Candidates

In [83]:
if working_df is not None and len(working_df) > 0:
    # Sort by final score
    ranked_df = working_df.sort_values('final_score', ascending=False).reset_index(drop=True)
    ranked_df['rank'] = range(1, len(ranked_df) + 1)
    
    print("="*90)
    print("CANDIDATE RANKINGS".center(90))
    print("="*90)
    
    # Display top candidates
    top_n = min(CONFIG['top_n_candidates'], len(ranked_df))
    
    print(f"\nTop {top_n} Candidates:\n")
    print(f"{'Rank':<6} {'Candidate':<25} {'Final':<12} {'Text Sim':<12} {'Skill Match':<12} {'Skills'}")
    print("-"*90)
    
    for idx, row in ranked_df.head(top_n).iterrows():
        candidate_name = str(row['candidate_name'])[:24]
        print(f"{row['rank']:<6} {candidate_name:<25} "
              f"{row['final_score']:.2%}{'':6} "
              f"{row['text_similarity']:.2%}{'':6} "
              f"{row['skill_match_score']:.2%}{'':6} "
              f"{row['num_matching']}/{len(job_skills)}")
    
    print("="*90)
else:
    print("No data to rank!")
    ranked_df = None

                                    CANDIDATE RANKINGS                                    

Top 10 Candidates:

Rank   Candidate                 Final        Text Sim     Skill Match  Skills
------------------------------------------------------------------------------------------
1      Candidate_2078            58.08%       23.90%       72.73%       8/11
2      Candidate_247             56.79%       19.61%       72.73%       8/11
3      Candidate_893             50.19%       18.80%       63.64%       7/11
4      Candidate_1266            48.55%       13.34%       63.64%       7/11
5      Candidate_1212            45.38%       24.00%       54.55%       6/11
6      Candidate_2451            45.26%       23.58%       54.55%       6/11
7      Candidate_2331            44.16%       19.94%       54.55%       6/11
8      Candidate_211             43.68%       18.32%       54.55%       6/11
9      Candidate_255             43.66%       18.25%       54.55%       6/11
10     Candidate_1928    

## Detailed Analysis of Top Candidates

In [None]:
if ranked_df is not None and len(ranked_df) > 0:
    print("\n" + "="*90)
    print("DETAILED CANDIDATE ANALYSIS".center(90))
    print("="*90)
    
    # Show detailed analysis for top 5
    detailed_top_n = min(5, len(ranked_df))
    
    for idx, row in ranked_df.head(detailed_top_n).iterrows():
        print(f"\nRANK #{row['rank']}: {row['candidate_name']}")
        print("-"*90)
        
        if resume_category_col and resume_category_col in row:
            print(f"Category: {row[resume_category_col]}")
        
        print(f"\nScores:")
        print(f"  Final Score: {row['final_score']:.2%}")
        print(f"  ├─ Text Similarity: {row['text_similarity']:.2%}")
        print(f"  └─ Skill Match: {row['skill_match_score']:.2%} ({row['num_matching']}/{len(job_skills)} skills)")
        
        print(f"\nTotal Skills Identified: {row['num_skills']}")
        
        # Matching skills
        if row['num_matching'] > 0:
            print(f"\nMATCHING SKILLS ({row['num_matching']}):")
            matching_list = sorted(row['matching_skills'])
            for i in range(0, len(matching_list), 5):
                print("  " + ", ".join(matching_list[i:i+5]))
        else:
            print(f"\nMATCHING SKILLS: None")
        
        # Missing skills
        if row['num_missing'] > 0:
            print(f"\nMISSING SKILLS ({row['num_missing']}):")
            missing_list = sorted(row['missing_skills'])
            for i in range(0, len(missing_list), 5):
                print("  " + ", ".join(missing_list[i:i+5]))
        else:
            print(f"\nMISSING SKILLS: None (Has all required skills!)")
        
        print()
    
    print("="*90)
else:
    print("No data for detailed analysis!")


                               DETAILED CANDIDATE ANALYSIS                                

RANK #1: Candidate_2078
------------------------------------------------------------------------------------------
Category: PUBLIC-RELATIONS

Scores:
  Final Score: 58.08%
  ├─ Text Similarity: 23.90%
  └─ Skill Match: 72.73% (8/11 skills)

Total Skills Identified: 14

✓ MATCHING SKILLS (8):
  communication, excel, leadership, microsoft office, powerpoint
  project management, r, word

MISSING SKILLS (3):
  agile, collaboration, innovative


RANK #2: Candidate_247
------------------------------------------------------------------------------------------
Category: INFORMATION-TECHNOLOGY

Scores:
  Final Score: 56.79%
  ├─ Text Similarity: 19.61%
  └─ Skill Match: 72.73% (8/11 skills)

Total Skills Identified: 11

✓ MATCHING SKILLS (8):
  collaboration, communication, excel, leadership, microsoft office
  powerpoint, project management, word

MISSING SKILLS (3):
  agile, innovative, r


RANK #3:

## Summary Statistics and Insights

In [85]:
if ranked_df is not None and len(ranked_df) > 0:
    print("\n" + "="*90)
    print("SCREENING SUMMARY".center(90))
    print("="*90)
    
    print(f"\nJob Position: {job_title}")
    print(f"Total Candidates Screened: {len(ranked_df)}")
    print(f"Required Skills: {len(job_skills)}")
    
    # Score distribution
    print(f"\nScore Distribution:")
    excellent = len(ranked_df[ranked_df['final_score'] >= 0.7])
    good = len(ranked_df[(ranked_df['final_score'] >= 0.5) & (ranked_df['final_score'] < 0.7)])
    fair = len(ranked_df[(ranked_df['final_score'] >= 0.3) & (ranked_df['final_score'] < 0.5)])
    poor = len(ranked_df[ranked_df['final_score'] < 0.3])
    
    print(f"  Excellent (≥70%): {excellent} candidates ({excellent/len(ranked_df)*100:.1f}%)")
    print(f"  Good (50-70%): {good} candidates ({good/len(ranked_df)*100:.1f}%)")
    print(f"  Fair (30-50%): {fair} candidates ({fair/len(ranked_df)*100:.1f}%)")
    print(f"  Poor (<30%): {poor} candidates ({poor/len(ranked_df)*100:.1f}%)")
    
    # Top candidate
    top = ranked_df.iloc[0]
    print(f"\nTOP CANDIDATE:")
    print(f"  Name: {top['candidate_name']}")
    print(f"  Score: {top['final_score']:.2%}")
    print(f"  Skills Matched: {top['num_matching']}/{len(job_skills)}")
    
    # Recommendation
    if top['final_score'] >= 0.7:
        recommendation = "Strong fit - Highly recommended for interview"
    elif top['final_score'] >= 0.5:
        recommendation = "Good fit - Recommended for interview"
    elif top['final_score'] >= 0.3:
        recommendation = "Moderate fit - Consider based on other factors"
    else:
        recommendation = "Weak fit - May not be suitable for this role"
    
    print(f"  Recommendation: {recommendation}")
    
    # Most common skills across all candidates
    all_skills_flat = []
    for skills in ranked_df['extracted_skills']:
        all_skills_flat.extend(skills)
    
    from collections import Counter
    skill_counts = Counter(all_skills_flat)
    
    print(f"\nMost Common Skills Across Candidates:")
    for skill, count in skill_counts.most_common(10):
        percentage = count / len(ranked_df) * 100
        print(f"  {skill}: {count} candidates ({percentage:.1f}%)")
    
    print("\n" + "="*90)
else:
    print("No data for summary!")


                                    SCREENING SUMMARY                                     

Job Position: Johnson & Johnson Family of Companies Job Application for Senior Training Leader | Monster.com var MONS_LOG_VARS = {"JobID":
Total Candidates Screened: 497
Required Skills: 11

Score Distribution:
  Excellent (≥70%): 0 candidates (0.0%)
  Good (50-70%): 3 candidates (0.6%)
  Fair (30-50%): 99 candidates (19.9%)
  Poor (<30%): 395 candidates (79.5%)

TOP CANDIDATE:
  Name: Candidate_2078
  Score: 58.08%
  Skills Matched: 8/11
  Recommendation: Good fit - Recommended for interview

Most Common Skills Across Candidates:
  communication: 285 candidates (57.3%)
  leadership: 196 candidates (39.4%)
  excel: 179 candidates (36.0%)
  word: 142 candidates (28.6%)
  microsoft office: 139 candidates (28.0%)
  testing: 104 candidates (20.9%)
  project management: 94 candidates (18.9%)
  creative: 91 candidates (18.3%)
  powerpoint: 89 candidates (17.9%)
  problem solving: 84 candidates (16.9%

## Export Results to CSV

In [86]:
if ranked_df is not None and len(ranked_df) > 0:
    # Prepare export DataFrame
    export_df = ranked_df[[
        'rank', 'candidate_name', 'final_score', 'text_similarity', 
        'skill_match_score', 'num_skills', 'num_matching', 'num_missing'
    ]].copy()
    
    # Add category if available
    if resume_category_col and resume_category_col in ranked_df.columns:
        export_df['category'] = ranked_df[resume_category_col]
    
    # Convert skills to string for CSV
    export_df['matching_skills'] = ranked_df['matching_skills'].apply(lambda x: ', '.join(sorted(x)))
    export_df['missing_skills'] = ranked_df['missing_skills'].apply(lambda x: ', '.join(sorted(x)))
    
    # Format percentages
    export_df['final_score_%'] = (export_df['final_score'] * 100).round(2)
    export_df['text_similarity_%'] = (export_df['text_similarity'] * 100).round(2)
    export_df['skill_match_%'] = (export_df['skill_match_score'] * 100).round(2)
    
    # Save to CSV
    output_file = 'resume_screening_results.csv'
    export_df.to_csv(output_file, index=False)
    
    print(f"Results exported to: {output_file}")
    print(f"  Rows: {len(export_df)}")
    print(f"  Columns: {len(export_df.columns)}")
    
    # Show preview
    print(f"\nPreview of exported data:")
    print(export_df[['rank', 'candidate_name', 'final_score_%', 'num_matching']].head(10))
else:
    print(" No data to export!")

Results exported to: resume_screening_results.csv
  Rows: 497
  Columns: 14

Preview of exported data:
   rank  candidate_name  final_score_%  num_matching
0     1  Candidate_2078          58.08             8
1     2   Candidate_247          56.79             8
2     3   Candidate_893          50.19             7
3     4  Candidate_1266          48.55             7
4     5  Candidate_1212          45.38             6
5     6  Candidate_2451          45.26             6
6     7  Candidate_2331          44.16             6
7     8   Candidate_211          43.68             6
8     9   Candidate_255          43.66             6
9    10  Candidate_1928          43.63             6
