In [7]:
# 🤖 AI-Powered Resume Screening System
# Advanced BERT-based candidate matching with optimized performance

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# NLP and ML Libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# BERT and Transformers
import torch
from transformers import AutoTokenizer, AutoModel, logging
logging.set_verbosity_error()  # Suppress transformer warnings

# Progress tracking
from tqdm.auto import tqdm
tqdm.pandas()

# Caching and optimization
import pickle
import os
from functools import lru_cache
import gc

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Configure plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ All libraries imported successfully!")
print(f"🔥 PyTorch version: {torch.__version__}")
print(f"🤗 Transformers available: {'✓' if 'transformers' in globals() else '✗'}")
print(f"⚡ CUDA available: {torch.cuda.is_available()}")


✅ All libraries imported successfully!
🔥 PyTorch version: 2.6.0+cu124
🤗 Transformers available: ✗
⚡ CUDA available: False


In [8]:
# ⚙️ Configuration & Constants for AI Resume Screening System
# Centralized configuration for optimal performance and maintainability

from dataclasses import dataclass # Import dataclass

@dataclass
class Config:
    """Centralized configuration class for the AI Resume Screening System"""

    # Model Configuration
    EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"  # Lightweight, fast BERT model
    MAX_SEQUENCE_LENGTH: int = 512
    BATCH_SIZE: int = 32 if torch.cuda.is_available() else 16 # Use torch.cuda.is_available()

    # Processing Configuration
    MAX_WORKERS: int = 4
    CHUNK_SIZE: int = 1000
    CACHE_EMBEDDINGS: bool = True

    # Scoring Weights (must sum to 1.0)
    SKILL_WEIGHT: float = 0.4
    EXPERIENCE_WEIGHT: float = 0.25
    EDUCATION_WEIGHT: float = 0.2
    SEMANTIC_WEIGHT: float = 0.15

    # Ranking Configuration
    TOP_K_CANDIDATES: int = 20
    SIMILARITY_THRESHOLD: float = 0.3

    # Text Processing
    MIN_SKILL_LENGTH: int = 2
    MAX_SKILLS_PER_RESUME: int = 50
    REMOVE_COMMON_WORDS: bool = True

    # Visualization
    FIGURE_SIZE: tuple[int, int] = (12, 8) # Use tuple for type hint
    DPI: int = 100
    COLOR_PALETTE: str = "viridis"

    # File Paths
    CACHE_DIR: str = "./cache"
    RESULTS_DIR: str = "./results"

    # Performance Monitoring
    ENABLE_TIMING: bool = True
    ENABLE_MEMORY_MONITORING: bool = True
    LOG_LEVEL: str = "INFO"

# Initialize global configuration
config = Config()

# Skills taxonomy for enhanced matching
TECHNICAL_SKILLS = {
    'programming_languages': [
        'python', 'java', 'javascript', 'typescript', 'c++', 'c#', 'go', 'rust',
        'swift', 'kotlin', 'scala', 'r', 'matlab', 'sql', 'html', 'css', 'php'
    ],
    'frameworks_libraries': [
        'react', 'angular', 'vue', 'django', 'flask', 'spring', 'node.js', 'express',
        'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy', 'opencv', 'keras'
    ],
    'databases': [
        'postgresql', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'cassandra',
        'oracle', 'sqlite', 'neo4j', 'dynamodb'
    ],
    'cloud_platforms': [
        'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'terraform', 'jenkins',
        'gitlab', 'github actions', 'ansible'
    ],
    'data_science': [
        'machine learning', 'deep learning', 'nlp', 'computer vision', 'statistics',
        'data analysis', 'data visualization', 'big data', 'spark', 'hadoop'
    ],
    'soft_skills': [
        'leadership', 'communication', 'teamwork', 'problem solving', 'creativity',
        'adaptability', 'time management', 'critical thinking', 'collaboration'
    ]
}

# Flatten skills for easy lookup
ALL_TECHNICAL_SKILLS = [skill for category in TECHNICAL_SKILLS.values() for skill in category]

# Education level hierarchy for scoring
EDUCATION_HIERARCHY = {
    'phd': 5, 'doctorate': 5, 'ph.d': 5,
    'master': 4, 'masters': 4, 'mba': 4, 'ms': 4, 'ma': 4,
    'bachelor': 3, 'bachelors': 3, 'bs': 3, 'ba': 3, 'bsc': 3,
    'associate': 2, 'diploma': 2,
    'certificate': 1, 'certification': 1,
    'high school': 0, 'highschool': 0
}

# Common job roles and their required skills
JOB_ROLE_SKILLS = {
    'data_scientist': [
        'python', 'r', 'machine learning', 'statistics', 'sql', 'pandas',
        'numpy', 'scikit-learn', 'tensorflow', 'data analysis'
    ],
    'software_engineer': [
        'python', 'java', 'javascript', 'git', 'sql', 'algorithms',
        'data structures', 'testing', 'debugging'
    ],
    'ml_engineer': [
        'python', 'tensorflow', 'pytorch', 'machine learning', 'docker',
        'kubernetes', 'mlops', 'model deployment'
    ],
    'frontend_developer': [
        'javascript', 'react', 'html', 'css', 'typescript', 'vue', 'angular'
    ],
    'backend_developer': [
        'python', 'java', 'node.js', 'sql', 'api', 'microservices', 'docker'
    ],
    'devops_engineer': [
        'docker', 'kubernetes', 'aws', 'jenkins', 'terraform', 'ansible', 'linux'
    ]
}

# Bias detection keywords (for fairness analysis)
BIAS_KEYWORDS = {
    'gender': ['he', 'she', 'his', 'her', 'him', 'male', 'female', 'man', 'woman'],
    'age': ['young', 'old', 'senior', 'junior', 'experienced', 'fresh', 'new grad'],
    'ethnicity': ['asian', 'hispanic', 'african', 'european', 'american', 'native'],
    'education': ['ivy league', 'prestigious', 'top university', 'elite']
}

# Performance benchmarks for evaluation
PERFORMANCE_BENCHMARKS = {
    'processing_time_per_resume': 0.5,  # seconds
    'memory_usage_mb': 2000,  # MB
    'accuracy_threshold': 0.85,
    'bias_threshold': 0.1
}

print("⚙️ Configuration loaded successfully!")
print(f"📊 Model: {config.EMBEDDING_MODEL}")
print(f"🔧 Batch Size: {config.BATCH_SIZE}")
# Remove the line printing 'device' as it's not defined in this cell
# print(f"🎯 Device: {device}")
print(f"💾 Cache Enabled: {config.CACHE_EMBEDDINGS}")
print("✅ Ready for AI resume screening!")

⚙️ Configuration loaded successfully!
📊 Model: all-MiniLM-L6-v2
🔧 Batch Size: 16
💾 Cache Enabled: True
✅ Ready for AI resume screening!


In [9]:
# 📊 Advanced Data Loading & Exploration Module
# Professional-grade data handling with comprehensive validation

class ResumeDataLoader:
    """Advanced data loader with comprehensive validation and exploration"""

    def __init__(self, config: Config):
        self.config = config
        self.df = None
        self.data_stats = {}
        self.validation_results = {}

    def load_dataset(self, file_path: str = "AI_Resume_Screening.csv", df: pd.DataFrame = None) -> pd.DataFrame:
        """Load dataset with advanced error handling and validation"""

        if df is not None:
            print("📥 Loading dataset from DataFrame...")
            self.df = df.copy()
        else:
            print(f"📥 Loading dataset from: {file_path}")
            try:
                self.df = self._load_from_file(file_path)
            except Exception as e:
                print(f"❌ Failed to load {file_path}: {str(e)}")
                print("🎯 Falling back to sample dataset for demonstration...")
                self.df = self._create_sample_dataset()

        if self.df is None:
            raise ValueError("Failed to load dataset")

        # Comprehensive data validation and exploration
        self._validate_dataset()
        self._generate_statistics()
        self._explore_dataset()

        return self.df

    def _load_from_file(self, file_path: str) -> pd.DataFrame:
        """Load dataset from file with multiple encoding attempts"""
        encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']

        for encoding in encodings:
            try:
                df = pd.read_csv(file_path, encoding=encoding)
                print(f"✅ Dataset loaded successfully with {encoding} encoding")
                print(f"📊 Loaded {len(df)} resumes from CSV file")
                return df
            except UnicodeDecodeError:
                continue
            except Exception as e:
                if encoding == encodings[-1]:  # Last encoding attempt
                    raise Exception(f"Failed to load dataset: {str(e)}")
                continue

        raise Exception("Failed to load dataset with any encoding")

    def _create_sample_dataset(self, n_samples: int = 500) -> pd.DataFrame:
        """Create sample dataset only as fallback (matching CSV structure)"""

        print(f"🎯 Generating {n_samples} sample resumes as fallback...")

        # Create data matching the expected CSV structure
        first_names = ["Alex", "Morgan", "Jordan", "Taylor", "Casey", "Riley", "Avery",
                      "Jamie", "Blake", "Cameron", "Drew", "Emery", "Finley", "Hayden"]
        last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller",
                     "Davis", "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez"]

        names = [f"{np.random.choice(first_names)} {np.random.choice(last_names)}"
                for _ in range(n_samples)]

        skill_sets = [
            "Python, Machine Learning, TensorFlow, Scikit-learn, Pandas, NumPy, SQL, Git",
            "Java, Spring Boot, Microservices, Docker, Kubernetes, AWS, Jenkins, Maven",
            "JavaScript, React, Node.js, Express, MongoDB, HTML, CSS, TypeScript",
            "Python, Django, PostgreSQL, Redis, Docker, AWS, RESTful APIs, Git",
            "Data Science, R, Statistics, Tableau, Power BI, Excel, SPSS, Jupyter",
            "DevOps, Kubernetes, Terraform, Ansible, Linux, Bash, Monitoring, CI/CD",
            "Machine Learning, PyTorch, Computer Vision, OpenCV, CUDA, Research",
            "Full Stack, React, Python, FastAPI, PostgreSQL, AWS, Docker, Agile"
        ]

        education_options = [
            "Bachelor of Science in Computer Science",
            "Master of Science in Data Science",
            "Bachelor of Engineering in Software Engineering",
            "PhD in Machine Learning",
            "Master of Science in Computer Science",
            "Bachelor of Technology in Information Technology"
        ]

        certification_options = [
            "AWS Certified Solutions Architect",
            "Google Cloud Professional Data Engineer",
            "Certified Kubernetes Administrator (CKA)",
            "Microsoft Azure Fundamentals",
            "TensorFlow Developer Certificate",
            None, None, None  # Some candidates have no certifications
        ]

        job_roles = ["Data Scientist", "Software Engineer", "ML Engineer", "DevOps Engineer",
                    "Frontend Developer", "Backend Developer", "Full Stack Developer",
                    "Data Engineer", "Product Manager"]

        data = []
        for i in range(n_samples):
            experience = max(0, min(20, int(np.random.gamma(2, 2))))
            education = np.random.choice(education_options)
            skills = np.random.choice(skill_sets)
            job_role = np.random.choice(job_roles)
            certification = np.random.choice(certification_options)

            # Salary correlated with experience and education
            base_salary = 60000 + (experience * 5000) + np.random.normal(0, 10000)
            if "PhD" in education:
                base_salary += 20000
            elif "Master" in education:
                base_salary += 15000
            salary = max(int(base_salary), 40000)

            # AI Score with realistic distribution
            ai_score = min(max(
                50 + (experience * 2) + np.random.normal(0, 15) +
                (10 if "PhD" in education else 5 if "Master" in education else 0),
                30), 100)

            recruiter_decision = "Hire" if ai_score > 70 else "No Hire"
            projects = max(1, int(experience / 2 + np.random.poisson(2)))

            data.append({
                'Resume_ID': i + 1,
                'Name': names[i],
                'Skills': skills,
                'Experience (Years)': experience,
                'Education': education,
                'Certifications': certification,
                'Job Role': job_role,
                'Recruiter Decision': recruiter_decision,
                'Salary Expectation ($)': salary,
                'Projects Count': projects,
                'AI Score (0-100)': int(ai_score)
            })

        df = pd.DataFrame(data)
        print(f"⚠️ Using sample dataset with {len(df)} resumes")
        return df

    def _validate_dataset(self):
        """Comprehensive dataset validation"""
        print("\n🔍 Validating dataset...")

        required_columns = ['Skills', 'Experience (Years)', 'Education']
        missing_cols = [col for col in required_columns if col not in self.df.columns]

        if missing_cols:
            print(f"⚠️ Missing required columns: {missing_cols}")
        else:
            print("✅ All required columns present")

        self.validation_results = {
            'total_rows': len(self.df),
            'total_columns': len(self.df.columns),
            'missing_values': self.df.isnull().sum().sum(),
            'duplicate_rows': self.df.duplicated().sum(),
            'memory_usage_mb': self.df.memory_usage(deep=True).sum() / 1024**2
        }

        print(f"📊 Dataset validation results:")
        for key, value in self.validation_results.items():
            if isinstance(value, float):
                print(f"   {key.replace('_', ' ').title()}: {value:.2f}")
            else:
                print(f"   {key.replace('_', ' ').title()}: {value:,}")

    def _generate_statistics(self):
        """Generate comprehensive dataset statistics"""
        print("\n📈 Generating dataset statistics...")

        self.data_stats = {
            'shape': self.df.shape,
            'columns': list(self.df.columns),
            'dtypes': dict(self.df.dtypes),
            'missing_percentage': (self.df.isnull().sum() / len(self.df) * 100).to_dict(),
        }

        # Add numeric summary if numeric columns exist
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            self.data_stats['numeric_summary'] = self.df[numeric_cols].describe().to_dict()

        # Categorical summaries
        categorical_cols = self.df.select_dtypes(include=['object']).columns
        self.data_stats['categorical_summary'] = {}
        for col in categorical_cols:
            if col in self.df.columns and not self.df[col].isnull().all():
                self.data_stats['categorical_summary'][col] = {
                    'unique_count': self.df[col].nunique(),
                    'most_common': self.df[col].value_counts().head(3).to_dict()
                }

    def _explore_dataset(self):
        """Comprehensive dataset exploration with visualizations"""
        print("\n🎨 Creating data exploration visualizations...")

        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Missing Values by Column', 'Data Types Distribution',
                           'Experience Distribution', 'AI Score Distribution'),
            specs=[[{"type": "bar"}, {"type": "pie"}],
                   [{"type": "histogram"}, {"type": "histogram"}]]
        )

        # Missing values
        missing_data = self.df.isnull().sum()
        missing_data = missing_data[missing_data > 0]
        if len(missing_data) > 0:
            fig.add_trace(
                go.Bar(x=missing_data.index, y=missing_data.values,
                      name="Missing Values", marker_color='red'),
                row=1, col=1
            )
        else:
            fig.add_trace(
                go.Bar(x=['No Missing'], y=[0], name="No Missing Values",
                      marker_color='green'),
                row=1, col=1
            )

        # Data types
        dtype_counts = self.df.dtypes.value_counts()
        fig.add_trace(
            go.Pie(labels=dtype_counts.index.astype(str), values=dtype_counts.values,
                  name="Data Types"),
            row=1, col=2
        )

        # Experience distribution (if exists)
        if 'Experience (Years)' in self.df.columns:
            fig.add_trace(
                go.Histogram(x=self.df['Experience (Years)'], name="Experience",
                           nbinsx=15, marker_color='blue'),
                row=2, col=1
            )

        # AI Score distribution (if exists)
        if 'AI Score (0-100)' in self.df.columns:
            fig.add_trace(
                go.Histogram(x=self.df['AI Score (0-100)'], name="AI Score",
                           nbinsx=20, marker_color='green'),
                row=2, col=2
            )

        fig.update_layout(
            height=800,
            showlegend=False,
            title_text="📊 Dataset Exploration Dashboard",
            title_x=0.5
        )
        fig.show()

        # Display summary table
        print("\n📋 Dataset Summary Table:")
        summary_data = []
        for col in self.df.columns:
            summary_data.append({
                'Column': col,
                'Type': str(self.df[col].dtype),
                'Non-Null Count': self.df[col].count(),
                'Missing %': round((self.df[col].isnull().sum() / len(self.df)) * 100, 2),
                'Unique Values': self.df[col].nunique()
            })

        summary_df = pd.DataFrame(summary_data)
        display(summary_df)

    def get_data_insights(self) -> Dict[str, Any]:
        """Get comprehensive data insights for reporting"""
        return {
            'validation_results': self.validation_results,
            'statistics': self.data_stats,
            'quality_score': self._calculate_quality_score()
        }

    def _calculate_quality_score(self) -> float:
        """Calculate overall data quality score"""
        if not self.validation_results:
            return 0.0

        # Quality factors
        total_cells = self.validation_results['total_rows'] * self.validation_results['total_columns']
        missing_penalty = (self.validation_results['missing_values'] / total_cells) * 100 if total_cells > 0 else 0
        duplicate_penalty = (self.validation_results['duplicate_rows'] /
                           self.validation_results['total_rows']) * 100 if self.validation_results['total_rows'] > 0 else 0

        quality_score = max(0, 100 - missing_penalty - duplicate_penalty)
        return round(quality_score, 2)

# Initialize data loader and load the actual CSV dataset
print("🚀 Initializing Advanced Data Loader...")
data_loader = ResumeDataLoader(config)

# Load the actual CSV file first
print("📁 Loading AI_Resume_Screening.csv...")
df = data_loader.load_dataset("AI_Resume_Screening.csv")

# Display results
print(f"\n✅ Dataset loaded successfully!")
print(f"📊 Shape: {df.shape}")
print(f"💾 Memory Usage: {data_loader.validation_results['memory_usage_mb']:.2f} MB")
print(f"🏆 Data Quality Score: {data_loader._calculate_quality_score()}/100")

# Show first few rows to confirm data loading
print("\n🔍 First 5 rows of loaded data:")
display(df.head())


🚀 Initializing Advanced Data Loader...
📁 Loading AI_Resume_Screening.csv...
📥 Loading dataset from: AI_Resume_Screening.csv
✅ Dataset loaded successfully with utf-8 encoding
📊 Loaded 1000 resumes from CSV file

🔍 Validating dataset...
✅ All required columns present
📊 Dataset validation results:
   Total Rows: 1,000
   Total Columns: 11
   Missing Values: 274
   Duplicate Rows: 0
   Memory Usage Mb: 0.43

📈 Generating dataset statistics...

🎨 Creating data exploration visualizations...



📋 Dataset Summary Table:


Unnamed: 0,Column,Type,Non-Null Count,Missing %,Unique Values
0,Resume_ID,int64,1000,0.0,1000
1,Name,object,1000,0.0,989
2,Skills,object,1000,0.0,238
3,Experience (Years),int64,1000,0.0,11
4,Education,object,1000,0.0,5
5,Certifications,object,726,27.4,3
6,Job Role,object,1000,0.0,4
7,Recruiter Decision,object,1000,0.0,2
8,Salary Expectation ($),int64,1000,0.0,993
9,Projects Count,int64,1000,0.0,11



✅ Dataset loaded successfully!
📊 Shape: (1000, 11)
💾 Memory Usage: 0.43 MB
🏆 Data Quality Score: 97.51/100

🔍 First 5 rows of loaded data:


Unnamed: 0,Resume_ID,Name,Skills,Experience (Years),Education,Certifications,Job Role,Recruiter Decision,Salary Expectation ($),Projects Count,AI Score (0-100)
0,1,Ashley Ali,"TensorFlow, NLP, Pytorch",10,B.Sc,,AI Researcher,Hire,104895,8,100
1,2,Wesley Roman,"Deep Learning, Machine Learning, Python, SQL",10,MBA,Google ML,Data Scientist,Hire,113002,1,100
2,3,Corey Sanchez,"Ethical Hacking, Cybersecurity, Linux",1,MBA,Deep Learning Specialization,Cybersecurity Analyst,Hire,71766,7,70
3,4,Elizabeth Carney,"Python, Pytorch, TensorFlow",7,B.Tech,AWS Certified,AI Researcher,Hire,46848,0,95
4,5,Julie Hill,"SQL, React, Java",4,PhD,,Software Engineer,Hire,87441,9,100


In [13]:
# 🎯 Advanced Feature Extraction System
# Comprehensive feature engineering for resume screening

from typing import Dict, List, Any, Tuple
import time # Import the time module
import nltk # Import the nltk module
from nltk.tokenize import sent_tokenize # Import sent_tokenize
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

# Download required NLTK data (including punkt_tab)
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
    nltk.data.find('tokenizers/punkt_tab') # Add this line to check for punkt_tab
except LookupError:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt_tab', quiet=True) # Add this line to download punkt_tab


class AdvancedFeatureExtractor:
    """Advanced feature extraction system for resume analysis"""

    def __init__(self, config: Config):
        self.config = config
        self.feature_stats = {}
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 2),
            stop_words='english',
            lowercase=True
        )
        self.scaler = StandardScaler()

    def extract_all_features(self, df: pd.DataFrame) -> Dict[str, np.ndarray]:
        """Extract all features from the dataset"""
        print("🎯 Extracting comprehensive features...")
        start_time = time.time()

        features = {}

        # 1. Technical Skills Features
        print("   📚 Extracting technical skills features...")
        features['skills_features'] = self._extract_skills_features(df)

        # 2. Experience Features
        print("   💼 Extracting experience features...")
        features['experience_features'] = self._extract_experience_features(df)

        # 3. Education Features
        print("   🎓 Extracting education features...")
        features['education_features'] = self._extract_education_features(df)

        # 4. Text Similarity Features
        print("   📝 Extracting text similarity features...")
        features['text_features'] = self._extract_text_features(df)

        # 5. Professional Features
        print("   🏢 Extracting professional features...")
        features['professional_features'] = self._extract_professional_features(df)

        # 6. Statistical Features
        print("   📊 Extracting statistical features...")
        features['statistical_features'] = self._extract_statistical_features(df)

        # Store extraction statistics
        extraction_time = time.time() - start_time
        self.feature_stats = {
            'total_features': sum(feat.shape[1] if len(feat.shape) > 1 else 1
                                for feat in features.values()),
            'extraction_time': extraction_time,
            'features_per_second': len(df) / extraction_time,
            'feature_categories': list(features.keys())
        }

        print(f"✅ Feature extraction complete!")
        print(f"   📊 Total features: {self.feature_stats['total_features']}")
        print(f"   ⏱️ Time: {extraction_time:.2f}s")

        return features

    def _extract_skills_features(self, df: pd.DataFrame) -> np.ndarray:
        """Extract advanced skills-based features"""

        skills_features = []

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Extracting skills"):
            features_row = []

            # Get processed skills
            skills_text = row.get('skills_processed', '')
            if pd.isna(skills_text):
                skills_text = row.get('Skills', '')

            skills_list = str(skills_text).lower().split()

            # Technical skills count by category
            for category, category_skills in TECHNICAL_SKILLS.items():
                count = sum(1 for skill in skills_list if skill in category_skills)
                features_row.append(count)

            # Total skills count
            features_row.append(len(skills_list))

            # Skills diversity (unique categories with skills)
            categories_with_skills = sum(1 for category, category_skills in TECHNICAL_SKILLS.items()
                                       if any(skill in skills_list for skill in category_skills))
            features_row.append(categories_with_skills)

            # Advanced skills indicators
            advanced_indicators = ['senior', 'lead', 'architect', 'expert', 'advanced']
            advanced_count = sum(1 for indicator in advanced_indicators
                               if indicator in skills_text.lower())
            features_row.append(advanced_count)

            # Programming languages count
            prog_languages = sum(1 for skill in skills_list
                               if skill in TECHNICAL_SKILLS['programming_languages'])
            features_row.append(prog_languages)

            # Framework knowledge
            frameworks = sum(1 for skill in skills_list
                           if skill in TECHNICAL_SKILLS['frameworks_libraries'])
            features_row.append(frameworks)

            # Cloud platform experience
            cloud_skills = sum(1 for skill in skills_list
                             if skill in TECHNICAL_SKILLS['cloud_platforms'])
            features_row.append(cloud_skills)

            skills_features.append(features_row)

        return np.array(skills_features)

    def _extract_experience_features(self, df: pd.DataFrame) -> np.ndarray:
        """Extract experience-based features"""

        experience_features = []

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Analyzing skills"):
            features_row = []

            # Years of experience
            experience_years = row.get('Experience (Years)', 0)
            if pd.isna(experience_years):
                experience_years = 0
            features_row.append(float(experience_years))

            # Experience level categories
            if experience_years == 0:
                exp_level = [1, 0, 0, 0]  # Entry level
            elif experience_years <= 2:
                exp_level = [0, 1, 0, 0]  # Junior
            elif experience_years <= 5:
                exp_level = [0, 0, 1, 0]  # Mid level
            else:
                exp_level = [0, 0, 0, 1]  # Senior
            features_row.extend(exp_level)

            # Projects count
            projects_count = row.get('Projects Count', 0)
            if pd.isna(projects_count):
                projects_count = 0
            features_row.append(float(projects_count))

            # Experience-to-projects ratio
            if projects_count > 0 and experience_years > 0:
                exp_project_ratio = experience_years / projects_count
            else:
                exp_project_ratio = 0
            features_row.append(exp_project_ratio)

            # Salary expectation (normalized)
            salary = row.get('Salary Expectation ($)', 0)
            if pd.isna(salary):
                salary = 0
            normalized_salary = salary / 100000  # Normalize to 0-2 range typically
            features_row.append(normalized_salary)

            experience_features.append(features_row)

        return np.array(experience_features)

    def _extract_education_features(self, df: pd.DataFrame) -> np.ndarray:
        """Extract education-based features"""

        education_features = []

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Analyzing skills"):
            features_row = []

            education_text = str(row.get('Education', '')).lower()

            # Education level score
            education_score = 0
            for level, score in EDUCATION_HIERARCHY.items():
                if level in education_text:
                    education_score = max(education_score, score)
            features_row.append(education_score)

            # Specific degree types (one-hot encoded)
            degree_types = ['computer science', 'data science', 'engineering',
                          'mathematics', 'statistics', 'business']
            for degree_type in degree_types:
                has_degree = 1 if degree_type in education_text else 0
                features_row.append(has_degree)

            # Advanced degree indicator
            advanced_degrees = ['phd', 'doctorate', 'master', 'mba']
            has_advanced = 1 if any(degree in education_text for degree in advanced_degrees) else 0
            features_row.append(has_advanced)

            # Technical education indicator
            technical_keywords = ['computer', 'engineering', 'technology', 'science', 'data']
            has_technical = 1 if any(keyword in education_text for keyword in technical_keywords) else 0
            features_row.append(has_technical)

            education_features.append(features_row)

        return np.array(education_features)

    def _extract_text_features(self, df: pd.DataFrame) -> np.ndarray:
        """Extract text-based statistical features"""

        text_features = []

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Analyzing skills"):
            features_row = []

            # Combined text analysis
            combined_text = row.get('combined_text', '')
            if pd.isna(combined_text):
                combined_text = ''

            # Text length features
            features_row.append(len(combined_text))  # Character count
            features_row.append(len(combined_text.split()))  # Word count

            # Skills text analysis
            skills_text = str(row.get('Skills', ''))
            features_row.append(len(skills_text))
            features_row.append(len(skills_text.split(',')))  # Comma-separated skills

            # Text complexity features
            sentences = sent_tokenize(combined_text)
            features_row.append(len(sentences))  # Sentence count

            if len(sentences) > 0:
                avg_sentence_length = sum(len(sent.split()) for sent in sentences) / len(sentences)
            else:
                avg_sentence_length = 0
            features_row.append(avg_sentence_length)

            # Vocabulary richness
            words = combined_text.lower().split()
            unique_words = set(words)
            vocabulary_richness = len(unique_words) / len(words) if len(words) > 0 else 0
            features_row.append(vocabulary_richness)

            text_features.append(features_row)

        return np.array(text_features)

    def _extract_professional_features(self, df: pd.DataFrame) -> np.ndarray:
        """Extract professional experience features"""

        professional_features = []

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Analyzing skills"):
            features_row = []

            # Job role analysis
            job_role = str(row.get('Job Role', '')).lower()

            # Job role categories (one-hot encoded)
            role_categories = {
                'data_science': ['data scientist', 'data analyst', 'machine learning'],
                'software_engineering': ['software engineer', 'developer', 'programmer'],
                'devops': ['devops', 'infrastructure', 'cloud engineer'],
                'management': ['manager', 'lead', 'director', 'head'],
                'research': ['researcher', 'scientist', 'research']
            }

            for category, keywords in role_categories.items():
                has_role = 1 if any(keyword in job_role for keyword in keywords) else 0
                features_row.append(has_role)

            # Certifications analysis
            certifications = str(row.get('Certifications', '')).lower()

            # Has certifications
            has_certs = 1 if certifications and certifications != 'none' and certifications != 'nan' else 0
            features_row.append(has_certs)

            # Certification types
            cert_types = ['aws', 'google', 'microsoft', 'azure', 'oracle', 'cisco']
            for cert_type in cert_types:
                has_cert_type = 1 if cert_type in certifications else 0
                features_row.append(has_cert_type)

            # AI Score (if available as ground truth)
            ai_score = row.get('AI Score (0-100)', 0)
            if pd.isna(ai_score):
                ai_score = 0
            features_row.append(float(ai_score) / 100)  # Normalize to 0-1

            professional_features.append(features_row)

        return np.array(professional_features)

    def _extract_statistical_features(self, df: pd.DataFrame) -> np.ndarray:
        """Extract statistical and derived features"""

        statistical_features = []

        # Calculate dataset-wide statistics for normalization
        if 'Experience (Years)' in df.columns:
            exp_mean = df['Experience (Years)'].mean()
            exp_std = df['Experience (Years)'].std()
        else:
            exp_mean = exp_std = 0

        if 'Salary Expectation ($)' in df.columns:
            salary_mean = df['Salary Expectation ($)'].mean()
            salary_std = df['Salary Expectation ($)'].std()
        else:
            salary_mean = salary_std = 0

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Analyzing skills"):
            features_row = []

            # Normalized experience (z-score)
            experience = row.get('Experience (Years)', 0)
            if exp_std > 0:
                norm_experience = (experience - exp_mean) / exp_std
            else:
                norm_experience = 0
            features_row.append(norm_experience)

            # Normalized salary expectation
            salary = row.get('Salary Expectation ($)', 0)
            if salary_std > 0:
                norm_salary = (salary - salary_mean) / salary_std
            else:
                norm_salary = 0
            features_row.append(norm_salary)

            # Experience-salary ratio indicator
            if experience > 0 and salary > 0:
                exp_salary_ratio = salary / (experience * 10000)  # Rough industry standard
            else:
                exp_salary_ratio = 0
            features_row.append(exp_salary_ratio)

            # Completeness score (how much information is provided)
            total_fields = 0
            completed_fields = 0

            key_fields = ['Skills', 'Experience (Years)', 'Education', 'Job Role']
            for field in key_fields:
                total_fields += 1
                if field in row and pd.notna(row[field]) and str(row[field]).strip():
                    completed_fields += 1

            completeness_score = completed_fields / total_fields if total_fields > 0 else 0
            features_row.append(completeness_score)

            statistical_features.append(features_row)

        return np.array(statistical_features)

    def create_feature_matrix(self, features_dict: Dict[str, np.ndarray]) -> Tuple[np.ndarray, List[str]]:
        """Combine all features into a single matrix with feature names"""

        print("🔗 Creating unified feature matrix...")

        # Combine all feature arrays
        feature_arrays = []
        feature_names = []

        # Skills features
        if 'skills_features' in features_dict:
            feature_arrays.append(features_dict['skills_features'])
            skill_categories = list(TECHNICAL_SKILLS.keys())
            skill_names = (skill_categories +
                          ['total_skills', 'skill_diversity', 'advanced_indicators',
                           'programming_languages', 'frameworks', 'cloud_skills'])
            feature_names.extend([f"skills_{name}" for name in skill_names])

        # Experience features
        if 'experience_features' in features_dict:
            feature_arrays.append(features_dict['experience_features'])
            exp_names = ['experience_years', 'entry_level', 'junior_level',
                        'mid_level', 'senior_level', 'projects_count',
                        'exp_project_ratio', 'normalized_salary']
            feature_names.extend([f"exp_{name}" for name in exp_names])

        # Education features
        if 'education_features' in features_dict:
            feature_arrays.append(features_dict['education_features'])
            edu_names = (['education_score'] +
                        [f"degree_{dt.replace(' ', '_')}" for dt in
                         ['computer_science', 'data_science', 'engineering',
                          'mathematics', 'statistics', 'business']] +
                        ['has_advanced_degree', 'has_technical_education'])
            feature_names.extend([f"edu_{name}" for name in edu_names])

        # Text features
        if 'text_features' in features_dict:
            feature_arrays.append(features_dict['text_features'])
            text_names = ['char_count', 'word_count', 'skills_char_count',
                         'skills_comma_count', 'sentence_count',
                         'avg_sentence_length', 'vocabulary_richness']
            feature_names.extend([f"text_{name}" for name in text_names])

        # Professional features
        if 'professional_features' in features_dict:
            feature_arrays.append(features_dict['professional_features'])
            prof_names = (['role_data_science', 'role_software_engineering',
                          'role_devops', 'role_management', 'role_research',
                          'has_certifications'] +
                         [f"cert_{ct}" for ct in ['aws', 'google', 'microsoft',
                                                 'azure', 'oracle', 'cisco']] +
                         ['ai_score_normalized'])
            feature_names.extend([f"prof_{name}" for name in prof_names])

        # Statistical features
        if 'statistical_features' in features_dict:
            feature_arrays.append(features_dict['statistical_features'])
            stat_names = ['norm_experience', 'norm_salary', 'exp_salary_ratio',
                         'completeness_score']
            feature_names.extend([f"stat_{name}" for name in stat_names])

        # Combine all features
        if feature_arrays:
            combined_features = np.hstack(feature_arrays)
        else:
            combined_features = np.array([])

        print(f"✅ Feature matrix created: {combined_features.shape}")
        print(f"   📊 Features: {len(feature_names)}")

        return combined_features, feature_names

    def get_feature_importance(self, features: np.ndarray, feature_names: List[str],
                             target: np.ndarray = None) -> pd.DataFrame:
        """Calculate feature importance scores"""

        if target is None:
            # Use AI Score as target if available
            return pd.DataFrame({
                'feature': feature_names,
                'importance': np.random.random(len(feature_names))  # Placeholder
            }).sort_values('importance', ascending=False)

        # Calculate correlation with target
        correlations = []
        for i, feature_name in enumerate(feature_names):
            if i < features.shape[1]:
                corr = np.corrcoef(features[:, i], target)[0, 1]
                correlations.append(abs(corr) if not np.isnan(corr) else 0)
            else:
                correlations.append(0)

        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': correlations
        }).sort_values('importance', ascending=False)

        return importance_df

    def get_extraction_stats(self) -> Dict[str, Any]:
        """Get feature extraction statistics"""
        return self.feature_stats

# Initialize and run feature extraction
print("🎯 Initializing Advanced Feature Extractor...")
feature_extractor = AdvancedFeatureExtractor(config)

# Extract all features
features_dict = feature_extractor.extract_all_features(df)

# Create unified feature matrix
feature_matrix, feature_names = feature_extractor.create_feature_matrix(features_dict)

# Calculate feature importance (if AI Score available)
if 'AI Score (0-100)' in df.columns:
    feature_importance = feature_extractor.get_feature_importance(
        feature_matrix, feature_names, df['AI Score (0-100)'].values
    )
    print("\n🏆 Top 10 Most Important Features:")
    display(feature_importance.head(10))

# Display extraction statistics
print("\n📊 Feature Extraction Statistics:")
stats = feature_extractor.get_extraction_stats()
for key, value in stats.items():
    if isinstance(value, float):
        print(f"   {key.replace('_', ' ').title()}: {value:.3f}")
    else:
        print(f"   {key.replace('_', ' ').title()}: {value}")

print("✅ Feature extraction system ready!")

🎯 Initializing Advanced Feature Extractor...
🎯 Extracting comprehensive features...
   📚 Extracting technical skills features...


Extracting skills:   0%|          | 0/1000 [00:00<?, ?it/s]

   💼 Extracting experience features...


Analyzing skills:   0%|          | 0/1000 [00:00<?, ?it/s]

   🎓 Extracting education features...


Analyzing skills:   0%|          | 0/1000 [00:00<?, ?it/s]

   📝 Extracting text similarity features...


Analyzing skills:   0%|          | 0/1000 [00:00<?, ?it/s]

   🏢 Extracting professional features...


Analyzing skills:   0%|          | 0/1000 [00:00<?, ?it/s]

   📊 Extracting statistical features...


Analyzing skills:   0%|          | 0/1000 [00:00<?, ?it/s]

✅ Feature extraction complete!
   📊 Total features: 53
   ⏱️ Time: 0.44s
🔗 Creating unified feature matrix...
✅ Feature matrix created: (1000, 53)
   📊 Features: 53

🏆 Top 10 Most Important Features:


Unnamed: 0,feature,importance
48,prof_ai_score_normalized,1.0
49,stat_norm_experience,0.777042
12,exp_experience_years,0.777042
16,exp_senior_level,0.630057
13,exp_entry_level,0.522715
14,exp_junior_level,0.496128
51,stat_exp_salary_ratio,0.375253
17,exp_projects_count,0.358666
18,exp_exp_project_ratio,0.278189
32,text_skills_comma_count,0.120345



📊 Feature Extraction Statistics:
   Total Features: 53
   Extraction Time: 0.437
   Features Per Second: 2289.521
   Feature Categories: ['skills_features', 'experience_features', 'education_features', 'text_features', 'professional_features', 'statistical_features']
✅ Feature extraction system ready!


In [14]:
# 🧠 Advanced BERT Embedding Engine
# Optimized for Google Colab with memory management and batch processing

from typing import List, Dict, Any
import time
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import logging
logging.set_verbosity_error()  # Suppress transformer warnings
from sentence_transformers import SentenceTransformer # Import SentenceTransformer
from tqdm.auto import tqdm
import gc

# Determine the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class BERTEmbeddingEngine:
    """Advanced BERT embedding system with GPU optimization"""

    def __init__(self, config: Config):
        self.config = config
        self.device = device
        self.model = None
        self.tokenizer = None
        self.embedding_cache = {}
        self.processing_stats = {}

        # Initialize model
        self._initialize_model()

    def _initialize_model(self):
        """Initialize BERT model with error handling"""
        print(f"🧠 Initializing BERT model: {self.config.EMBEDDING_MODEL}")

        try:
            # Use SentenceTransformers for optimized embeddings
            self.model = SentenceTransformer(self.config.EMBEDDING_MODEL)
            self.model.to(self.device)

            # Set to evaluation mode for inference
            self.model.eval()

            print(f"✅ Model loaded successfully on {self.device}")
            print(f"   📏 Max sequence length: {self.config.MAX_SEQUENCE_LENGTH}")
            print(f"   🔢 Embedding dimension: {self.model.get_sentence_embedding_dimension()}")

        except Exception as e:
            print(f"❌ Error loading model: {str(e)}")
            print("🔄 Falling back to alternative model...")

            # Fallback to a smaller model
            try:
                self.model = SentenceTransformer('all-MiniLM-L6-v2')
                self.model.to(self.device)
                print("✅ Fallback model loaded successfully")
            except Exception as e2:
                print(f"❌ Fallback failed: {str(e2)}")
                raise Exception("Failed to load any BERT model")

    def generate_embeddings(self, texts: List[str],
                          description: str = "Generating embeddings") -> np.ndarray:
        """Generate BERT embeddings with optimized batch processing"""

        if not texts:
            return np.array([])

        print(f"🧠 Generating embeddings for {len(texts)} texts...")
        start_time = time.time()

        # Filter out empty texts
        valid_texts = [text if text and str(text).strip() else "empty text" for text in texts]

        try:
            # Generate embeddings in batches for memory efficiency
            embeddings = []

            with torch.no_grad():
                for i in tqdm(range(0, len(valid_texts), self.config.BATCH_SIZE),
                             desc=description):
                    batch_texts = valid_texts[i:i + self.config.BATCH_SIZE]

                    # Generate embeddings for batch
                    batch_embeddings = self.model.encode(
                        batch_texts,
                        batch_size=len(batch_texts),
                        show_progress_bar=False,
                        convert_to_numpy=True,
                        normalize_embeddings=True  # L2 normalization for better similarity
                    )

                    embeddings.append(batch_embeddings)

                    # Clear GPU cache periodically
                    if torch.cuda.is_available() and i % (self.config.BATCH_SIZE * 4) == 0:
                        torch.cuda.empty_cache()

            # Combine all embeddings
            all_embeddings = np.vstack(embeddings)

            # Store processing statistics
            processing_time = time.time() - start_time
            self.processing_stats.update({
                'last_batch_size': len(texts),
                'last_processing_time': processing_time,
                'embeddings_per_second': len(texts) / processing_time,
                'embedding_dimension': all_embeddings.shape[1]
            })

            print(f"✅ Embeddings generated successfully!")
            print(f"   📊 Shape: {all_embeddings.shape}")
            print(f"   ⏱️ Time: {processing_time:.2f}s ({self.processing_stats['embeddings_per_second']:.1f}/s)")

            return all_embeddings

        except Exception as e:
            print(f"❌ Error generating embeddings: {str(e)}")
            # Return zero embeddings as fallback
            embedding_dim = self.model.get_sentence_embedding_dimension()
            return np.zeros((len(texts), embedding_dim))

    def generate_job_embedding(self, job_description: str) -> np.ndarray:
        """Generate embedding for job description"""

        if not job_description or not job_description.strip():
            job_description = "General position requiring technical skills"

        print(f"💼 Generating job description embedding...")

        # Enhanced job description with context
        enhanced_description = self._enhance_job_description(job_description)

        embedding = self.generate_embeddings([enhanced_description],
                                           description="Processing job description")

        return embedding[0] if len(embedding) > 0 else np.array([])

    def _enhance_job_description(self, job_description: str) -> str:
        """Enhance job description with contextual information"""

        # Add context for better matching
        context_prefix = "Job requirements and responsibilities: "

        # Ensure minimum length for better embeddings
        if len(job_description.split()) < 10:
            job_description += " Looking for qualified candidates with relevant experience and skills."

        return context_prefix + job_description

    def generate_resume_embeddings(self, df: pd.DataFrame) -> np.ndarray:
        """Generate embeddings for all resumes in dataset"""

        print("📄 Preparing resume texts for embedding generation...")

        # Prepare resume texts
        resume_texts = []
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Preparing texts"):
            resume_text = self._create_resume_text(row)
            resume_texts.append(resume_text)

        # Generate embeddings
        embeddings = self.generate_embeddings(resume_texts,
                                            description="Generating resume embeddings")

        return embeddings

    def _create_resume_text(self, row: pd.Series) -> str:
        """Create comprehensive resume text for embedding"""

        text_parts = []

        # Add skills with emphasis
        if 'skills_processed' in row and pd.notna(row['skills_processed']):
            skills_text = f"Technical skills: {row['skills_processed']}"
            text_parts.append(skills_text)
        elif 'Skills' in row and pd.notna(row['Skills']):
            skills_text = f"Technical skills: {row['Skills']}"
            text_parts.append(skills_text)

        # Add experience
        if 'Experience (Years)' in row and pd.notna(row['Experience (Years)']):
            exp_years = row['Experience (Years)']
            exp_text = f"Professional experience: {exp_years} years"
            text_parts.append(exp_text)

        # Add education
        if 'education_processed' in row and pd.notna(row['education_processed']):
            edu_text = f"Education: {row['education_processed']}"
            text_parts.append(edu_text)
        elif 'Education' in row and pd.notna(row['Education']):
            edu_text = f"Education: {row['Education']}"
            text_parts.append(edu_text)

        # Add job role
        if 'Job Role' in row and pd.notna(row['Job Role']):
            role_text = f"Job role: {row['Job Role']}"
            text_parts.append(role_text)

        # Add certifications
        if 'Certifications' in row and pd.notna(row['Certifications']) and str(row['Certifications']).lower() != 'none':
            cert_text = f"Certifications: {row['Certifications']}"
            text_parts.append(cert_text)

        # Add projects information
        if 'Projects Count' in row and pd.notna(row['Projects Count']):
            projects = row['Projects Count']
            if projects > 0:
                project_text = f"Completed {projects} projects"
                text_parts.append(project_text)

        # Combine all parts
        resume_text = ". ".join(text_parts)

        # Ensure minimum length
        if len(resume_text.split()) < 5:
            resume_text += " Candidate seeking new opportunities in technology."

        return resume_text

    def calculate_semantic_similarity(self, resume_embeddings: np.ndarray,
                                    job_embedding: np.ndarray) -> np.ndarray:
        """Calculate semantic similarity between resumes and job description"""

        print("🔍 Calculating semantic similarity scores...")

        if len(resume_embeddings) == 0 or len(job_embedding) == 0:
            print("⚠️ Empty embeddings detected, returning zero similarities")
            return np.zeros(len(resume_embeddings))

        # Reshape job embedding for matrix operations
        job_embedding = job_embedding.reshape(1, -1)

        # Calculate cosine similarity
        similarities = cosine_similarity(resume_embeddings, job_embedding)

        # Flatten to 1D array
        similarity_scores = similarities.flatten()

        print(f"✅ Similarity calculation complete!")
        print(f"   📊 Mean similarity: {similarity_scores.mean():.3f}")
        print(f"   📈 Max similarity: {similarity_scores.max():.3f}")
        print(f"   📉 Min similarity: {similarity_scores.min():.3f}")

        return similarity_scores

    def batch_similarity_calculation(self, resume_embeddings: np.ndarray,
                                   job_descriptions: List[str]) -> np.ndarray:
        """Calculate similarities for multiple job descriptions efficiently"""

        print(f"🔄 Processing {len(job_descriptions)} job descriptions...")

        all_similarities = []

        for i, job_desc in enumerate(job_descriptions):
            print(f"   Processing job {i+1}/{len(job_descriptions)}")

            job_embedding = self.generate_job_embedding(job_desc)
            similarities = self.calculate_semantic_similarity(resume_embeddings, job_embedding)
            all_similarities.append(similarities)

        return np.array(all_similarities).T  # Shape: (n_resumes, n_jobs)

    def get_embedding_statistics(self) -> Dict[str, Any]:
        """Get embedding generation statistics"""

        base_stats = {
            'model_name': self.config.EMBEDDING_MODEL,
            'device': str(self.device),
            'batch_size': self.config.BATCH_SIZE,
            'max_sequence_length': self.config.MAX_SEQUENCE_LENGTH
        }

        if hasattr(self.model, 'get_sentence_embedding_dimension'):
            base_stats['embedding_dimension'] = self.model.get_sentence_embedding_dimension()

        # Add processing stats if available
        base_stats.update(self.processing_stats)

        return base_stats

    def clear_cache(self):
        """Clear embedding cache and GPU memory"""
        self.embedding_cache.clear()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
        print("🧹 Cache and GPU memory cleared")

# Initialize BERT embedding engine
print("🧠 Initializing BERT Embedding Engine...")
bert_engine = BERTEmbeddingEngine(config)

# Generate embeddings for all resumes
print("\n📄 Generating embeddings for resume dataset...")
resume_embeddings = bert_engine.generate_resume_embeddings(df)

# Display embedding statistics
print("\n📊 BERT Embedding Statistics:")
embedding_stats = bert_engine.get_embedding_statistics()
for key, value in embedding_stats.items():
    if isinstance(value, float):
        print(f"   {key.replace('_', ' ').title()}: {value:.3f}")
    else:
        print(f"   {key.replace('_', ' ').title()}: {value}")

# Memory management
bert_engine.clear_cache()

print("✅ BERT embedding engine ready!")
print(f"🧠 Resume embeddings shape: {resume_embeddings.shape}")

Using device: cpu
🧠 Initializing BERT Embedding Engine...
🧠 Initializing BERT model: all-MiniLM-L6-v2
✅ Model loaded successfully on cpu
   📏 Max sequence length: 512
   🔢 Embedding dimension: 384

📄 Generating embeddings for resume dataset...
📄 Preparing resume texts for embedding generation...


Preparing texts:   0%|          | 0/1000 [00:00<?, ?it/s]

🧠 Generating embeddings for 1000 texts...


Generating resume embeddings:   0%|          | 0/63 [00:00<?, ?it/s]

✅ Embeddings generated successfully!
   📊 Shape: (1000, 384)
   ⏱️ Time: 14.25s (70.2/s)

📊 BERT Embedding Statistics:
   Model Name: all-MiniLM-L6-v2
   Device: cpu
   Batch Size: 16
   Max Sequence Length: 512
   Embedding Dimension: 384
   Last Batch Size: 1000
   Last Processing Time: 14.250
   Embeddings Per Second: 70.178
🧹 Cache and GPU memory cleared
✅ BERT embedding engine ready!
🧠 Resume embeddings shape: (1000, 384)


In [15]:
# 🔍 Advanced Similarity Calculation Module
# Multi-modal similarity scoring with weighted combinations

class AdvancedSimilarityCalculator:
    """Advanced similarity calculation with multiple matching strategies"""

    def __init__(self, config: Config):
        self.config = config
        self.tfidf_vectorizer = None
        self.similarity_stats = {}

    def calculate_comprehensive_similarity(self,
                                         resume_embeddings: np.ndarray,
                                         job_description: str,
                                         df: pd.DataFrame) -> Dict[str, np.ndarray]:
        """Calculate comprehensive similarity scores using multiple methods"""

        print("🔍 Calculating comprehensive similarity scores...")
        start_time = time.time()

        similarities = {}

        # 1. Semantic similarity using BERT embeddings
        print("   🧠 Computing semantic similarity...")
        job_embedding = bert_engine.generate_job_embedding(job_description)
        similarities['semantic'] = bert_engine.calculate_semantic_similarity(
            resume_embeddings, job_embedding
        )

        # 2. TF-IDF based skill similarity
        print("   📚 Computing skill similarity...")
        similarities['skills'] = self._calculate_skill_similarity(job_description, df)

        # 3. Experience-based similarity
        print("   💼 Computing experience similarity...")
        similarities['experience'] = self._calculate_experience_similarity(job_description, df)

        # 4. Education-based similarity
        print("   🎓 Computing education similarity...")
        similarities['education'] = self._calculate_education_similarity(job_description, df)

        # 5. Combined weighted similarity
        print("   ⚖️ Computing weighted similarity...")
        similarities['weighted'] = self._calculate_weighted_similarity(similarities)

        # Store calculation statistics
        calculation_time = time.time() - start_time
        self.similarity_stats = {
            'calculation_time': calculation_time,
            'resumes_processed': len(df),
            'similarity_methods': list(similarities.keys()),
            'processing_speed': len(df) / calculation_time
        }

        print(f"✅ Similarity calculation complete!")
        print(f"   ⏱️ Time: {calculation_time:.2f}s")
        print(f"   🚀 Speed: {self.similarity_stats['processing_speed']:.1f} resumes/second")

        return similarities

    def _calculate_skill_similarity(self, job_description: str, df: pd.DataFrame) -> np.ndarray:
        """Calculate TF-IDF based skill similarity"""

        # Extract skills from job description
        job_skills = self._extract_job_skills(job_description)

        # Prepare resume skills texts
        resume_skills = []
        for _, row in df.iterrows():
            if 'skills_processed' in row and pd.notna(row['skills_processed']):
                skills_text = str(row['skills_processed'])
            else:
                skills_text = str(row.get('Skills', ''))
            resume_skills.append(skills_text.lower())

        # Create TF-IDF vectors
        if self.tfidf_vectorizer is None:
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=1000,
                ngram_range=(1, 2),
                stop_words='english',
                lowercase=True,
                token_pattern=r'\b[a-zA-Z][a-zA-Z0-9\+\#\.]*\b'  # Include tech symbols
            )

            # Fit on all resume skills + job skills
            all_texts = resume_skills + [job_skills]
            self.tfidf_vectorizer.fit(all_texts)

        # Transform texts to vectors
        resume_vectors = self.tfidf_vectorizer.transform(resume_skills)
        job_vector = self.tfidf_vectorizer.transform([job_skills])

        # Calculate cosine similarity
        skill_similarities = cosine_similarity(resume_vectors, job_vector).flatten()

        # Apply skill-specific bonuses
        skill_similarities = self._apply_skill_bonuses(skill_similarities, job_description, df)

        return skill_similarities

    def _extract_job_skills(self, job_description: str) -> str:
        """Extract and enhance skills from job description"""

        job_text = job_description.lower()

        # Extract technical skills mentioned in job description
        extracted_skills = []

        # Use skill patterns from preprocessor
        for skill in ALL_TECHNICAL_SKILLS:
            if skill.lower() in job_text:
                extracted_skills.append(skill)

        # Add common skill variations and synonyms
        skill_synonyms = {
            'python': ['python', 'py', 'python3'],
            'javascript': ['javascript', 'js', 'node.js', 'nodejs'],
            'machine learning': ['machine learning', 'ml', 'artificial intelligence', 'ai'],
            'data science': ['data science', 'data analysis', 'analytics'],
            'aws': ['aws', 'amazon web services', 'cloud'],
            'docker': ['docker', 'containerization', 'containers']
        }

        for base_skill, synonyms in skill_synonyms.items():
            if any(syn in job_text for syn in synonyms):
                extracted_skills.extend(synonyms)

        # Enhance with context
        if extracted_skills:
            return ' '.join(set(extracted_skills))
        else:
            return job_description  # Fallback to full description

    def _apply_skill_bonuses(self, similarities: np.ndarray,
                           job_description: str, df: pd.DataFrame) -> np.ndarray:
        """Apply bonuses for specific skill matches"""

        job_text = job_description.lower()
        enhanced_similarities = similarities.copy()

        # High-value skills that get bonuses
        high_value_skills = {
            'machine learning': 0.1,
            'deep learning': 0.1,
            'python': 0.05,
            'aws': 0.05,
            'kubernetes': 0.08,
            'tensorflow': 0.08,
            'pytorch': 0.08
        }

        for i, (_, row) in enumerate(df.iterrows()):
            if i >= len(enhanced_similarities):
                break

            skills_text = str(row.get('Skills', '')).lower()

            for skill, bonus in high_value_skills.items():
                # If both job and resume mention this high-value skill
                if skill in job_text and skill in skills_text:
                    enhanced_similarities[i] += bonus

        # Normalize to [0, 1] range
        enhanced_similarities = np.clip(enhanced_similarities, 0, 1)

        return enhanced_similarities

    def _calculate_experience_similarity(self, job_description: str, df: pd.DataFrame) -> np.ndarray:
        """Calculate experience-based similarity"""

        # Extract required experience from job description
        required_experience = self._extract_required_experience(job_description)

        experience_similarities = []

        for _, row in df.iterrows():
            candidate_experience = row.get('Experience (Years)', 0)
            if pd.isna(candidate_experience):
                candidate_experience = 0

            # Calculate experience match score
            if required_experience == 0:
                # No specific requirement, give moderate score
                exp_score = 0.7
            elif candidate_experience >= required_experience:
                # Meets or exceeds requirement
                if candidate_experience <= required_experience * 1.5:
                    exp_score = 1.0  # Perfect match
                else:
                    # Overqualified (might be expensive)
                    exp_score = 0.8
            else:
                # Under-qualified, but might be trainable
                ratio = candidate_experience / required_experience if required_experience > 0 else 0
                exp_score = max(0.3, ratio * 0.8)

            experience_similarities.append(exp_score)

        return np.array(experience_similarities)

    def _extract_required_experience(self, job_description: str) -> int:
        """Extract required years of experience from job description"""

        job_text = job_description.lower()

        # Pattern matching for experience requirements
        experience_patterns = [
            r'(\d+)\+?\s*years?\s*(?:of\s*)?experience',
            r'(\d+)\+?\s*years?\s*(?:of\s*)?expertise',
            r'minimum\s*(\d+)\s*years?',
            r'at\s*least\s*(\d+)\s*years?',
            r'(\d+)\+\s*years?'
        ]

        for pattern in experience_patterns:
            matches = re.findall(pattern, job_text)
            if matches:
                return int(matches[0])

        # If no specific requirement found, check for level indicators
        if any(term in job_text for term in ['senior', 'lead', 'principal']):
            return 5
        elif any(term in job_text for term in ['mid-level', 'intermediate']):
            return 3
        elif any(term in job_text for term in ['junior', 'entry-level', 'graduate']):
            return 1

        return 0  # No specific requirement

    def _calculate_education_similarity(self, job_description: str, df: pd.DataFrame) -> np.ndarray:
        """Calculate education-based similarity"""

        # Extract required education from job description
        required_education = self._extract_required_education(job_description)

        education_similarities = []

        for _, row in df.iterrows():
            candidate_education = str(row.get('Education', '')).lower()

            # Calculate education match score
            candidate_level = 0
            for level, score in EDUCATION_HIERARCHY.items():
                if level in candidate_education:
                    candidate_level = max(candidate_level, score)

            # Compare with required level
            if required_education == 0:
                # No specific requirement
                edu_score = 0.8
            elif candidate_level >= required_education:
                edu_score = 1.0
            elif candidate_level >= required_education - 1:
                # Close match (e.g., Bachelor when Master preferred)
                edu_score = 0.8
            else:
                # Significant gap
                edu_score = 0.5

            # Bonus for relevant field
            relevant_fields = ['computer', 'engineering', 'science', 'technology', 'data']
            if any(field in candidate_education for field in relevant_fields):
                edu_score = min(1.0, edu_score + 0.1)

            education_similarities.append(edu_score)

        return np.array(education_similarities)

    def _extract_required_education(self, job_description: str) -> int:
        """Extract required education level from job description"""

        job_text = job_description.lower()

        # Check for specific degree requirements
        if any(term in job_text for term in ['phd', 'doctorate', 'ph.d']):
            return 5
        elif any(term in job_text for term in ['master', 'masters', 'mba', 'ms', 'ma']):
            return 4
        elif any(term in job_text for term in ['bachelor', 'bachelors', 'bs', 'ba', 'degree']):
            return 3
        elif any(term in job_text for term in ['associate', 'diploma']):
            return 2

        return 0  # No specific requirement

    def _calculate_weighted_similarity(self, similarities: Dict[str, np.ndarray]) -> np.ndarray:
        """Calculate weighted combination of all similarity scores"""

        # Use configured weights
        weights = {
            'semantic': self.config.SEMANTIC_WEIGHT,
            'skills': self.config.SKILL_WEIGHT,
            'experience': self.config.EXPERIENCE_WEIGHT,
            'education': self.config.EDUCATION_WEIGHT
        }

        # Normalize weights to sum to 1
        total_weight = sum(weights.values())
        if total_weight > 0:
            weights = {k: v/total_weight for k, v in weights.items()}

        # Calculate weighted sum
        weighted_scores = np.zeros_like(similarities['semantic'])

        for similarity_type, weight in weights.items():
            if similarity_type in similarities:
                weighted_scores += weight * similarities[similarity_type]

        return weighted_scores

    def rank_candidates(self, similarities: Dict[str, np.ndarray],
                       df: pd.DataFrame, top_k: int = None) -> pd.DataFrame:
        """Rank candidates based on similarity scores"""

        if top_k is None:
            top_k = self.config.TOP_K_CANDIDATES

        print(f"🏆 Ranking top {top_k} candidates...")

        # Create ranking dataframe
        ranking_df = df.copy()

        # Add similarity scores
        for sim_type, scores in similarities.items():
            ranking_df[f'{sim_type}_similarity'] = scores

        # Sort by weighted similarity (primary) and other factors
        ranking_df = ranking_df.sort_values([
            'weighted_similarity',
            'semantic_similarity',
            'skills_similarity'
        ], ascending=False)

        # Add ranking information
        ranking_df['rank'] = range(1, len(ranking_df) + 1)
        ranking_df['match_score'] = (ranking_df['weighted_similarity'] * 100).round(1)

        # Return top K candidates
        top_candidates = ranking_df.head(top_k)

        print(f"✅ Top {len(top_candidates)} candidates ranked!")
        print(f"   🥇 Best match score: {top_candidates['match_score'].iloc[0]:.1f}%")
        print(f"   📊 Average score: {top_candidates['match_score'].mean():.1f}%")

        return top_candidates

    def get_similarity_statistics(self) -> Dict[str, Any]:
        """Get comprehensive similarity calculation statistics"""
        return self.similarity_stats

    def analyze_similarity_distribution(self, similarities: Dict[str, np.ndarray]) -> Dict[str, Dict[str, float]]:
        """Analyze distribution of similarity scores"""

        distributions = {}

        for sim_type, scores in similarities.items():
            distributions[sim_type] = {
                'mean': float(np.mean(scores)),
                'std': float(np.std(scores)),
                'min': float(np.min(scores)),
                'max': float(np.max(scores)),
                'median': float(np.median(scores)),
                'q25': float(np.percentile(scores, 25)),
                'q75': float(np.percentile(scores, 75))
            }

        return distributions

# Initialize similarity calculator
print("🔍 Initializing Advanced Similarity Calculator...")
similarity_calculator = AdvancedSimilarityCalculator(config)

# Example job description for demonstration
sample_job_description = """
We are looking for a Senior Data Scientist with 5+ years of experience in machine learning and Python.
The ideal candidate should have:

- Strong experience with Python, TensorFlow, and scikit-learn
- Master's degree in Computer Science, Data Science, or related field
- Experience with AWS cloud platforms
- Knowledge of deep learning and neural networks
- Strong analytical and problem-solving skills
- Experience with data visualization tools

Responsibilities include developing ML models, analyzing large datasets, and collaborating with cross-functional teams.
"""

print(f"💼 Using sample job description:")
print(f"   📝 Length: {len(sample_job_description.split())} words")
print(f"   🎯 Key requirements: Python, Machine Learning, 5+ years, Master's degree")

# Calculate comprehensive similarities
similarities = similarity_calculator.calculate_comprehensive_similarity(
    resume_embeddings, sample_job_description, df
)

#

🔍 Initializing Advanced Similarity Calculator...
💼 Using sample job description:
   📝 Length: 81 words
   🎯 Key requirements: Python, Machine Learning, 5+ years, Master's degree
🔍 Calculating comprehensive similarity scores...
   🧠 Computing semantic similarity...
💼 Generating job description embedding...
🧠 Generating embeddings for 1 texts...


Processing job description:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Embeddings generated successfully!
   📊 Shape: (1, 384)
   ⏱️ Time: 0.06s (15.7/s)
🔍 Calculating semantic similarity scores...
✅ Similarity calculation complete!
   📊 Mean similarity: 0.613
   📈 Max similarity: 0.766
   📉 Min similarity: 0.410
   📚 Computing skill similarity...
   💼 Computing experience similarity...
   🎓 Computing education similarity...
   ⚖️ Computing weighted similarity...
✅ Similarity calculation complete!
   ⏱️ Time: 0.23s
   🚀 Speed: 4437.6 resumes/second


In [16]:
# 🧪 Test the Resume Screening System
# Rank candidates based on the sample job description

print("🧪 Testing the system with the sample job description...")

# Rank candidates using the calculated similarities
top_candidates_df = similarity_calculator.rank_candidates(
    similarities,
    df,  # Use the original DataFrame to include all candidate info
    top_k=config.TOP_K_CANDIDATES # Use the configured top_k value
)

# Display the top candidates
print("\n🏆 Top Ranked Candidates:")
display(top_candidates_df[['Resume_ID', 'Name', 'Job Role', 'Experience (Years)', 'Education', 'match_score', 'rank']])

print("\n✅ Test complete! Top candidates displayed.")

🧪 Testing the system with the sample job description...
🏆 Ranking top 20 candidates...
✅ Top 20 candidates ranked!
   🥇 Best match score: 70.6%
   📊 Average score: 67.0%

🏆 Top Ranked Candidates:


Unnamed: 0,Resume_ID,Name,Job Role,Experience (Years),Education,match_score,rank
152,153,Dale Williams,Data Scientist,7,PhD,70.599998,1
732,733,Vicki Murphy,Data Scientist,5,PhD,70.5,2
533,534,Peter Mcmahon,Data Scientist,5,MBA,70.300003,3
792,793,Tom Bennett,Data Scientist,6,MBA,69.699997,4
174,175,Lisa Jones,Data Scientist,6,MBA,69.300003,5
902,903,Lisa White,Data Scientist,6,PhD,69.199997,6
910,911,Erin Hull,Data Scientist,6,PhD,68.599998,7
442,443,Jason Cummings,Data Scientist,7,MBA,67.199997,8
739,740,Amanda Shea,Data Scientist,6,PhD,67.0,9
619,620,Mr. Bryan Hernandez,Data Scientist,6,MBA,66.900002,10



✅ Test complete! Top candidates displayed.


In [17]:
# 🧪 Additional Input Testing
# Test the system with a different job description

print("🧪 Performing additional input testing with a different job description...")

# Define a new sample job description (e.g., for a Software Engineer)
# Get job description input from the user
new_job_description = input("Please enter the job description you want to use for screening: ")

print(f"\n💼 Testing with the following job description:")
print(f"   📝 Length: {len(new_job_description.split())} words")
# Attempt to extract key requirements for printing (optional, requires more robust parsing)
# print(f"   🎯 Key requirements: Java, Spring Boot, 3+ years, Microservices, Docker") # Removed hardcoded example
print("-" * 30)

# Calculate comprehensive similarities for the new job description
new_similarities = similarity_calculator.calculate_comprehensive_similarity(
    resume_embeddings,  # Use the same resume embeddings
    new_job_description,
    df  # Use the original DataFrame
)

# Rank candidates based on the new similarity scores
top_candidates_new_job = similarity_calculator.rank_candidates(
    new_similarities,
    df,
    top_k=config.TOP_K_CANDIDATES
)

# Display the top candidates for the new job
print("\n🏆 Top Ranked Candidates for Software Engineer Role:") # Consider making this dynamic based on input
display(top_candidates_new_job[['Resume_ID', 'Name', 'Job Role', 'Experience (Years)', 'Education', 'match_score', 'rank']])

print("\n✅ Additional input testing complete.")

🧪 Performing additional input testing with a different job description...
Please enter the job description you want to use for screening: Senior Developer

💼 Testing with the following job description:
   📝 Length: 2 words
------------------------------
🔍 Calculating comprehensive similarity scores...
   🧠 Computing semantic similarity...
💼 Generating job description embedding...
🧠 Generating embeddings for 1 texts...


Processing job description:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Embeddings generated successfully!
   📊 Shape: (1, 384)
   ⏱️ Time: 0.03s (29.9/s)
🔍 Calculating semantic similarity scores...
✅ Similarity calculation complete!
   📊 Mean similarity: 0.451
   📈 Max similarity: 0.553
   📉 Min similarity: 0.345
   📚 Computing skill similarity...
   💼 Computing experience similarity...
   🎓 Computing education similarity...
   ⚖️ Computing weighted similarity...
✅ Similarity calculation complete!
   ⏱️ Time: 0.18s
   🚀 Speed: 5673.6 resumes/second
🏆 Ranking top 20 candidates...
✅ Top 20 candidates ranked!
   🥇 Best match score: 49.3%
   📊 Average score: 49.0%

🏆 Top Ranked Candidates for Software Engineer Role:


Unnamed: 0,Resume_ID,Name,Job Role,Experience (Years),Education,match_score,rank
610,611,Kelly Scott,Software Engineer,5,MBA,49.299999,1
59,60,Sarah Kane,Software Engineer,6,PhD,49.200001,2
582,583,Brian Dean,Software Engineer,5,PhD,49.200001,3
925,926,Colleen Conner,Software Engineer,5,PhD,49.099998,4
370,371,Timothy Johnson,Software Engineer,7,PhD,49.099998,5
88,89,Jose Mclaughlin,Software Engineer,5,PhD,49.0,6
868,869,Peter Wilson,Software Engineer,6,M.Tech,49.0,7
735,736,Justin Clark,Software Engineer,5,B.Tech,49.0,8
185,186,Anthony Benson,Software Engineer,5,MBA,48.900002,9
831,832,Christina Sosa,Software Engineer,6,B.Tech,48.900002,10



✅ Additional input testing complete.
