In [1]:
pip install pdfplumber numpy pandas joblib torch transformers nltk scikit-learn matplotlib seaborn tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install --upgrade matplotlib seaborn

Collecting matplotlib
  Downloading matplotlib-3.10.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Downloading matplotlib-3.10.1-cp312-cp312-win_amd64.whl (8.1 MB)
   ---------------------------------------- 0.0/8.1 MB ? eta -:--:--
   ---------------- ----------------------- 3.4/8.1 MB 20.0 MB/s eta 0:00:01
   ---------------------- ----------------- 4.5/8.1 MB 12.2 MB/s eta 0:00:01
   ------------------------------------- -- 7.6/8.1 MB 13.0 MB/s eta 0:00:01
   ---------------------------------------- 8.1/8.1 MB 12.8 MB/s eta 0:00:00
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.10.0
    Uninstalling matplotlib-3.10.0:
      Successfully uninstalled matplotlib-3.10.0
Successfully installed matplotlib-3.10.1
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mediapipe 0.10.20 requires protobuf<5,>=4.25.3, but you have protobuf 5.29.3 which is incompatible.


In [12]:
import os
import sys
import logging
import pdfplumber
import numpy as np
import pandas as pd
import joblib
from datetime import datetime
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import torch
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import nltk
from concurrent.futures import ThreadPoolExecutor
import warnings

# Set HF_HUB_DISABLE_SYMLINKS_WARNING so the Hugging Face Hub symlinks warning is
# suppressed. This runs at import time, before `transformers` is imported lazily
# further down (inside ResumeAnalyzer._initialize_model and main()).
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Custom progress tracking function to avoid tqdm widget errors
def progress_tracker(iterable, desc=None, total=None):
    """Yield the items of ``iterable`` while printing plain-text progress.

    Args:
        iterable: Any iterable to walk through.
        desc (str | None): Optional label printed once before iteration starts.
        total (int | None): Number of items; taken from ``len(iterable)`` when
            omitted and the iterable supports ``len``.

    Yields:
        Each item of ``iterable``, unchanged and in order.

    Nothing is printed when the total is unknown or zero.
    """
    known_total = total
    if known_total is None and hasattr(iterable, '__len__'):
        known_total = len(iterable)

    # All output is conditional on having a usable (truthy) total.
    reporting = bool(known_total)

    if reporting and desc:
        print(f"{desc} (total: {known_total})")

    position = 0
    for element in iterable:
        # One carriage-return-terminated status line every 10 items
        if reporting and position % 10 == 0:
            print(f"Progress: {position}/{known_total} ({position/known_total*100:.1f}%)", end="\r")
        yield element
        position += 1

    if reporting:
        print(f"Completed: {known_total}/{known_total} (100%)")

# Set up logging
# Root logging configuration: records at INFO and above are written both to
# "resume_analyzer.log" in the working directory and to stdout.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.FileHandler("resume_analyzer.log"), logging.StreamHandler(sys.stdout)],
)

# Named logger used by ResumeAnalyzer and main()
logger = logging.getLogger("ResumeAnalyzer")


class ResumeAnalyzer:
    def __init__(self, resume_folder, job_description, model_name='distilbert-base-uncased'):
        """
        Initialize the Resume Analyzer with folder path and job description
        
        Args:
            resume_folder (str): Path to folder containing resume PDFs
            job_description (str): Job description text to compare against
            model_name (str): Transformer model to use for embeddings
        """
        self.resume_folder = resume_folder
        self.job_description = job_description
        self.model_name = model_name
        self.resumes = []
        self.resume_names = []
        self.stop_words = None
        self.tokenizer = None
        self.model = None
        self.vectorizer = None
        self.results = None
        self.use_transformers = True
        
        # Create output folder
        self.output_folder = f"resume_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        os.makedirs(self.output_folder, exist_ok=True)
        
        # Initialize NLTK resources
        self._initialize_nltk()
    
    def _initialize_nltk(self):
        """Download and initialize NLTK resources"""
        try:
            # Use quiet mode to avoid unnecessary output
            nltk.download('stopwords', quiet=True)
            nltk.download('punkt', quiet=True)
            self.stop_words = set(stopwords.words("english"))
            logger.info("NLTK resources initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing NLTK resources: {e}")
            raise
    
    def _extract_text_from_pdf(self, pdf_path):
        """
        Extract text from PDF file
        
        Args:
            pdf_path (str): Path to PDF file
            
        Returns:
            str: Extracted text
        """
        text = ""
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + " "
            return text.lower().strip()
        except Exception as e:
            logger.error(f"Error reading {pdf_path}: {e}")
            return ""
    
    def _preprocess_text(self, text):
        """
        Preprocess text by tokenizing and removing stopwords
        
        Args:
            text (str): Raw text
            
        Returns:
            str: Preprocessed text
        """
        # Remove special characters and extra whitespace
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Tokenize and remove stopwords
        words = word_tokenize(text)
        filtered_words = [word for word in words if word.isalnum() and word not in self.stop_words]
        
        return " ".join(filtered_words)
    
    def _initialize_model(self):
        """Initialize transformer model for embeddings"""
        try:
            logger.info(f"Loading {self.model_name} model")
            
            # Import here to avoid unnecessary loading if transformers aren't used
            from transformers import AutoTokenizer, AutoModel
            
            # Suppress specific transformers warnings
            warnings.filterwarnings("ignore", category=UserWarning, 
                                   message="The cached_file")
            
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModel.from_pretrained(self.model_name)
            
            # Move model to GPU if available
            if torch.cuda.is_available():
                self.model = self.model.to('cuda')
                logger.info("Model loaded on GPU")
            else:
                logger.info("Model loaded on CPU")
                
            return True
                
        except Exception as e:
            logger.error(f"Error loading transformer model: {e}")
            logger.info("Falling back to TF-IDF vectorization only")
            self.model = None
            self.tokenizer = None
            self.use_transformers = False
            return False
    
    def _get_transformer_embedding(self, text):
        """
        Get embedding from transformer model
        
        Args:
            text (str): Input text
            
        Returns:
            numpy.ndarray: Embedding vector
        """
        try:
            # Tokenize and truncate to model's max length
            tokens = self.tokenizer(
                text, 
                padding=True, 
                truncation=True, 
                max_length=512, 
                return_tensors="pt"
            )
            
            # Move tokens to GPU if available
            if torch.cuda.is_available():
                tokens = {key: val.to('cuda') for key, val in tokens.items()}
            
            # Get embeddings
            with torch.no_grad():
                outputs = self.model(**tokens)
            
            # Use CLS token as sentence embedding
            embeddings = outputs.last_hidden_state[:, 0].cpu().numpy()
            return embeddings
        except Exception as e:
            logger.error(f"Error generating embeddings: {e}")
            return np.zeros((1, 768))  # Return zero vector as fallback
    
    def load_resumes(self):
        """Load and process resumes from folder"""
        logger.info(f"Loading resumes from {self.resume_folder}")
        
        if not os.path.isdir(self.resume_folder):
            logger.error(f"Error: '{self.resume_folder}' is not a valid directory")
            raise ValueError(f"'{self.resume_folder}' is not a valid directory")
        
        # Get all PDF files
        pdf_files = [f for f in os.listdir(self.resume_folder) if f.lower().endswith(".pdf")]
        
        if not pdf_files:
            logger.error("No PDF files found in the directory")
            raise ValueError("No PDF files found in the directory")
        
        logger.info(f"Found {len(pdf_files)} PDF files")
        
        # Process PDFs in parallel with max workers capped to avoid resource issues
        max_workers = min(os.cpu_count() or 4, len(pdf_files), 4)  # Limit to max 4 workers
        
        print(f"Processing {len(pdf_files)} PDF files with {max_workers} workers...")
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for file in pdf_files:
                file_path = os.path.join(self.resume_folder, file)
                futures.append(executor.submit(self._extract_text_from_pdf, file_path))
            
            # Collect results with simple progress tracking
            for i, (file, future) in enumerate(zip(pdf_files, futures)):
                if i % 5 == 0:
                    print(f"Processing file {i+1}/{len(pdf_files)}")
                    
                text = future.result()
                if text:
                    self.resumes.append(text)
                    self.resume_names.append(file)
                else:
                    logger.warning(f"No text extracted from {file}")
        
        logger.info(f"Successfully loaded {len(self.resumes)} resumes")
        
        # Preprocess texts
        logger.info("Preprocessing resume texts")
        self.resumes = [self._preprocess_text(resume) for resume in self.resumes]
        self.job_description = self._preprocess_text(self.job_description)
        
        return len(self.resumes)
    
    def analyze_with_tfidf(self):
        """Analyze resumes using TF-IDF vectorization"""
        logger.info("Analyzing resumes with TF-IDF")
        
        # Create and fit vectorizer
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
        all_docs = self.resumes + [self.job_description]
        tfidf_matrix = self.vectorizer.fit_transform(all_docs)
        
        # Get job description vector
        job_vector = tfidf_matrix[-1]
        resume_vectors = tfidf_matrix[:-1]
        
        # Calculate similarity scores
        similarities = cosine_similarity(resume_vectors, job_vector).flatten()
        
        # Find important keywords
        feature_names = np.array(self.vectorizer.get_feature_names_out())
        job_keywords = self._extract_important_keywords(job_vector, feature_names)
        
        # Store results
        self.tfidf_results = pd.DataFrame({
            "Resume": self.resume_names,
            "TF-IDF Similarity": similarities,
            "ATS Score": (similarities * 100).round(2)
        })
        
        # Clustering
        num_clusters = min(3, len(self.resumes))
        kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(resume_vectors)
        
        self.tfidf_results["Cluster"] = cluster_labels
        
        # Sort by similarity score
        self.tfidf_results = self.tfidf_results.sort_values(
            by="TF-IDF Similarity", 
            ascending=False
        )
        
        return self.tfidf_results, job_keywords
    
    def analyze_with_transformer(self):
        """Analyze resumes using transformer embeddings"""
        if not self.tokenizer or not self.model:
            if not self._initialize_model():
                logger.warning("Skipping transformer analysis due to model initialization failure")
                return None, []
        
        logger.info(f"Analyzing resumes with {self.model_name}")
        
        # Get embeddings for all documents
        print(f"Creating embeddings for {len(self.resumes)} resumes...")
        resume_embeddings = []
        for i, resume in enumerate(self.resumes):
            if i % 5 == 0:
                print(f"Processing embedding {i+1}/{len(self.resumes)}")
            resume_embeddings.append(self._get_transformer_embedding(resume))
        
        resume_embeddings = np.vstack(resume_embeddings)
        job_embedding = self._get_transformer_embedding(self.job_description)
        
        # Calculate cosine similarities
        similarities = cosine_similarity(resume_embeddings, job_embedding).flatten()
        
        # Store results
        self.transformer_results = pd.DataFrame({
            "Resume": self.resume_names,
            "Transformer Similarity": similarities,
            "Transformer Score": (similarities * 100).round(2)
        })
        
        # Sort by similarity score
        self.transformer_results = self.transformer_results.sort_values(
            by="Transformer Similarity", 
            ascending=False
        )
        
        return self.transformer_results
    
    def _extract_important_keywords(self, vector, feature_names, top_n=20):
        """
        Extract important keywords from a TF-IDF vector
        
        Args:
            vector: Sparse vector from TF-IDF
            feature_names: Array of feature names
            top_n: Number of top keywords to extract
            
        Returns:
            list: List of important keywords
        """
        # Get indices of highest values
        indices = vector.toarray().argsort()[0, -top_n:][::-1]
        
        # Get feature names for these indices
        top_keywords = [(feature_names[i], vector[0, i]) for i in indices]
        
        return top_keywords
    
    def combine_results(self):
        """Combine results from different analysis methods"""
        if hasattr(self, 'tfidf_results') and hasattr(self, 'transformer_results') and self.transformer_results is not None:
            # Join both result sets
            combined = pd.merge(
                self.tfidf_results, 
                self.transformer_results,
                on="Resume"
            )
            
            # Calculate combined score (average of both methods)
            combined["Combined Score"] = (
                (combined["ATS Score"] + combined["Transformer Score"]) / 2
            ).round(2)
            
            # Sort by combined score
            combined = combined.sort_values(by="Combined Score", ascending=False)
            self.results = combined
            
        elif hasattr(self, 'tfidf_results'):
            self.results = self.tfidf_results
            
        elif hasattr(self, 'transformer_results') and self.transformer_results is not None:
            self.results = self.transformer_results
            
        else:
            logger.error("No analysis results available")
            return None
        
        # Save results to CSV
        output_path = os.path.join(self.output_folder, "resume_analysis_results.csv")
        self.results.to_csv(output_path, index=False)
        logger.info(f"Results saved to {output_path}")
        
        return self.results
    
    def visualize_results(self):
        """Visualize analysis results"""
        if self.results is None:
            logger.error("No results available for visualization")
            return
        
        # Save visualizations to output folder
        logger.info("Creating visualizations")
        
        try:
            # 1. Bar chart of top resumes by score
            plt.figure(figsize=(12, 8))
            top_n = min(10, len(self.results))
            
            if "Combined Score" in self.results.columns:
                score_col = "Combined Score"
            elif "ATS Score" in self.results.columns:
                score_col = "ATS Score"
            else:
                score_col = "Transformer Score"
            
            # Get top resumes
            top_resumes = self.results.head(top_n)
            
            # Truncate long resume names
            top_resumes_plot = top_resumes.copy()
            top_resumes_plot["Resume"] = top_resumes_plot["Resume"].apply(
                lambda x: x[:25] + "..." if len(x) > 25 else x
            )
            
            # Create bar chart
            plt.barh(
                top_resumes_plot["Resume"],
                top_resumes_plot[score_col],
                color=sns.color_palette("viridis", top_n)
            )
            plt.title(f"Top {top_n} Resumes by {score_col}")
            plt.xlabel("Score")
            plt.ylabel("Resume")
            plt.tight_layout()
            plt.savefig(os.path.join(self.output_folder, "top_resumes.png"))
            plt.close()
            
            # 2. Cluster visualization if TF-IDF results available
            if hasattr(self, 'tfidf_results') and "Cluster" in self.tfidf_results.columns and len(self.resumes) >= 3:
                # Get TF-IDF matrix for resumes
                tfidf_matrix = self.vectorizer.transform(self.resumes)
                
                # Use t-SNE for dimensionality reduction
                perplexity = min(30, max(5, len(self.resumes) // 2))  # Adjust perplexity based on data size
                tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
                reduced_data = tsne.fit_transform(tfidf_matrix.toarray())
                
                # Create scatter plot
                plt.figure(figsize=(10, 8))
                
                # Get cluster labels
                clusters = self.tfidf_results["Cluster"].values
                unique_clusters = np.unique(clusters)
                
                # Plot each cluster
                for cluster in unique_clusters:
                    indices = np.where(clusters == cluster)[0]
                    plt.scatter(
                        reduced_data[indices, 0],
                        reduced_data[indices, 1],
                        label=f"Cluster {cluster}",
                        alpha=0.7
                    )
                
                # Add resume names as annotations
                for i, name in enumerate(self.resume_names):
                    short_name = os.path.splitext(name)[0][:15]
                    plt.annotate(
                        short_name,
                        (reduced_data[i, 0], reduced_data[i, 1]),
                        fontsize=8
                    )
                
                plt.title("Resume Clusters")
                plt.legend()
                plt.tight_layout()
                plt.savefig(os.path.join(self.output_folder, "resume_clusters.png"))
                plt.close()
                
            logger.info("Visualizations created successfully")
            
        except Exception as e:
            logger.error(f"Error creating visualizations: {e}")
            print(f"Error creating visualizations: {e}")
    
    def save_model(self):
        """Save trained model components for later use"""
        try:
            model_path = os.path.join(self.output_folder, "analyzer_model.joblib")
            
            model_data = {
                "vectorizer": self.vectorizer,
                "job_description": self.job_description,
                "model_name": self.model_name
            }
            
            joblib.dump(model_data, model_path)
            logger.info(f"Model saved to {model_path}")
        except Exception as e:
            logger.error(f"Error saving model: {e}")
    
    def extract_missing_keywords(self, resume_idx):
        """
        Find keywords from job description missing in a specific resume
        
        Args:
            resume_idx (int): Index of resume in self.resumes list
            
        Returns:
            list: List of missing keywords
        """
        if not hasattr(self, 'vectorizer') or self.vectorizer is None:
            logger.error("TF-IDF vectorizer not available")
            return []
        
        # Get feature names
        feature_names = self.vectorizer.get_feature_names_out()
        
        # Get job description vector
        all_docs = self.resumes + [self.job_description]
        tfidf_matrix = self.vectorizer.transform(all_docs)
        job_vector = tfidf_matrix[-1]
        
        # Get resume vector
        resume_vector = tfidf_matrix[resume_idx]
        
        # Find important job keywords
        job_keywords = self._extract_important_keywords(job_vector, feature_names, top_n=30)
        
        # Find missing keywords
        missing_keywords = []
        for keyword, importance in job_keywords:
            if resume_vector[0, self.vectorizer.vocabulary_[keyword]] == 0:
                missing_keywords.append((keyword, importance))
        
        return missing_keywords
    
    def run_full_analysis(self):
        """Run complete analysis pipeline"""
        # Load resumes
        num_resumes = self.load_resumes()
        if num_resumes == 0:
            logger.error("No valid resumes found")
            return None
        
        # Run TF-IDF analysis
        tfidf_results, job_keywords = self.analyze_with_tfidf()
        
        # Try to run transformer analysis
        transformer_results = None
        if self.use_transformers:
            try:
                transformer_results = self.analyze_with_transformer()
            except Exception as e:
                logger.error(f"Transformer analysis failed: {e}")
                logger.info("Continuing with TF-IDF results only")
        
        # Combine results
        combined_results = self.combine_results()
        
        # Create visualizations
        self.visualize_results()
        
        # Save model
        self.save_model()
        
        # Create summary report
        self._create_summary_report(job_keywords)
        
        return combined_results
    
    def _create_summary_report(self, job_keywords):
        """
        Create a summary report of the analysis
        
        Args:
            job_keywords: List of important keywords from job description
        """
        try:
            report_path = os.path.join(self.output_folder, "analysis_summary.txt")
            
            with open(report_path, 'w') as f:
                f.write("RESUME ANALYSIS SUMMARY\n")
                f.write("======================\n\n")
                
                f.write(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"Total Resumes Analyzed: {len(self.resumes)}\n\n")
                
                f.write("JOB DESCRIPTION\n")
                f.write("--------------\n")
                f.write(f"{self.job_description[:500]}...\n\n")
                
                f.write("TOP JOB KEYWORDS\n")
                f.write("--------------\n")
                for keyword, importance in job_keywords[:15]:
                    f.write(f"- {keyword} (importance: {importance:.4f})\n")
                f.write("\n")
                
                f.write("TOP 5 RESUMES\n")
                f.write("-----------\n")
                for i, (_, row) in enumerate(self.results.head(5).iterrows(), 1):
                    f.write(f"{i}. {row['Resume']}\n")
                    
                    if "Combined Score" in row:
                        f.write(f"   Combined Score: {row['Combined Score']:.2f}\n")
                    if "ATS Score" in row:
                        f.write(f"   ATS Score: {row['ATS Score']:.2f}\n")
                    if "Transformer Score" in row:
                        f.write(f"   Transformer Score: {row['Transformer Score']:.2f}\n")
                    
                    # Get missing keywords for this resume
                    resume_idx = self.resume_names.index(row['Resume'])
                    missing_keywords = self.extract_missing_keywords(resume_idx)
                    
                    if missing_keywords:
                        f.write("   Missing Keywords:\n")
                        for keyword, importance in missing_keywords[:5]:
                            f.write(f"   - {keyword}\n")
                    
                    f.write("\n")
                
                f.write("\nAnalysis complete. For full details, see the CSV results file.\n")
            
            logger.info(f"Summary report saved to {report_path}")
        except Exception as e:
            logger.error(f"Error creating summary report: {e}")


def main():
    """Interactive entry point: ask for inputs, run the analysis, print a summary.

    Prompts for the resume folder (default ``D:\\resume_folder``) and optionally
    for a custom job description terminated by a line containing only ``END``.
    Surrounding whitespace and quotes are removed from the folder path, the y/n
    answer is matched case- and whitespace-insensitively, and an empty custom
    description falls back to the built-in one. Errors raised by the analysis
    are logged and reported on stdout instead of propagating.
    """
    print("=" * 50)
    print("Resume Analysis System")
    print("=" * 50)
    
    # Example usage with compatibility check
    resume_folder = input("Enter resume folder path (or press Enter for default D:\\resume_folder): ")
    # Pasted paths often carry stray spaces or the quotes added by "Copy as path"
    resume_folder = resume_folder.strip().strip('"\'').strip()
    if not resume_folder:
        resume_folder = r"D:\resume_folder"  # Default path
    
    if not os.path.isdir(resume_folder):
        print(f"Error: '{resume_folder}' is not a valid directory.")
        print("Please create this directory and add PDF resumes, or specify a different path.")
        return
    
    job_description = """
Job Description

Job Title: Python Developer Intern (Remote)

Location: Remote

About Us: We are an innovative tech company focused on delivering cutting-edge solutions to a wide range of industries. Our team thrives on creativity, problem-solving, and a shared passion for building impactful products. We are looking for a motivated and eager-to-learn Python Developer Intern to join our dynamic team and contribute to exciting projects.

Job Description:

As a Python Developer Intern, you will have the opportunity to gain hands-on experience working on real-world projects, learning from senior developers, and contributing to the development of scalable applications. You will work closely with our engineering team to help improve and optimize Python-based solutions, and gain exposure to the latest tools, technologies, and industry best practices.

Responsibilities:

Assist in developing and maintaining Python applications, scripts, and modules.
Write clean, efficient, and well-documented code.
Collaborate with team members to troubleshoot, debug, and resolve software defects.
Contribute to the design and implementation of new features and functionality.
Participate in code reviews and contribute to the improvement of coding standards.
Learn and apply best practices for testing, optimization, and performance tuning.
Stay up to date with Python developments and emerging technologies in software development.
Communicate effectively with team members to ensure smooth project progress.

Requirements

Requirements:

Currently pursuing or recently completed a degree in Computer Science, Software Engineering, or a related field.
Basic knowledge of Python programming and its libraries (e.g., Pandas, Flask, Django, etc.).
Familiarity with Git or other version control systems.
Strong problem-solving skills and a keen eye for detail.
Ability to work independently and collaborate in a remote team environment.
Eagerness to learn, take on challenges, and improve coding skills.
Good communication skills in English (both written and verbal).

Preferred Skills (optional But Advantageous):

Familiarity with web development frameworks such as Django or Flask.
Experience with databases like MySQL, PostgreSQL, or MongoDB.
Understanding of REST APIs and web services.
Knowledge of front-end technologies (HTML, CSS, JavaScript) is a plus.

Requirements

Requirements: Currently pursuing or recently completed a degree in Computer Science, Software Engineering, or a related field. Basic knowledge of Python programming and its libraries (e.g., Pandas, Flask, Django, etc.). Familiarity with Git or other version control systems. Strong problem-solving skills and a keen eye for detail. Ability to work independently and collaborate in a remote team environment. Eagerness to learn, take on challenges, and improve coding skills. Good communication skills in English (both written and verbal). Preferred Skills (optional but advantageous): Familiarity with web development frameworks such as Django or Flask. Experience with databases like MySQL, PostgreSQL, or MongoDB. Understanding of REST APIs and web services. Knowledge of front-end technologies (HTML, CSS, JavaScript) is a plus.
    """
    
    print("\nUsing the following job description:")
    print("-" * 40)
    print(job_description.strip())
    print("-" * 40)
    
    use_custom_desc = input("\nWould you like to use a custom job description? (y/n): ").strip().lower()
    if use_custom_desc == 'y':
        print("\nEnter your job description (type 'END' on a new line when finished):")
        lines = []
        while True:
            line = input()
            if line == 'END':
                break
            lines.append(line)
        custom_description = '\n'.join(lines)
        if custom_description.strip():
            job_description = custom_description
        else:
            # An empty description would make every similarity score meaningless
            print("\nNo job description entered. Using the default job description.")
    
    try:
        # Check for Hugging Face transformers library
        try:
            import transformers
            use_transformers = True
            print("\nTransformers library detected. Will use transformer models for analysis.")
        except ImportError:
            use_transformers = False
            print("\nTransformers library not found. Will use TF-IDF only for analysis.")
            print("To enable transformer models, install with: pip install transformers")
        
        # Create analyzer
        print("\nInitializing analyzer...")
        analyzer = ResumeAnalyzer(resume_folder, job_description)
        analyzer.use_transformers = use_transformers
        
        # Run analysis
        print("\nRunning analysis...")
        results = analyzer.run_full_analysis()
        
        if results is not None:
            print("\nAnalysis completed successfully!")
            print("\nTop 5 Resume Matches:")
            # Show the combined score when both methods ran, otherwise the TF-IDF score
            display_cols = ["Resume", "Combined Score" if "Combined Score" in results.columns else "ATS Score"]
            print(results.head(5)[display_cols].to_string(index=False))
            print(f"\nComplete results saved to: {analyzer.output_folder}")
            print(f"Check '{os.path.join(analyzer.output_folder, 'analysis_summary.txt')}' for a detailed report.")
        
    except Exception as e:
        logger.error(f"Analysis failed: {e}", exc_info=True)
        print(f"\nAnalysis failed: {str(e)}")
        print("See resume_analyzer.log for more details.")
    
    print("\nDone!")

# Start the interactive workflow when this code is executed directly, i.e.
# whenever __name__ is "__main__"; importing the code as a module does not
# trigger main().
if __name__ == "__main__":
    main()

Resume Analysis System


Enter resume folder path (or press Enter for default D:\resume_folder):  D:\resume_folder



Using the following job description:
----------------------------------------
Job Description

Job Title: Python Developer Intern (Remote)

Location: Remote

About Us: We are an innovative tech company focused on delivering cutting-edge solutions to a wide range of industries. Our team thrives on creativity, problem-solving, and a shared passion for building impactful products. We are looking for a motivated and eager-to-learn Python Developer Intern to join our dynamic team and contribute to exciting projects.

Job Description:

As a Python Developer Intern, you will have the opportunity to gain hands-on experience working on real-world projects, learning from senior developers, and contributing to the development of scalable applications. You will work closely with our engineering team to help improve and optimize Python-based solutions, and gain exposure to the latest tools, technologies, and industry best practices.

Responsibilities:

Assist in developing and maintaining Python a


Would you like to use a custom job description? (y/n):  n



Transformers library detected. Will use transformer models for analysis.

Initializing analyzer...
2025-02-28 15:43:45,269 - ResumeAnalyzer - INFO - NLTK resources initialized successfully

Running analysis...
2025-02-28 15:43:45,270 - ResumeAnalyzer - INFO - Loading resumes from D:\resume_folder
2025-02-28 15:43:45,271 - ResumeAnalyzer - INFO - Found 3 PDF files
Processing 3 PDF files with 3 workers...
Processing file 1/3
2025-02-28 15:43:45,748 - ResumeAnalyzer - INFO - Successfully loaded 3 resumes
2025-02-28 15:43:45,749 - ResumeAnalyzer - INFO - Preprocessing resume texts
2025-02-28 15:43:45,756 - ResumeAnalyzer - INFO - Analyzing resumes with TF-IDF
2025-02-28 15:43:45,797 - ResumeAnalyzer - INFO - Loading distilbert-base-uncased model
2025-02-28 15:43:46,457 - ResumeAnalyzer - INFO - Model loaded on CPU
2025-02-28 15:43:46,457 - ResumeAnalyzer - INFO - Analyzing resumes with distilbert-base-uncased
Creating embeddings for 3 resumes...
Processing embedding 1/3
2025-02-28 15:43:4