In [2]:
import pdfplumber
import re
import pandas as pd

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file using pdfplumber."""
    with pdfplumber.open(pdf_path) as pdf:
        text = "".join(page.extract_text() or "" for page in pdf.pages)
    return text

def parse_resume_sections(resume_text, resume_name):
    """Parses different sections from a resume text."""
    section_headers = ["SUMMARY", "CONTACT", "OBJECTIVE", "REFERENCES", "SKILLS", "EDUCATION", "EXPERIENCE", "PROJECTS"]
    normalized_text = re.sub(r'[\r\u2022\u200b]', '', resume_text)  # Remove special characters
    normalized_text = re.sub(r'-\n', '', normalized_text)  # Fix line breaks
    normalized_text = "\n" + normalized_text + "\n"  # Buffer for boundary matching
    
    sections = {}
    section_positions = []
    for header in section_headers:
        pattern = re.compile(rf'\n\s*{re.escape(header)}[\s:•\-]*\n+', re.IGNORECASE)
        for match in pattern.finditer(normalized_text):
            section_positions.append((match.start(), header))
    
    section_positions.sort()
    prev_end, prev_header = 0, "BASIC_INFO"
    
    for start, header in section_positions:
        sections[prev_header] = normalized_text[prev_end:start].strip()
        prev_end = start
        prev_header = header.upper()
    
    sections[prev_header] = normalized_text[prev_end:].strip()
    
    print(f"✅ Extracted sections of {resume_name}: {list(sections.keys())}")
    return sections

def load_dataset(file_path):
    """Loads a dataset from a CSV file."""
    return pd.read_csv(file_path)

def match_keywords(section_text, dataset, column_name, metric_column):
    """Finds matching entries from a dataset in the given section text."""
    if not section_text:
        return pd.DataFrame(columns=[metric_column])
    section_text = section_text.lower()
    return dataset[dataset[column_name].str.lower().apply(lambda x: x in section_text)][[metric_column]]

def calculate_resume_score(company_ranks, skills_scores, university_rankings, has_work_experience):
    # Use raw scores instead of normalized scores
    company_score = company_ranks['Rank'].min() if not company_ranks.empty else 0
    skills_score = skills_scores['Score'].sum() if not skills_scores.empty else 0
    university_score = university_rankings['ranking'].min() if not university_rankings.empty else 0
    
    # Assign weights
    company_weight = 2.0  # Adjust as needed
    skills_weight = 1.5   # Skills might be more important
    university_weight = 1.0  # Universities might be less important
    
    # Calculate weighted total score
    total_score = (
        (company_score * company_weight) +
        (skills_score * skills_weight) +
        (university_score * university_weight)
    )
    
    # Apply penalty for resumes without work experience
    if not has_work_experience:
        total_score *= 0.8  # Reduce score by 20% if no work experience
    
    # Scale the final score to a range of 0 to 100
    scaled_score = (total_score / 10)  # Adjust the divisor based on expected score range
    return round(scaled_score, 2)

def process_resume(pdf_path, company_df, skills_df, universities_df):
    """Processes a resume and returns its score."""
    resume_text = extract_text_from_pdf(pdf_path)
    resume_sections = parse_resume_sections(resume_text, pdf_path.split("/")[-1])
    
    company_ranks = match_keywords(resume_sections.get("EXPERIENCE", ""), company_df, 'Name', 'Rank')
    skills_scores = match_keywords(resume_sections.get("SKILLS", ""), skills_df, 'Skill', 'Score')
    university_rankings = match_keywords(resume_sections.get("EDUCATION", ""), universities_df, 'University', 'ranking')
    
    has_experience = "EXPERIENCE" in resume_sections and bool(resume_sections["EXPERIENCE"].strip())
    
    return calculate_resume_score(company_ranks, skills_scores, university_rankings, has_experience)

if __name__ == "__main__":
    company_df = load_dataset('/workspaces/SIMS-Project/Resume_Scrapper/Datasets/Companies_Dataset.csv')
    skills_df = load_dataset('/workspaces/SIMS-Project/Resume_Scrapper/Datasets/Skills_Dataset.csv')
    universities_df = load_dataset('/workspaces/SIMS-Project/Resume_Scrapper/Datasets/Universities_Dataset.csv')
    
    resume_files = [
        '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/autoCV (1).pdf',
        '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/autoCV (2).pdf',
        '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/autoCV (3).pdf',
        '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/autoCV (4).pdf',
        '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/me.pdf'
    ]
    
    resume_scores = [(file, process_resume(file, company_df, skills_df, universities_df)) for file in resume_files]
    resume_scores.sort(key=lambda x: x[1], reverse=True)
    
    print("Ranked Resumes (Highest to Lowest):")
    for rank, (file, score) in enumerate(resume_scores, start=1):
        print(f"Rank {rank}: {file[24:]} (Score: {score})")


✅ Extracted sections of autoCV (1).pdf: ['BASIC_INFO', 'SUMMARY', 'SKILLS', 'PROJECTS', 'EDUCATION']
✅ Extracted sections of autoCV (2).pdf: ['BASIC_INFO', 'SUMMARY', 'SKILLS', 'EXPERIENCE', 'PROJECTS', 'EDUCATION']
✅ Extracted sections of autoCV (3).pdf: ['BASIC_INFO', 'SUMMARY', 'SKILLS', 'EXPERIENCE', 'PROJECTS', 'EDUCATION']
✅ Extracted sections of autoCV (4).pdf: ['BASIC_INFO', 'SUMMARY', 'SKILLS', 'EXPERIENCE', 'PROJECTS', 'EDUCATION']
✅ Extracted sections of me.pdf: ['BASIC_INFO']
Ranked Resumes (Highest to Lowest):
Rank 1: /Resume_Scrapper/Resumes/autoCV (3).pdf (Score: 337.45)
Rank 2: /Resume_Scrapper/Resumes/autoCV (4).pdf (Score: 290.29)
Rank 3: /Resume_Scrapper/Resumes/autoCV (2).pdf (Score: 155.29)
Rank 4: /Resume_Scrapper/Resumes/autoCV (1).pdf (Score: 123.43)
Rank 5: /Resume_Scrapper/Resumes/me.pdf (Score: 0.0)


In [7]:
# import pdfplumber
import read_resume
import File_downloader_from_github as file_downloader
import re
import pandas as pd
# import pdfplumber
# import pdfminer.high_level as pm

# def extract_text_and_links_from_pdf(pdf_path):
#     text = pm.extract_text(pdf_path)
#     links = []

#     github_link_pattern = r"https://github\.com/[a-zA-Z0-9-_]+/[a-zA-Z0-9-_]+"
#     links += re.findall(github_link_pattern, text)

#     with pdfplumber.open(pdf_path) as pdf:
#         for page in pdf.pages:
#             hyperlinks = page.hyperlinks
#             if hyperlinks:
#                 for hyperlink in hyperlinks:
#                     if 'uri' in hyperlink:
#                         links.append(hyperlink['uri'])

#     links = list(set(links))
#     return text, links

def parse_resume_sections(resume_text, resume_name):
    """Parses different sections from a resume text."""
    section_headers = ["SUMMARY", "CONTACT", "OBJECTIVE", "REFERENCES", "SKILLS", "EDUCATION", "EXPERIENCE", "PROJECTS"]
    normalized_text = re.sub(r'[\r\u2022\u200b]', '', resume_text)  # Remove special characters
    normalized_text = re.sub(r'-\n', '', normalized_text)  # Fix line breaks
    normalized_text = "\n" + normalized_text + "\n"  # Buffer for boundary matching
    
    sections = {}
    section_positions = []
    for header in section_headers:
        pattern = re.compile(rf'\n\s*{re.escape(header)}[\s:•\-]*\n+', re.IGNORECASE)
        for match in pattern.finditer(normalized_text):
            section_positions.append((match.start(), header))
    
    section_positions.sort()
    prev_end, prev_header = 0, "BASIC_INFO"
    
    for start, header in section_positions:
        sections[prev_header] = normalized_text[prev_end:start].strip()
        prev_end = start
        prev_header = header.upper()
    
    sections[prev_header] = normalized_text[prev_end:].strip()
    
    print(f"✅ Extracted sections of {resume_name}: {list(sections.keys())}")
    return sections

def load_dataset(file_path):
    """Loads a dataset from a CSV file."""
    return pd.read_csv(file_path)

def match_keywords(section_text, dataset, column_name, metric_column):
    """Finds matching entries from a dataset in the given section text."""
    if not section_text:
        return pd.DataFrame(columns=[metric_column])
    section_text = section_text.lower()
    return dataset[dataset[column_name].str.lower().apply(lambda x: x in section_text)][[metric_column]]

def calculate_resume_score(company_ranks, skills_scores, university_rankings, has_work_experience):
    # Use raw scores instead of normalized scores
    company_score = company_ranks['Rank'].min() if not company_ranks.empty else 0
    skills_score = skills_scores['Score'].sum() if not skills_scores.empty else 0
    university_score = university_rankings['ranking'].min() if not university_rankings.empty else 0
    
    # Assign weights
    company_weight = 2.0  # Adjust as needed
    skills_weight = 1.5   # Skills might be more important
    university_weight = 1.0  # Universities might be less important
    
    # Calculate weighted total score
    total_score = (
        (company_score * company_weight) +
        (skills_score * skills_weight) +
        (university_score * university_weight)
    )
    
    # Apply penalty for resumes without work experience
    if not has_work_experience:
        total_score *= 0.8  # Reduce score by 20% if no work experience
    
    # Scale the final score to a range of 0 to 100
    scaled_score = (total_score / 10)  # Adjust the divisor based on expected score range
    return round(scaled_score, 2)

def process_resume(pdf_path, company_df, skills_df, universities_df):
    """Processes a resume and returns its score."""
    resume_text, extracted_links = read_resume.extract_text_and_links_from_pdf(pdf_path)
    resume_sections = parse_resume_sections(resume_text, pdf_path.split("/")[-1])
    
    company_ranks = match_keywords(resume_sections.get("EXPERIENCE", ""), company_df, 'Name', 'Rank')
    skills_scores = match_keywords(resume_sections.get("SKILLS", ""), skills_df, 'Skill', 'Score')
    university_rankings = match_keywords(resume_sections.get("EDUCATION", ""), universities_df, 'University', 'ranking')
    
    has_experience = "EXPERIENCE" in resume_sections and bool(resume_sections["EXPERIENCE"].strip())
    
    # print("Extracted Links:")
    # for link in extracted_links:
    #     print(link)

    for link in extracted_links:
        if "github" in link:
            file_downloader.Downloader(link)

    return calculate_resume_score(company_ranks, skills_scores, university_rankings, has_experience)

if __name__ == "__main__":
    company_df = load_dataset('/workspaces/SIMS-Project/Resume_Scrapper/Datasets/Companies_Dataset.csv')
    skills_df = load_dataset('/workspaces/SIMS-Project/Resume_Scrapper/Datasets/Skills_Dataset.csv')
    universities_df = load_dataset('/workspaces/SIMS-Project/Resume_Scrapper/Datasets/Universities_Dataset.csv')
    
    resume_files = [
        '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/autoCV (1).pdf',
        '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/autoCV (3).pdf',
        '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/autoCV (4).pdf',
        '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/Resume_2.pdf',
        '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/me.pdf',
    ]
    
    resume_scores = [(file, process_resume(file, company_df, skills_df, universities_df)) for file in resume_files]
    resume_scores.sort(key=lambda x: x[1], reverse=True)
    
    print("Ranked Resumes (Highest to Lowest):")
    for rank, (file, score) in enumerate(resume_scores, start=1):
        print(f"Rank {rank}: {file[24:]} (Score: {score})")


✅ Extracted sections of autoCV (1).pdf: ['BASIC_INFO', 'SUMMARY', 'SKILLS', 'PROJECTS', 'EDUCATION']
Found 5 files in https://github.com/Karthik0000007/Disease-Predictor


FileNotFoundError: [Errno 2] No such file or directory: 'Resume_Scrapper/Downloaded/code_files/-Disease_Predictor.py'

In [7]:
import pdfplumber
import re
import pandas as pd
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.
    """
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

def parse_resume_sections(resume_text):
    """
    Parses the resume text into sections based on common headers.
    """
    section_headers = [
        'Summary', 'Skills', 'Education', 'Experience', 'Projects'
    ]
    
    # Preprocess text for consistent matching
    normalized_text = re.sub(r'[\r\u2022\u200b]', '', resume_text)  # Remove special chars
    normalized_text = re.sub(r'-\n', '', normalized_text)  # Handle hyphenated line breaks
    normalized_text = "\n" + normalized_text + "\n"  # Add buffer for boundary matches
    
    # Find all section positions
    section_positions = []
    for header in section_headers:
        pattern = re.compile(
            rf'\n\s*{re.escape(header)}\s*\n',
            re.IGNORECASE | re.DOTALL
        )
        for match in pattern.finditer(normalized_text):
            section_positions.append((match.start(), header.upper()))
    
    # Sort sections by appearance order
    section_positions.sort()
    sorted_sections = [header for _, header in section_positions]
    
    # Add Basic Info as first section if no sections found
    if not sorted_sections:
        sorted_sections = ["BASIC_INFO"]
    
    # Extract content between sections
    sections = {}
    prev_end = 0
    prev_header = "BASIC_INFO"
    
    for start, header in section_positions:
        sections[prev_header] = normalized_text[prev_end:start].strip()
        prev_end = start
        prev_header = header
    
    # Add final section
    sections[prev_header] = normalized_text[prev_end:].strip()
    
    logging.info(f"✅ Extracted sections: {list(sections.keys())}")
    return sections

def save_sections_to_files(sections):
    """
    Saves each section's content to a separate text file.
    """
    for section_name, section_content in sections.items():
        with open(f'{section_name.lower().replace(" ", "_")}.txt', 'w') as file:
            file.write(section_content)

def load_dataset(file_path):
    """
    Loads a dataset from a CSV file.
    """
    return pd.read_csv(file_path)

def compare_and_extract_metrics(section_file, dataset, column_name, metric_column):
    """
    Compares section content with a dataset and extracts relevant metrics.
    """
    with open(section_file, 'r') as file:
        section_content = file.read()
    
    # Find matches where the column_name appears in the section content
    matches = dataset[dataset[column_name].apply(lambda x: str(x).lower() in section_content.lower())]
    return matches[[metric_column]]

def calculate_resume_score(company_ranks, skills_scores, university_rankings, has_work_experience):
    """
    Calculates a resume score based on extracted metrics.
    """
    # Use raw scores instead of normalized scores
    company_score = company_ranks['Rank'].min() if not company_ranks.empty else 0
    skills_score = skills_scores['Score'].sum() if not skills_scores.empty else 0
    university_score = university_rankings['ranking'].min() if not university_rankings.empty else 0
    
    # Assign weights
    company_weight = 2.0  # Adjust as needed
    skills_weight = 1.5   # Skills might be more important
    university_weight = 1.0  # Universities might be less important
    
    # Calculate weighted total score
    total_score = (
        (company_score * company_weight) +
        (skills_score * skills_weight) +
        (university_score * university_weight)
    )
    
    # Apply penalty for resumes without work experience
    if not has_work_experience:
        total_score *= 0.8  # Reduce score by 20% if no work experience
    
    # Scale the final score to a range of 0 to 100
    scaled_score = (total_score / 10)  # Adjust the divisor based on expected score range
    return round(scaled_score, 2)

def process_resume(pdf_path):
    """
    Processes a resume PDF and calculates its score.
    """
    # Extract text from PDF
    resume_text = extract_text_from_pdf(pdf_path)
    
    # Parse resume sections
    resume_sections = parse_resume_sections(resume_text)
    
    # Log missing sections
    expected_sections = ['Experience', 'Skills', 'Education']
    for section in expected_sections:
        if section.upper() not in resume_sections:
            logging.warning(f"Section '{section}' not found in resume.")
    
    # Save sections to files
    save_sections_to_files(resume_sections)
    
    # Load datasets
    company_df = load_dataset('Resume_Scrapper/Datasets/Companies_Dataset.csv')
    skills_df = load_dataset('Resume_Scrapper/Datasets/Skills_Dataset.csv')
    universities_df = load_dataset('Resume_Scrapper/Datasets/Universities_Dataset.csv')
    
    # Compare and extract metrics
    company_ranks = compare_and_extract_metrics('experience.txt', company_df, 'Name', 'Rank')
    skills_scores = compare_and_extract_metrics('skills.txt', skills_df, 'Skill', 'Score')
    university_rankings = compare_and_extract_metrics('education.txt', universities_df, 'University', 'ranking')
    
    # Check if work experience exists
    has_work_experience = 'EXPERIENCE' in resume_sections and bool(resume_sections['EXPERIENCE'])
    
    # Calculate resume score
    resume_score = calculate_resume_score(company_ranks, skills_scores, university_rankings, has_work_experience)
    
    return resume_score

# List of resume files
resume_files = [
    '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/autoCV (3).pdf',
    '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/resume_2 (1).pdf',
    '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/Resume.pdf',
    '/workspaces/SIMS-Project/Resume_Scrapper/Resumes/Resume_2.pdf'
]

# Calculate scores for all resumes
resume_scores = []
for resume_file in resume_files:
    try:
        score = process_resume(resume_file)
        resume_scores.append((resume_file, score))  # Store as (resume_file, score) tuple
    except Exception as e:
        logging.error(f"Error processing {resume_file}: {e}")

# Sort resumes by score (highest to lowest)
resume_scores.sort(key=lambda x: x[1], reverse=True)

# Print ranked resumes
print("Ranked Resumes (Highest to Lowest):")
for rank, (resume_file, score) in enumerate(resume_scores, start=1):
    print(f"Rank {rank}: {resume_file[24:]} (Score: {score})")

INFO:root:✅ Extracted sections: ['BASIC_INFO', 'SUMMARY', 'SKILLS', 'EXPERIENCE', 'PROJECTS', 'EDUCATION']
ERROR:root:Error processing /workspaces/SIMS-Project/Resume_Scrapper/Resumes/autoCV (3).pdf: [Errno 2] No such file or directory: 'Resume_Scrapper/Datasets/Companies_Dataset.csv'
INFO:root:✅ Extracted sections: ['BASIC_INFO', 'SUMMARY', 'EXPERIENCE', 'PROJECTS', 'EDUCATION']
ERROR:root:Error processing /workspaces/SIMS-Project/Resume_Scrapper/Resumes/resume_2 (1).pdf: [Errno 2] No such file or directory: 'Resume_Scrapper/Datasets/Companies_Dataset.csv'
INFO:root:✅ Extracted sections: ['BASIC_INFO', 'SUMMARY', 'SKILLS', 'PROJECTS', 'EDUCATION']
ERROR:root:Error processing /workspaces/SIMS-Project/Resume_Scrapper/Resumes/Resume.pdf: [Errno 2] No such file or directory: 'Resume_Scrapper/Datasets/Companies_Dataset.csv'
INFO:root:✅ Extracted sections: ['BASIC_INFO', 'EDUCATION', 'SKILLS']
ERROR:root:Error processing /workspaces/SIMS-Project/Resume_Scrapper/Resumes/Resume_2.pdf: [Errno 

Ranked Resumes (Highest to Lowest):


In [1]:
import pdfplumber
import re
import pandas as pd

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

def parse_resume_sections(resume_text):
    sections = {
        'Summary': '',
        'Skills': '',
        'Education': '',
        'Work Experience': '',
        'Projects': ''
    }
    
    # Improved regex to handle section extraction
    summary_match = re.search(r'Summary(.*?)(Skills|Education|Work Experience|Projects)', resume_text, re.DOTALL)
    if summary_match:
        sections['Summary'] = summary_match.group(1).strip()
    
    skills_match = re.search(r'Skills(.*?)(Education|Work Experience|Projects)', resume_text, re.DOTALL)
    if skills_match:
        sections['Skills'] = skills_match.group(1).strip()
    
    education_match = re.search(r'Education(.*?)(Work Experience|Projects)', resume_text, re.DOTALL)
    if education_match:
        sections['Education'] = education_match.group(1).strip()
    
    experience_match = re.search(r'Work Experience(.*?)Projects', resume_text, re.DOTALL)
    if experience_match:
        sections['Work Experience'] = experience_match.group(1).strip()
    
    projects_match = re.search(r'Projects(.*?)$', resume_text, re.DOTALL)
    if projects_match:
        sections['Projects'] = projects_match.group(1).strip()
    
    return sections

def save_sections_to_files(sections):
    for section_name, section_content in sections.items():
        with open(f'{section_name.lower().replace(" ", "_")}.txt', 'w') as file:
            file.write(section_content)

def load_dataset(file_path):
    return pd.read_csv(file_path)

def compare_and_extract_metrics(section_file, dataset, column_name, metric_column):
    with open(section_file, 'r') as file:
        section_content = file.read()
    
    # Find matches where the column_name appears in the section content
    matches = dataset[dataset[column_name].apply(lambda x: str(x).lower() in section_content.lower())]
    return matches[[metric_column]]

def calculate_resume_score(company_ranks, skills_scores, university_rankings, has_work_experience):
    # Use raw scores instead of normalized scores
    company_score = company_ranks['Rank'].min() if not company_ranks.empty else 0
    skills_score = skills_scores['Score'].sum() if not skills_scores.empty else 0
    university_score = university_rankings['ranking'].min() if not university_rankings.empty else 0
    
    # Assign weights
    company_weight = 2.0  # Adjust as needed
    skills_weight = 1.5   # Skills might be more important
    university_weight = 1.0  # Universities might be less important
    
    # Calculate weighted total score
    total_score = (
        (company_score * company_weight) +
        (skills_score * skills_weight) +
        (university_score * university_weight)
    )
    
    # Apply penalty for resumes without work experience
    if not has_work_experience:
        total_score *= 0.8  # Reduce score by 20% if no work experience
    
    # Scale the final score to a range of 0 to 100
    scaled_score = (total_score / 10)  # Adjust the divisor based on expected score range
    return round(scaled_score, 2)

# Function to process a resume and return its score
def process_resume(pdf_path):
    # Extract text from PDF
    resume_text = extract_text_from_pdf(pdf_path)
    
    # Parse resume sections
    resume_sections = parse_resume_sections(resume_text)
    
    # Save sections to files
    save_sections_to_files(resume_sections)
    
    # Load datasets
    company_df = load_dataset('Resume_Scrapper/Datasets/Companies_Dataset.csv')
    skills_df = load_dataset('Resume_Scrapper/Datasets/Skills_Dataset.csv')
    universities_df = load_dataset('Resume_Scrapper/Datasets/Universities_Dataset.csv')
    
    # Compare and extract metrics
    company_ranks = compare_and_extract_metrics('work_experience.txt', company_df, 'Name', 'Rank')
    skills_scores = compare_and_extract_metrics('skills.txt', skills_df, 'Skill', 'Score')
    university_rankings = compare_and_extract_metrics('education.txt', universities_df, 'University', 'ranking')
    
    # Check if work experience exists
    has_work_experience = bool(resume_sections['Work Experience'])
    
    # Calculate resume score
    resume_score = calculate_resume_score(company_ranks, skills_scores, university_rankings, has_work_experience)
    
    return resume_score

# List of resume files
resume_files = [
    'Resume_Scrapper/Resumes/autoCV (3).pdf',
    'Resume_Scrapper/Resumes/resume_2 (1).pdf',
    'Resume_Scrapper/Resumes/Resume.pdf'
]

# Calculate scores for all resumes
resume_scores = []
for resume_file in resume_files:
    score = process_resume(resume_file)
    resume_scores.append((resume_file, score))  # Store as (resume_file, score) tuple

# Sort resumes by score (highest to lowest)
resume_scores.sort(key=lambda x: x[1], reverse=True)

# Print ranked resumes
print("Ranked Resumes (Highest to Lowest):")
for rank, (resume_file, score) in enumerate(resume_scores, start=1):
    print(f"Rank {rank}: {resume_file[24:]} (Score: {score})")

FileNotFoundError: [Errno 2] No such file or directory: 'Resume_Scrapper/Resumes/autoCV (3).pdf'

In [None]:
def Resume_score_calc_1(resume_text):
    # Split the resume text into lines
    resume_lines = resume_text.split("\n")
    
    # Split each line into words and store them in a list
    final_text = []
    for line in resume_lines:
        words = line.split(" ")
        final_text.extend(word.lower() for word in words)

    # Track the character position in the original resume_text
    char_idx = 0

    # Iterate through the words to find keywords
    for word in final_text:
        for keyword in keywords:
            if word == keyword.lower():  # Case-insensitive comparison
                # Find the position of the keyword in the original resume_text
                keyword_pos = resume_text.lower().find(keyword.lower(), char_idx)
                if keyword_pos != -1:  # If the keyword is found
                    # Write the text AFTER the keyword to a file
                    with open(f'Resume_Scrapper/Downloaded/resume_text/{keyword}.txt', "w", encoding='utf-8') as f:
                        f.write(resume_text[keyword_pos + len(keyword):].strip())
                    print(f"Downloaded: {keyword}.txt")
        # Update char_idx to the end of the current word
        char_idx += len(word) + 1  # +1 for the space after the word

def resume(resume_text):
    count = 0
    for keyword in keywords:
        if keyword+"\n" in resume_text:
            print(resume_text[:resume_text.index(keyword + "\n")])
            count += 1

def Resume_score_calc(resume_text):
    # Split the resume text into lines
    resume_lines = resume_text.split("\n")
    
    # Split each line into words and store them in a list
    final_text = []
    for line in resume_lines:
        words = line.split(" ")
        final_text.extend(word.lower() for word in words)

    # Track the character position in the original resume_text
    char_idx = 0

    # Dictionary to track if a keyword has been processed
    keywords_processed = {keyword: False for keyword in keywords}

    # Variable to store the previous keyword
    prev_keyword = "Basic_Info"

    # Iterate through the words to find keywords
    for word in final_text:
        for keyword in keywords:
            if not keywords_processed[keyword] and word == keyword.lower():  # Case-insensitive comparison
                # Write the text up to the current character position to a file
                with open(f'Resume_Scrapper/Downloaded/resume_text/{prev_keyword}.txt', "w", encoding='utf-8') as f:
                    f.write(resume_text[:char_idx].strip())
                print(f"Downloaded: {prev_keyword}.txt")
                
                # Mark the keyword as processed
                keywords_processed[keyword] = True
                
                # Reset resume_text and char_idx after writing
                resume_text = resume_text[char_idx:]
                char_idx = 0
                prev_keyword = keyword
                break  # Stop after the first keyword is found
        # Update char_idx to the end of the current word
        char_idx += len(word) + 1  # +1 for the space after the word

    # Write the remaining text (after the last keyword) to the file for the last keyword
    with open(f'Resume_Scrapper/Downloaded/resume_text/{prev_keyword}.txt', "w", encoding='utf-8') as f:
        f.write(resume_text.strip())
    print(f"Downloaded: {prev_keyword}.txt")

import os
import re

def Resume_score_calc_new(resume_text):
    # Define keywords in the order they should appear
    keywords_final = []
    resume_lines = resume_text.split("\n")
    
    # Split each line into words and store them in a list
    final_text = []
    for line in resume_lines:
        words = line.split(" ")
        final_text.extend(word.lower() for word in words)

    # Track the character position in the original resume_text
    char_idx = 0

    # Dictionary to track if a keyword has been processed
    keywords_processed = {keyword: False for keyword in keywords}

    # Variable to store the previous keyword
    prev_keyword = "Basic_Info"

    # Iterate through the words to find keywords
    for word in final_text:
        for keyword in keywords:
            if not keywords_processed[keyword] and word == keyword.lower(): 
                keywords_final.append(keyword)
    print(keywords_final)
    
    # Create directory if needed
    os.makedirs("Resume_Scrapper/Downloaded/resume_text", exist_ok=True)
    
    # Preprocess text for better matching
    processed_text = "\n" + resume_text.replace("\r", "") + "\n"
    
    sections = {}
    remaining_text = processed_text
    prev_section = "Basic_Info"
    
    # Find sections in order
    for keyword in keywords_final:
        # Regex pattern for section header detection
        pattern = re.compile(
            rf'\n\s*{re.escape(keyword)}\s*\n+',
            re.IGNORECASE | re.DOTALL
        )
        
        match = pattern.search(remaining_text)
        if match:
            # Extract content between previous section and this section
            content = remaining_text[:match.start()].strip()
            sections[prev_section] = content
        
            # Update remaining text and current section
            remaining_text = remaining_text[match.end():]
            prev_section = keyword.upper()
    
    # Add remaining content to last section
    sections[prev_section] = remaining_text.strip()
    
    # Save to files
    for section, content in sections.items():
        filename = f"{section.replace(' ', '_')}.txt"
        with open(f'Resume_Scrapper/Downloaded/resume_text/{filename}', 'w', encoding='utf-8') as f:
            f.write(content)
    
    print(f"Successfully extracted sections: {list(sections.keys())}")


In [None]:
def Resume_score_calc_new(resume_text):
    # Define keywords in the order they should appear
    keywords_final = []
    resume_lines = resume_text.split("\n")
    
    # Split each line into words and store them in a list
    final_text = []
    for line in resume_lines:
        words = line.split(" ")
        final_text.extend(word.lower() for word in words)

    # Track the character position in the original resume_text
    char_idx = 0

    # Dictionary to track if a keyword has been processed
    keywords_processed = {keyword: False for keyword in keywords}

    # Variable to store the previous keyword
    prev_keyword = "Basic_Info"

    # Iterate through the words to find keywords
    for word in final_text:
        for keyword in keywords:
            if not keywords_processed[keyword] and word == keyword.lower(): 
                keywords_final.append(keyword)
    
    # Create directory if needed
    os.makedirs("Resume_Scrapper/Downloaded/resume_text", exist_ok=True)
    
    # Preprocess text for better matching
    processed_text = "\n" + resume_text.replace("\r", "") + "\n"
    
    sections = {}
    remaining_text = processed_text
    prev_section = "Basic_Info"
    
    # Find sections in order
    for keyword in keywords_final:
        # Regex pattern for section header detection
        pattern = re.compile(
            rf'\n\s*{re.escape(keyword)}\s*\n+',
            re.IGNORECASE | re.DOTALL
        )
        
        match = pattern.search(remaining_text)
        if match:
            # Extract content between previous section and this section
            content = remaining_text[:match.start()].strip()
            sections[prev_section] = content
        
            # Update remaining text and current section
            remaining_text = remaining_text[match.end():]
            prev_section = keyword.upper()
    
    # Add remaining content to last section
    sections[prev_section] = remaining_text.strip()
    
    # Save to files
    for section, content in sections.items():
        filename = f"{section.replace(' ', '_')}.txt"
        with open(f'Resume_Scrapper/Downloaded/resume_text/{filename}', 'w', encoding='utf-8') as f:
            f.write(content)
    
    print(f"Successfully extracted sections: {list(sections.keys())}")


In [None]:
    # company_score = 0
    # for company, rank in company_ranking.items():
    #     # print(company)
    #     for comp in final_text:
    #         if company.lower() == comp:
    #             print(f"found {company}")
    #             company_score += rank
    #             break
    #             # for company_tier, score_tier in company_tiers:
    #             #     if rank > company_tier[0] and rank < company_tier[1]:
    #             #         company_score += score_tier

    # if "nvidia" in final_text:
    #     print("Found")

    # print(company_score)

In [None]:
resume_text = resume_text.split("\n")
final_text = []
for line in resume_text:
    line = line.split(" ")
    final_text.extend(word.lower() for word in line)

# print(final_text)

company_score = 0
for company, rank in company_ranking.items():
    # print(company)
    for comp in final_text:
        if company.lower() == comp:
            print(f"found {company}")
            company_score += rank
            break
            # for company_tier, score_tier in company_tiers:
            #     if rank > company_tier[0] and rank < company_tier[1]:
            #         company_score += score_tier

if "nvidia" in final_text:
    print("Found")

print(company_score)