In [8]:
import re
import fitz  # PyMuPDF
import base64
import streamlit as st
import spacy
import csv
import nltk

nltk.download('punkt')

# Load spaCy model
nlp = spacy.load('en_core_web_sm')
nlp_skills = spacy.load('TrainedModel/skills')  # Custom NER model for skills

# Load keywords from CSV
def load_keywords(file_path):
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        return set(row[0] for row in reader)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [45]:
# Extract Name

def extract_name(text):
    # Take top 10 lines from the text
    lines = text.split('\n')
    top_lines = '\n'.join(lines[:10])

    # Run spaCy on top lines
    top_doc = nlp(top_lines)

    # List of keywords to avoid (to prevent false positives like "Query Rewrite")
    bad_keywords = ['query', 'rewrite', 'data', 'sql', 'analytics', 'project', 'objective']

    for ent in top_doc.ents:
        if ent.label_ == 'PERSON':
            name_candidate = ent.text.strip()
            if not any(word.lower() in bad_keywords for word in name_candidate.split()):
                names = name_candidate.split()
                if len(names) >= 2 and all(name[0].isupper() for name in names):
                    return names[0], ' '.join(names[1:])
    return "", ""

In [46]:
# Extract Email

# Extract Email
def extract_email(text):
    email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    return email_match.group() if email_match else ""

In [47]:
# Extract Ph No

# Extract Contact Number
def extract_contact_number(text):
    pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
    match = re.search(pattern, text)
    return match.group() if match else ""

In [48]:
# # Extract Education

# Extract Education
def extract_education(text):
    doc = nlp(text)
    universities = []
    for ent in doc.ents:
        if ent.label_ == "ORG" and any(x in ent.text.lower() for x in ["university", "college", "institute"]):
            universities.append(ent.text)
    return universities

In [49]:
# Extract Skills

# CSV-based Skills Extraction
def csv_skills(text):
    skills_keywords = load_keywords('newSkills.csv')
    return {kw for kw in skills_keywords if kw.lower() in text.lower()}

# NER-based Skills Extraction
def extract_skills_from_ner(text):
    non_skill_labels = {'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL', 'EMAIL'}
    doc = nlp_skills(text)
    return {ent.text.strip() for ent in doc.ents if ent.label_ == 'SKILL' and ent.label_ not in non_skill_labels}

# Combine and Clean Skills
def extract_skills(text):
    skills = csv_skills(text).union(extract_skills_from_ner(text))
    return [s for s in skills if s and s.isalpha()]

In [50]:
# # Extract Major
# Major/Degree Extraction
def extract_major(text):
    majors = load_keywords('majors.csv')
    for keyword in majors:
        if keyword.lower() in text.lower():
            return keyword
    return ""

In [60]:
# # Extract Experience
# Experience Extraction
def extract_experience(text):
    doc = nlp(text)
    verbs = [token.lemma_.lower() for token in doc if token.pos_ == 'VERB']
    senior = ['lead', 'manage', 'direct', 'oversee']
    mid = ['develop', 'design', 'analyze', 'implement']
    junior = ['assist', 'support', 'contribute']

    if any(v in verbs for v in senior):
        level = "Senior"
    elif any(v in verbs for v in mid):
        level = "Mid-Senior"
    elif any(v in verbs for v in junior):
        level = "Mid-Junior"
    else:
        level = "Entry Level"

    # position = suggest_position(verbs)
    return {'level_of_experience': level}

In [62]:
# # Suggest Position

# Required imports
import pickle
import re

# Load TF-IDF, classifier, and encoder
tfidf = pickle.load(open('tfidf.pkl', 'rb'))
svc_model = pickle.load(open('clf.pkl', 'rb'))
le = pickle.load(open('encoder.pkl', 'rb'))

# Clean the resume text
def cleanResume(txt):
    cleanText = re.sub('http\S+\s', ' ', txt)
    cleanText = re.sub('RT|cc', ' ', cleanText)
    cleanText = re.sub('#\S+\s', ' ', cleanText)
    cleanText = re.sub('@\S+', '  ', cleanText)  
    cleanText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText) 
    cleanText = re.sub('\s+', ' ', cleanText)
    return cleanText.strip()

# Predict the position from resume text
def predict_position_from_text(text):
    cleaned_text = cleanResume(text)
    vectorized = tfidf.transform([cleaned_text]).toarray()
    prediction = svc_model.predict(vectorized)
    return le.inverse_transform(prediction)[0]


  cleanText = re.sub('http\S+\s', ' ', txt)
  cleanText = re.sub('#\S+\s', ' ', cleanText)
  cleanText = re.sub('@\S+', '  ', cleanText)
  cleanText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
  cleanText = re.sub('\s+', ' ', cleanText)


In [63]:
# def calculate_resume_score(resume_info):
# Resume Scoring
def calculate_resume_score(info):
    score = 0
    score += 25 if info['first_name'] and info['last_name'] else 0
    score += 25 if info['email'] else 0
    score += 25 if info['degree_major'] else 0
    score += 25 if info['skills'] else 0
    return score

In [64]:
# Suggested Skills for Job Title
def suggest_skills_for_job(job):
    job_skills = {}
    with open('sugestedSkills.csv') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            job_title = row[0].strip().lower()
            skills = [s.strip() for s in row[1:] if s]
            job_skills[job_title] = skills
    return job_skills.get(job.lower(), [])

In [65]:
## testing

In [1]:
import fitz  # PyMuPDF

# Load resume PDF and extract text
doc = fitz.open(r"C:\Users\HP\Desktop\sample_of_a_resume4.pdf")
text = "\n".join([page.get_text() for page in doc])

# Extract all info
first_name, last_name = extract_name(text)
email = extract_email(text)
phone = extract_contact_number(text)
skills = extract_skills(text)
degree_major = extract_major(text)
experience = extract_experience(text)

# Use ML model to predict position
predicted_position = predict_position_from_text(text)

resume_info = {
    'first_name': first_name,
    'last_name': last_name,
    'email': email,
    'phone': phone,
    'skills': skills,
    'degree_major': degree_major,
    'experience': {
        'level_of_experience': experience['level_of_experience'],
        'suggested_position': predicted_position   # REPLACED OLD POSITION WITH ML ONE
    }
}

# Display
print("First Name:", resume_info['first_name'])
print("Last Name:", resume_info['last_name'])
print("Email:", resume_info['email'])
print("Phone:", resume_info['phone'])
print("Degree/Major:", resume_info['degree_major'])
print("Skills:", ', '.join(resume_info['skills']))
print("Experience Level:", resume_info['experience']['level_of_experience'])
print("Suggested Position:", resume_info['experience']['suggested_position'])

score = calculate_resume_score(resume_info)
print("Resume Score:", score)

suggested_skills = suggest_skills_for_job(resume_info['experience']['suggested_position'])
print("Suggested Skills for This Role:", ', '.join(suggested_skills) if suggested_skills else "None")


NameError: name 'extract_name' is not defined

In [3]:
import re
import fitz  # PyMuPDF
import base64
import streamlit as st
import spacy
import csv
import nltk
nltk.download('punkt')
# Load spaCy model
nlp = spacy.load('en_core_web_sm')
nlp_skills = spacy.load('TrainedModel/skills')  # Custom NER model for skills
# Load keywords from CSV
def load_keywords(file_path):
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        return set(row[0] for row in reader)



# Extract Name
def extract_name(text):
    # Take top 10 lines from the text
    lines = text.split('\n')
    top_lines = '\n'.join(lines[:10])

    # Run spaCy on top lines
    top_doc = nlp(top_lines)

    # List of keywords to avoid (to prevent false positives like "Query Rewrite")
    bad_keywords = ['query', 'rewrite', 'data', 'sql', 'analytics', 'project', 'objective']

    for ent in top_doc.ents:
        if ent.label_ == 'PERSON':
            name_candidate = ent.text.strip()
            if not any(word.lower() in bad_keywords for word in name_candidate.split()):
                names = name_candidate.split()
                if len(names) >= 2 and all(name[0].isupper() for name in names):
                    return names[0], ' '.join(names[1:])
    return "", ""



# Extract Email
def extract_email(text):
    email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    return email_match.group() if email_match else ""



# Extract Contact Number
def extract_contact_number(text):
    pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
    match = re.search(pattern, text)
    return match.group() if match else ""



# Extract Education
def extract_education(text):
    doc = nlp(text)
    universities = []
    for ent in doc.ents:
        if ent.label_ == "ORG" and any(x in ent.text.lower() for x in ["university", "college", "institute"]):
            universities.append(ent.text)
    return universities



# Extract Skills
# CSV-based Skills Extraction
def csv_skills(text):
    skills_keywords = load_keywords('newSkills.csv')
    return {kw for kw in skills_keywords if kw.lower() in text.lower()}

# NER-based Skills Extraction
def extract_skills_from_ner(text):
    non_skill_labels = {'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL', 'EMAIL'}
    doc = nlp_skills(text)
    return {ent.text.strip() for ent in doc.ents if ent.label_ == 'SKILL' and ent.label_ not in non_skill_labels}

# Combine and Clean Skills
def extract_skills(text):
    skills = csv_skills(text).union(extract_skills_from_ner(text))
    return [s for s in skills if s and s.isalpha()]



# Extract Major
def extract_major(text):
    majors = load_keywords('majors.csv')
    for keyword in majors:
        if keyword.lower() in text.lower():
            return keyword
    return ""



# Extract Experience
def extract_experience(text):
    doc = nlp(text)
    verbs = [token.lemma_.lower() for token in doc if token.pos_ == 'VERB']
    senior = ['lead', 'manage', 'direct', 'oversee']
    mid = ['develop', 'design', 'analyze', 'implement']
    junior = ['assist', 'support', 'contribute']

    if any(v in verbs for v in senior):
        level = "Senior"
    elif any(v in verbs for v in mid):
        level = "Mid-Senior"
    elif any(v in verbs for v in junior):
        level = "Mid-Junior"
    else:
        level = "Entry Level"

    # position = suggest_position(verbs)
    return {'level_of_experience': level}



# Suggest Position
# Required imports
import pickle
import re
# Load TF-IDF, classifier, and encoder
tfidf = pickle.load(open('tfidf.pkl', 'rb'))
svc_model = pickle.load(open('clf.pkl', 'rb'))
le = pickle.load(open('encoder.pkl', 'rb'))

# Clean the resume text
def cleanResume(txt):
    cleanText = re.sub('http\S+\s', ' ', txt)
    cleanText = re.sub('RT|cc', ' ', cleanText)
    cleanText = re.sub('#\S+\s', ' ', cleanText)
    cleanText = re.sub('@\S+', '  ', cleanText)  
    cleanText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText) 
    cleanText = re.sub('\s+', ' ', cleanText)
    return cleanText.strip()

# Predict the position from resume text
def predict_position_from_text(text):
    cleaned_text = cleanResume(text)
    vectorized = tfidf.transform([cleaned_text]).toarray()
    prediction = svc_model.predict(vectorized)
    return le.inverse_transform(prediction)[0]



# Resume Scoring
def calculate_resume_score(info):
    score = 0
    score += 25 if info['first_name'] and info['last_name'] else 0
    score += 25 if info['email'] else 0
    score += 25 if info['degree_major'] else 0
    score += 25 if info['skills'] else 0
    return score



# Suggested Skills for Job Title
def suggest_skills_for_job(job):
    job_skills = {}
    with open('sugestedSkills.csv') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            job_title = row[0].strip().lower()
            skills = [s.strip() for s in row[1:] if s]
            job_skills[job_title] = skills
    return job_skills.get(job.lower(), [])



## testing
import fitz  # PyMuPDF

# Load resume PDF and extract text
doc = fitz.open(r"C:\Users\HP\Desktop\sample_of_a_resume5.pdf")
text = "\n".join([page.get_text() for page in doc])

# Extract all info
first_name, last_name = extract_name(text)
email = extract_email(text)
phone = extract_contact_number(text)
skills = extract_skills(text)
degree_major = extract_major(text)
experience = extract_experience(text)

# Use ML model to predict position
predicted_position = predict_position_from_text(text)

resume_info = {
    'first_name': first_name,
    'last_name': last_name,
    'email': email,
    'phone': phone,
    'skills': skills,
    'degree_major': degree_major,
    'experience': {
        'level_of_experience': experience['level_of_experience'],
        'suggested_position': predicted_position   # REPLACED OLD POSITION WITH ML ONE
    }
}

# Display
print("First Name:", resume_info['first_name'])
print("Last Name:", resume_info['last_name'])
print("Email:", resume_info['email'])
print("Phone:", resume_info['phone'])
print("Degree/Major:", resume_info['degree_major'])
print("Skills:", ', '.join(resume_info['skills']))
print("Experience Level:", resume_info['experience']['level_of_experience'])
print("Suggested Position:", resume_info['experience']['suggested_position'])

score = calculate_resume_score(resume_info)
print("Resume Score:", score)

suggested_skills = suggest_skills_for_job(resume_info['experience']['suggested_position'])
print("Suggested Skills for This Role:", ', '.join(suggested_skills) if suggested_skills else "None")

  cleanText = re.sub('http\S+\s', ' ', txt)
  cleanText = re.sub('#\S+\s', ' ', cleanText)
  cleanText = re.sub('@\S+', '  ', cleanText)
  cleanText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
  cleanText = re.sub('\s+', ' ', cleanText)
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


First Name: 
Last Name: 
Email: itsomsarraf@gmail.com
Phone: 91 9868540784
Degree/Major: 
Skills: TypeScript, Developing, Django, CFD, Intern, SQL, JavaScript, Java, Redis, PostgreSQL, multipage, GitHub, Lighthouse, MLH, OS, Git, June, Led, MongoDB, Hackathon, Lead, Go, React, Python, Docker, R, messages, Built
Experience Level: Senior
Suggested Position: Blockchain
Resume Score: 50
Suggested Skills for This Role: None
