In [5]:
#Step 1: Install Required Libraries

!pip install pymupdf python-docx nltk



In [6]:
#Step 2: Download NLTK Resources

import nltk
# Fetch the tokenizer models, stop-word list and WordNet data used by the analyzer.
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
#Step 3: Upload Resume

from google.colab import files

# Upload resume file
uploaded = files.upload()

# Nothing may come back (e.g. the upload dialog was dismissed); fail with a
# clear message instead of an opaque IndexError from indexing an empty list.
if not uploaded:
    raise ValueError("No file was uploaded. Please re-run this cell and choose a PDF or DOCX resume.")

# Get file path: the first key of the returned mapping is the uploaded file's name
file_path = next(iter(uploaded))
print(f" File uploaded: {file_path}")


Saving data-engineer-resume-example.pdf to data-engineer-resume-example.pdf
 File uploaded: data-engineer-resume-example.pdf


In [8]:
#Step 4: Complete Resume Analyzer Code

import os
import re
import fitz  # PyMuPDF
from docx import Document
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# 1. Extract Text
def extract_text_from_file(file_path):
    """Return the text of a resume file, picking the reader from its extension.

    The extension is compared case-insensitively: ``.pdf`` files are passed to
    ``extract_text_from_pdf`` and ``.docx`` files to ``extract_text_from_docx``.

    Args:
        file_path: path of the resume file to read.

    Returns:
        Whatever the selected extractor returns for ``file_path``.

    Raises:
        ValueError: if the extension is neither ``.pdf`` nor ``.docx``.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Guard clauses: return as soon as a supported extension is recognised.
    if suffix == ".pdf":
        return extract_text_from_pdf(file_path)
    if suffix == ".docx":
        return extract_text_from_docx(file_path)

    raise ValueError(" Unsupported file format. Please upload PDF or DOCX.")

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        extra separator inserted between pages.
    """
    # Open through a context manager so the document handle is released even
    # if extracting text from a page raises.
    with fitz.open(file_path) as doc:
        # join() avoids rebuilding the string on every page.
        return "".join(page.get_text() for page in doc)

def extract_text_from_docx(file_path):
    """Return the text of a DOCX file, one line per paragraph.

    Args:
        file_path: path of the DOCX file to open with python-docx.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, joined
        with newline characters.
    """
    document = Document(file_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)

# 2. Preprocess Text
def preprocess_text(text):
    """Normalise raw text into a list of lemmatized, stop-word-free tokens.

    Steps: lowercase the text, turn every character that is not an ASCII
    letter or whitespace into a space, tokenize with NLTK's ``word_tokenize``,
    drop English stop words, then lemmatize each remaining token with
    ``WordNetLemmatizer``.

    Args:
        text: the raw text to process.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    # Replace non-letters with a space instead of deleting them: deleting
    # fuses words joined by punctuation ("Python/SQL" -> "pythonsql",
    # "data-driven" -> "datadriven"), which hides them from skill matching.
    cleaned_text = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
    tokens = word_tokenize(cleaned_text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# 3. Analyze Resume
def analyze_resume(text, required_skills=None):
    """Print a report of which required skills appear in a resume's text.

    The resume text and every required skill are both run through
    ``preprocess_text`` so they are compared in the same normalised form.
    A skill counts as found when all of its normalised tokens occur among the
    resume's tokens.

    Args:
        text: raw resume text.
        required_skills: optional iterable of skill strings to look for.
            When omitted, the built-in default set below is used.

    Returns:
        None. The report is written to standard output.
    """
    if required_skills is None:
        required_skills = {
            "python", "sql", "excel", "machine", "learning", "data", "analysis",
            "communication", "tableau", "statistics", "presentation"
        }
    required_skills = set(required_skills)

    resume_tokens = set(preprocess_text(text))

    # Normalise each skill the same way as the resume. Comparing the raw skill
    # string against lemmatized tokens can never match a skill that the
    # lemmatizer rewrites (a plural-looking word such as "statistics" may be
    # reduced to a shorter form).
    matched_skills = set()
    for skill in required_skills:
        # If normalisation removes everything (e.g. the skill is a stop word),
        # fall back to the lowercased skill so it is not vacuously "found".
        skill_tokens = preprocess_text(skill) or [skill.lower()]
        if all(token in resume_tokens for token in skill_tokens):
            matched_skills.add(skill)
    missing_skills = required_skills - matched_skills

    print("\n=============================")
    print(" RESUME ANALYSIS REPORT ")
    print("=============================")
    print(f" Skills Found ({len(matched_skills)}):")
    print(", ".join(sorted(matched_skills)) if matched_skills else "None")

    print(f"\n Skills Missing ({len(missing_skills)}):")
    print(", ".join(sorted(missing_skills)) if missing_skills else "None")

    if missing_skills:
        print("\n Suggestions to Improve Resume:")
        for skill in sorted(missing_skills):
            print(f" Add or highlight: {skill}")
    else:
        print("\n Your resume covers all required skills. Excellent!")

# 4. Run Analyzer
# Read the text of the uploaded resume; file_path is assigned in the upload cell (Step 3).
resume_text = extract_text_from_file(file_path)
# Print the skills-found / skills-missing report for that text.
analyze_resume(resume_text)



 RESUME ANALYSIS REPORT
 Skills Found (4):
data, python, sql, tableau

 Skills Missing (7):
analysis, communication, excel, learning, machine, presentation, statistics

 Suggestions to Improve Resume:
 Add or highlight: analysis
 Add or highlight: communication
 Add or highlight: excel
 Add or highlight: learning
 Add or highlight: machine
 Add or highlight: presentation
 Add or highlight: statistics
