In [None]:
# Install necessary libraries for OCR and NLP
# NOTE(review): the PyPI package `poppler-utils` looks like a small wrapper, not the
# Poppler command-line tools (those ship via apt); nothing in this notebook appears to
# use it - confirm whether it is needed.
!pip install pytesseract Pillow pdfminer.six tesserocr regex spacy poppler-utils
!pip install transformers torch
# Download the spaCy English model loaded later with spacy.load("en_core_web_sm");
# `pip install spacy` alone does not provide it.
!python -m spacy download en_core_web_sm
# Install Tesseract OCR engine; -y answers apt's confirmation prompt so the cell
# cannot stall or abort when it is run non-interactively.
!sudo apt-get install -y tesseract-ocr
# Verify Tesseract installation by checking its version
!tesseract --version

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting tesserocr
  Downloading tesserocr-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting poppler-utils
  Downloading poppler_utils-0.1.0-py3-none-any.whl.metadata (883 bytes)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tesserocr-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl (5.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading poppler_utils-0.1.0-py3-none-any.whl (9.2 kB)
Installing collected packages: tesserocr, pytesseract, poppler-utils, pdfminer.six
Successfully installed pdfminer.si

In [None]:
# This code extracts resume data from PDFs or images using OCR, text cleaning, regular expressions, and NLP.
# It first extracts text from a PDF or image, cleans the text, and uses regex to identify key sections
# like personal information, work experience, education, skills, and certifications, while spaCy extracts named entities.
# The extracted information is then serialized to a JSON string, which can be further processed or analyzed.
import re
import json  ## json v imp
import spacy  ## NLP library / NLTK
import pytesseract   ## OCR
from pdfminer.high_level import extract_text
from PIL import Image ## image

# Load the pre-trained NLP model
# `nlp` is a module-level object: extract_resume_data() calls it on the cleaned text
# to collect named entities. The "en_core_web_sm" package must already be installed
# in the runtime, otherwise spacy.load() fails.
nlp = spacy.load("en_core_web_sm")

# Define the function
def extract_resume_data(file_path):
    """Extract text from a resume file and organise it into labelled sections.

    The file is read with pdfminer (PDF) or Tesseract OCR (PNG/JPG/JPEG), the
    text is whitespace-normalised, known section headings are located with
    regular expressions, and named entities are collected with the
    module-level spaCy pipeline ``nlp``.

    Args:
        file_path: Path to a ``.pdf``, ``.png``, ``.jpg`` or ``.jpeg`` file.
            The extension is matched case-insensitively.

    Returns:
        A ``(json_output, clean_text)`` tuple. ``json_output`` is a JSON string
        with the keys ``personal_information``, ``work_experience``,
        ``education``, ``skills``, ``certifications`` (each the section text,
        heading included, or ``"N/A"``) and ``entities`` (a list of
        ``[text, label]`` pairs). ``clean_text`` is the extracted text with
        every run of whitespace collapsed to a single space.

    Raises:
        ValueError: If the file extension is not a supported PDF/image type.
    """
    # Heading variants (regex alternations) that introduce each output section.
    section_keywords = {
        "personal_information": 'Personal Information|Contact Information',
        "work_experience": 'Work Experience|Professional Experience|Employment',
        "education": 'Education|Academic Background|Qualifications',
        "skills": 'Skills|Technical Skills|Core Competencies',
        "certifications": 'Certifications|Licenses|Accreditations',
    }

    # Helper function to extract text from PDF or image
    def ocr_text_extraction(file_path):
        # Compare the extension case-insensitively so "scan.PDF" or "photo.JPG" work.
        lowered = str(file_path).lower()
        if lowered.endswith('.pdf'):
            return extract_text(file_path)
        if lowered.endswith(('.png', '.jpg', '.jpeg')):
            # Apply OCR directly on image; the context manager releases the file handle.
            with Image.open(file_path) as img:
                return pytesseract.image_to_string(img)
        raise ValueError("Unsupported file format. Use PDF or image.")

    # Helper function for text preprocessing
    def preprocess_text(text):
        # Collapse every whitespace run (including line breaks) and trim the ends.
        return re.sub(r'\s+', ' ', text).strip()

    # Helper function to extract sections using regex
    def extract_section(text, keyword, stop_keywords=None):
        # The alternation is wrapped in (?:...) so that ".*?" applies to every
        # heading variant, not only the last one. Because the cleaned text has no
        # blank lines left, a section ends at the next known heading or at the
        # end of the text. Matching is a case-insensitive, whole-word heuristic:
        # a heading word used inside body text will also be treated as a boundary.
        stop = rf'\b(?:{stop_keywords})\b|' if stop_keywords else ''
        pattern = rf'(\b(?:{keyword})\b.*?)(?={stop}\Z)'
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        return match.group(1).strip() if match else None

    # Extract text using OCR
    raw_text = ocr_text_extraction(file_path)

    # Preprocess the text
    clean_text = preprocess_text(raw_text)

    # Use regex to identify sections; each one stops where any *other* section starts.
    sections = {}
    for name, keyword in section_keywords.items():
        other_headings = '|'.join(
            other for other_name, other in section_keywords.items() if other_name != name
        )
        sections[name] = extract_section(clean_text, keyword, other_headings)

    # Named entities from spaCy, stored as (text, label) pairs.
    doc = nlp(clean_text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Create structured JSON output; sections that were not found become "N/A".
    resume_data = {name: (found if found else "N/A") for name, found in sections.items()}
    resume_data["entities"] = entities  # Captured using SpaCy

    # Convert to JSON format
    json_output = json.dumps(resume_data, indent=4)
    return json_output, clean_text

# Example usage
# Runs the extractor on one image file. The call returns two values: the JSON
# string (printed here) and the whitespace-normalised text, which is kept in
# `extracted_text` because later cells read it.
# NOTE(review): '/content/...' looks like a Colab runtime path - adjust when running elsewhere.
file_path = '/content/Resume 1.png'
resume_json,extracted_text = extract_resume_data(file_path)
print(resume_json)


{
    "personal_information": "N/A",
    "work_experience": "N/A",
    "education": "N/A",
    "skills": "N/A",
    "certifications": "N/A",
    "entities": [
        [
            "MARIANA",
            "PERSON"
        ],
        [
            "2008",
            "DATE"
        ],
        [
            "Creative Design Wardiere University",
            "ORG"
        ],
        [
            "German",
            "NORP"
        ],
        [
            "Spanish",
            "NORP"
        ],
        [
            "Any City @\u00ae",
            "GPE"
        ],
        [
            "Lorem",
            "PERSON"
        ],
        [
            "nisl",
            "ORG"
        ],
        [
            "nec",
            "ORG"
        ],
        [
            "Fusce",
            "ORG"
        ],
        [
            "consequat sapien",
            "PERSON"
        ],
        [
            "eu",
            "PERSON"
        ],
        [
            "Fusce",
            "ORG"
       

In [None]:
# Display the cleaned text that extract_resume_data() returned alongside the JSON.
extracted_text

'MARIANA NAPOLITANI GRAPHIC DESIGNER EDUCATION Wardiere University 2008 Master of Arts in Creative Design Wardiere University 2004 Bachelor of Arts in Creative Design SKILLS Web Design Branding Graphic Design Motion Graphics Illustration LANGUAGE English German Spanish helloereallygreatsite.com +123-456-7890 123 Anywhere St., Any City @®o Ff R reallygreatsite.com ABOUT ME Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam sagittis pretium nisl, nec commodo est. Fusce laoreet consequat sapien, eu fermentum ex pulvinar eget. Praesent hendrerit nulla in varius pharetra. Fusce facilisis venenatis lacus in lobortis. Fusce vulputate iaculis mauris. Nune risus arcu, tempor vel dignissim porta, vulputate id quam. Vestibulum pellentesque augue in lobortis ullamcorper. In eleifend nisl faucibus molestie porttitor. augue in lobortis ullamcorper. In eleifend nisl faucibus. WORK EXPERIENCE SENIOR GRAPHIC DESIGNER 2019 - 2022 STUDIO SHODWE Lorem ipsum dolor sit amet, consectetur adipis

In [None]:
# Load a pre-trained BERT model for sequence classification.
# It loads the BERT tokenizer and model, specifically the 'bert-base-uncased' version, which can be replaced with a fine-tuned model if available.
# A classification pipeline is created to easily classify text input using the pre-trained BERT model and tokenizer.
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'  # Replace with your fine-tuned model name if available

# Label names that classify_sections() dispatches on. Without an explicit mapping
# the classification head is created with the library's generic default labels,
# which never equal these names, so every sentence would be discarded.
SECTION_LABELS = [
    "PERSONAL_INFORMATION",
    "WORK_EXPERIENCE",
    "EDUCATION",
    "SKILLS",
    "CERTIFICATIONS",
]
id2label = dict(enumerate(SECTION_LABELS))
label2id = {label: index for index, label in id2label.items()}

tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE: for the base checkpoint the classification head is freshly initialised, so
# predictions are arbitrary until the model is fine-tuned on labelled resume text.
# NOTE(review): if a fine-tuned checkpoint with its own label set is substituted,
# check that it agrees with SECTION_LABELS (order and count) before keeping these overrides.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(SECTION_LABELS),
    id2label=id2label,
    label2id=label2id,
)

# Create a pipeline for classification
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# This function, `classify_sections`, takes a block of text and a classifier to categorize the text into predefined sections.
# The text is split into individual sentences, and each sentence is classified using the provided classifier.
# Based on the classification label (e.g., PERSONAL_INFORMATION, WORK_EXPERIENCE), sentences are added to the respective section in a dictionary.
# The function returns a dictionary with categorized sentences for each section, such as personal information, work experience, education, skills, and certifications.


def classify_sections(text, classifier, max_tokens=510):
    """Sort the sentences of ``text`` into resume sections using ``classifier``.

    Args:
        text: Resume text. It is split at line breaks and after sentence-ending
            punctuation (``.``, ``!``, ``?``), so text whose line breaks were
            already collapsed is still broken into sentences.
        classifier: Callable taking one string and returning a list whose first
            item is a dict with a ``'label'`` key (the shape read below). If it
            exposes a ``tokenizer`` attribute with ``tokenize`` and
            ``convert_tokens_to_string``, over-long sentences are shortened
            before classification.
        max_tokens: Maximum number of tokens sent to the classifier per
            sentence. The default leaves room for two special tokens within a
            512-token limit.

    Returns:
        dict mapping ``personal_information``, ``work_experience``,
        ``education``, ``skills`` and ``certifications`` to lists of the
        (stripped, otherwise unmodified) sentences assigned to them. Sentences
        whose label is not one of the five section labels are dropped.
    """
    label_to_section = {
        "PERSONAL_INFORMATION": "personal_information",
        "WORK_EXPERIENCE": "work_experience",
        "EDUCATION": "education",
        "SKILLS": "skills",
        "CERTIFICATIONS": "certifications",
    }
    classified_data = {section: [] for section in label_to_section.values()}

    # Use the classifier's own tokenizer (when it has one) for the length check.
    tok = getattr(classifier, "tokenizer", None)

    for segment in re.split(r'\n+|(?<=[.!?])\s+', text):
        sentence = segment.strip()
        if not sentence:
            continue  # nothing to classify on blank lines

        # Only the text sent to the model is shortened; the stored sentence is untouched.
        model_input = sentence
        if tok is not None:
            tokens = tok.tokenize(sentence)
            if len(tokens) > max_tokens:
                model_input = tok.convert_tokens_to_string(tokens[:max_tokens])

        result = classifier(model_input)
        if not result:
            continue
        # Normalise so that e.g. "skills" or "Work Experience" still match.
        label = str(result[0]['label']).upper().replace(' ', '_')

        # Add sentence to the corresponding section
        section = label_to_section.get(label)
        if section is not None:
            classified_data[section].append(sentence)

    return classified_data


In [None]:
# This code snippet installs the `transformers` library, essential for using pre-trained models like BERT.
# It imports necessary modules from `torch` and `transformers` to load a pre-trained BERT model and tokenizer for sequence classification.
# A text classification pipeline is created using BERT, enabling sentence-level classification of resume sections.
# The `classify_sections` function splits the input text into sentences, truncates any sentence longer than 510 tokens to fit within BERT's token limit,
# and classifies each sentence into one of five categories: personal information, work experience, education, skills, or certifications.
# The classified sentences are then organized into their respective categories.

# transformers is already installed by the setup cell at the top of the notebook;
# presumably repeated here so that this cell can be run on its own.
!pip install transformers

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'  # Replace with your fine-tuned model name if available

# Label names that classify_sections() dispatches on. Without an explicit mapping
# the classification head is created with the library's generic default labels,
# which never equal these names, so every sentence would be discarded.
SECTION_LABELS = [
    "PERSONAL_INFORMATION",
    "WORK_EXPERIENCE",
    "EDUCATION",
    "SKILLS",
    "CERTIFICATIONS",
]
id2label = dict(enumerate(SECTION_LABELS))
label2id = {label: index for index, label in id2label.items()}

tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE: for the base checkpoint the classification head is freshly initialised, so
# predictions are arbitrary until the model is fine-tuned on labelled resume text.
# NOTE(review): if a fine-tuned checkpoint with its own label set is substituted,
# check that it agrees with SECTION_LABELS (order and count) before keeping these overrides.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(SECTION_LABELS),
    id2label=id2label,
    label2id=label2id,
)

# Create a pipeline for classification
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

def classify_sections(text, classifier, max_tokens=510):
    """Sort the sentences of ``text`` into resume sections using ``classifier``.

    Args:
        text: Resume text. It is split at line breaks and after sentence-ending
            punctuation (``.``, ``!``, ``?``), so text whose line breaks were
            already collapsed is still broken into sentences.
        classifier: Callable taking one string and returning a list whose first
            item is a dict with a ``'label'`` key (the shape read below). Its
            ``tokenizer`` attribute, when present, is used for the length
            check; otherwise a module-level ``tokenizer`` is used if one exists.
        max_tokens: Maximum number of tokens sent to the classifier per
            sentence. The default of 510 accounts for the [CLS] and [SEP]
            tokens within BERT's 512-token limit.

    Returns:
        dict mapping ``personal_information``, ``work_experience``,
        ``education``, ``skills`` and ``certifications`` to lists of the
        (stripped, otherwise unmodified) sentences assigned to them. Sentences
        whose label is not one of the five section labels are dropped.
    """
    label_to_section = {
        "PERSONAL_INFORMATION": "personal_information",
        "WORK_EXPERIENCE": "work_experience",
        "EDUCATION": "education",
        "SKILLS": "skills",
        "CERTIFICATIONS": "certifications",
    }
    classified_data = {section: [] for section in label_to_section.values()}

    # Prefer the tokenizer that belongs to the classifier; fall back to the
    # notebook-level `tokenizer` so plain callables keep the earlier behaviour.
    tok = getattr(classifier, "tokenizer", None)
    if tok is None:
        tok = globals().get("tokenizer")

    for segment in re.split(r'\n+|(?<=[.!?])\s+', text):
        sentence = segment.strip()
        if not sentence:
            continue  # nothing to classify on blank lines

        # Truncate only what is sent to the model. The stored sentence keeps its
        # original casing and spacing instead of the tokenizer's round-tripped form.
        model_input = sentence
        if tok is not None:
            tokens = tok.tokenize(sentence)
            if len(tokens) > max_tokens:
                model_input = tok.convert_tokens_to_string(tokens[:max_tokens])

        result = classifier(model_input)
        if not result:
            continue
        # Normalise so that e.g. "skills" or "Work Experience" still match.
        label = str(result[0]['label']).upper().replace(' ', '_')

        # Add sentence to the corresponding section
        section = label_to_section.get(label)
        if section is not None:
            classified_data[section].append(sentence)

    return classified_data



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Classify the text extracted earlier with the BERT pipeline, then display the
# per-section result. `extracted_text` is the whitespace-collapsed string returned
# by extract_resume_data(), so it contains no line breaks.
classified_data = classify_sections(extracted_text, classifier)
classified_data

{'personal_information': [],
 'work_experience': [],
 'education': [],
 'skills': [],
 'certifications': []}