In [1]:
import os, json

In [2]:
# Location of the JSON file that is loaded in the next cell; its "rec_texts"
# entry is later used as the OCR input. The path is relative, so it resolves
# against the kernel's current working directory.
path = "backend/tmp/8qibnwee_res.json"

In [3]:
# Read the JSON file into `data`. The encoding is pinned to UTF-8 so that
# non-ASCII text decodes the same way on every platform instead of depending
# on the locale's default encoding.
with open(path, "r", encoding="utf-8") as fh:
    data = json.load(fh)

In [4]:
# Pull the recognised text out of the loaded JSON. Fail fast when the key is
# missing or empty: otherwise "None" / "[]" would be formatted into the prompt
# and sent to the model as if it were resume text.
ocr_data = data.get("rec_texts")
if not ocr_data:
    raise ValueError('OCR result has no "rec_texts" entries; nothing to extract')

In [5]:
from pydantic import BaseModel
from typing import List, Optional

# Schema for the structured resume data requested from the model (it is passed
# as `response_schema` in the generate_content call). Every scalar field is
# optional and defaults to None; every list field defaults to an empty list,
# so a resume that lacks a section still validates.
class ResumeData(BaseModel):
    # Personal Information
    full_name: Optional[str] = None
    current_position: Optional[str] = None
    
    # Contact Information
    email: Optional[str] = None
    phone: Optional[str] = None
    linkedin: Optional[str] = None
    github: Optional[str] = None
    address: Optional[str] = None
    
    # Professional Summary
    professional_summary: Optional[str] = None
    
    # Work Experience (list of plain strings, one string per entry)
    work_experience: List[str]  = []
    
    # Education (list of plain strings, one string per entry)
    education: List[str] = []
    
    # Skills
    technical_skills: List[str] = []
    soft_skills: List[str] = []
    
    # Additional Sections
    certifications: List[str] = []
    projects: List[str] = []
    languages: List[str] = []
    achievements: List[str] = []

In [6]:
# Prompt template for the extraction request. `{ocr_data}` is its only
# placeholder and is filled with str.format, so any literal curly braces added
# to this text in the future must be doubled ({{ and }}).
RESUME_EXTRACTION_PROMPT = """
You are a resume information extraction specialist. You will receive a list of strings from OCR (Optical Character Recognition) processing of a resume document.

IMPORTANT: OCR data often contains errors such as:
- Words concatenated without spaces (e.g., "scalableAWSinfrastructuresupportingLLMoperationsthrough")
- Missing spaces between words, sentences, or sections
- Words cut off or split incorrectly
- Inconsistent formatting and spacing
- Some text may be garbled or incomplete

Your task is to extract structured information from this imperfect OCR data and return a JSON object that matches the provided schema.

INSTRUCTIONS:
1. Carefully read through all OCR text lines to understand the resume structure
2. Use context clues to separate concatenated words and fix spacing issues
3. Extract information even if it's imperfect - do your best to interpret the meaning
4. For work experience, try to identify job titles, company names, dates, and responsibilities
5. Look for education information including degrees and institutions
6. Extract technical skills, programming languages, tools, and technologies mentioned
7. If information is unclear or missing, set the field to null or empty list as appropriate
8. Return only valid JSON - no additional text or explanations

OCR Data to process:
{ocr_data}

Return the extracted information as a JSON object:
"""

In [7]:
# Build the prompt sent to the model by substituting the OCR lines into the
# template's single `{ocr_data}` placeholder.
final_prompt = RESUME_EXTRACTION_PROMPT.format_map({"ocr_data": ocr_data})

In [8]:
from google import genai
from google.genai import types

# The client gets the API key from the environment variable `GEMINI_API_KEY`;
# no key is passed here so that no secret lives in the notebook source.
# NOTE(review): an API key used to be hardcoded in this cell. Treat it as
# leaked and revoke/rotate it, since it remains in the file's history.
client = genai.Client()

# Request a JSON reply constrained to the ResumeData schema, so the text of
# the response can be parsed directly as JSON.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=final_prompt,
    config={
        "response_mime_type": "application/json",
        "response_schema": ResumeData,
    }
)
print(response.text)

{
  "full_name": "Larissa Pereira",
  "current_position": "Frontend Developer at Codewave",
  "email": "larissa.pereira@email.com",
  "phone": "(11)98748-1868",
  "linkedin": null,
  "github": null,
  "address": null,
  "professional_summary": "Creative Frontend Developer specializing in React and responsive design with a passion for clean UI.",
  "work_experience": [
    "Frontend Developer - InovaData (2020-2021) - Developed and maintained systems. - Collaborated with teams. - Delivered projects on time.",
    "Frontend Developer - SecureX (2021-2022) - Developed and maintained systems. - Collaborated with teams. - Delivered projects on time."
  ],
  "education": [
    "Bachelor of Computer Science - Federal Institute of Technology (2016 - 2020)"
  ],
  "technical_skills": [
    "HTML",
    "CSS",
    "JavaScript",
    "React",
    "Tailwind CSS",
    "Figma"
  ],
  "soft_skills": [],
  "certifications": [],
  "projects": [],
  "languages": [],
  "achievements": []
}


In [17]:
# Decode the model's JSON reply into a plain dict. `response.text` may be None
# when the response carries no text (for example an empty or blocked reply);
# check for that explicitly rather than letting json.loads raise an opaque
# TypeError.
if response.text is None:
    raise ValueError("Model returned no text; cannot parse resume JSON")
res = json.loads(response.text)

In [24]:
import re
import unicodedata
import uuid  # imported here so the fallback works on a fresh kernel run top to bottom


def _name_to_file_stem(full_name):
    """Turn a person's name into a lowercase ASCII alphanumeric file stem.

    Accented letters are transliterated to their base letter (NFKD
    decomposition, then dropping the combining marks), so "João" becomes
    "joao" rather than "joo". Spaces, punctuation and any remaining
    non-ASCII characters are removed.

    Args:
        full_name: The name string, or None.

    Returns:
        The sanitised stem, or "" when `full_name` is None/empty or contains
        no ASCII letters or digits after transliteration.
    """
    if not full_name:
        return ""
    decomposed = unicodedata.normalize("NFKD", full_name)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    # Keep only lowercase letters and digits; this also strips spaces.
    return re.sub(r'[^a-z0-9]', '', ascii_only.lower())


# Use the extracted name when it yields a usable stem; otherwise (no name, or
# a name that sanitises to nothing) fall back to a random UUID so the file
# name is never empty.
file_name = _name_to_file_stem(res["full_name"]) or str(uuid.uuid4())

In [25]:
# Display the derived file name to check it by eye.
file_name

'larissapereira'

In [20]:
import uuid

In [18]:
# Display the parsed extraction result (a plain dict) for inspection.
res

{'full_name': 'Larissa Pereira',
 'current_position': 'Frontend Developer at Codewave',
 'email': 'larissa.pereira@email.com',
 'phone': '(11)98748-1868',
 'linkedin': None,
 'github': None,
 'address': None,
 'professional_summary': 'Creative Frontend Developer specializing in React and responsive design with a passion for clean UI.',
 'work_experience': ['Frontend Developer - InovaData (2020-2021) - Developed and maintained systems. - Collaborated with teams. - Delivered projects on time.',
  'Frontend Developer - SecureX (2021-2022) - Developed and maintained systems. - Collaborated with teams. - Delivered projects on time.'],
 'education': ['Bachelor of Computer Science - Federal Institute of Technology (2016 - 2020)'],
 'technical_skills': ['HTML',
  'CSS',
  'JavaScript',
  'React',
  'Tailwind CSS',
  'Figma'],
 'soft_skills': [],
 'certifications': [],
 'projects': [],
 'languages': [],
 'achievements': []}