In [None]:
%pip install pdfplumber
%pip install python-docx
%pip install pytesseract
%pip install opencv-python
%pip install pillow
%pip install transformers
%pip install torch
%pip install request

In [19]:
import pdfplumber
import os
import cv2
import pytesseract
from PIL import Image
import docx
import re
import json

In [20]:
def pdf_cv(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    The file is opened with pdfplumber. Each page whose ``extract_text()``
    result is non-empty contributes that text followed by a newline; pages
    that yield ``None`` or an empty string are skipped.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        for current_page in document.pages:
            text = current_page.extract_text()
            if not text:
                # Nothing extractable on this page.
                continue
            chunks.append(text + '\n')
    return ''.join(chunks)

In [21]:
def docx_cv(docx_path):
    """Return the text of the .docx file at ``docx_path``.

    The document is opened with python-docx and the ``text`` of each entry
    in ``doc.paragraphs`` is joined with newline characters.
    """
    document = docx.Document(docx_path)
    lines = []
    for paragraph in document.paragraphs:
        lines.append(paragraph.text)
    return '\n'.join(lines)

In [39]:
import pytesseract

# Location of the Tesseract executable. Set the TESSERACT_CMD environment
# variable to override it; the fallback is the usual Windows install path.
# (`os` is imported in the notebook's import cell.)
TESSERACT_CMD = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

# Only point pytesseract at the path when the file actually exists. On
# machines without that install location pytesseract then keeps its built-in
# default command instead of a path that cannot work.
if os.path.isfile(TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

def image_cv(image_path):
    """Run OCR on an image file and return the recognised text.

    The image is loaded with OpenCV, converted to grayscale, and handed to
    Tesseract through pytesseract.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string returned by ``pytesseract.image_to_string``.

    Raises:
        ValueError: If OpenCV could not load the file as an image.
    """
    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface as an opaque assertion error inside cvtColor.
    if img is None:
        raise ValueError(f'Could not read image: {image_path}')
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(gray)

In [40]:
def process_cv(file_path):
    """Extract the text of a CV file, picking the reader by file extension.

    The extension is compared case-insensitively: ``.pdf`` goes to
    ``pdf_cv``, ``.docx`` to ``docx_cv``, and ``.png``/``.jpg``/``.jpeg`` to
    ``image_cv``.

    Returns:
        The extracted text, or the message ``'File not found!'`` when the
        path does not exist, or ``'file format not supported!'`` for any
        other extension.
    """
    if os.path.exists(file_path):
        suffix = os.path.splitext(file_path)[1].lower()
        if suffix in ('.png', '.jpg', '.jpeg'):
            return image_cv(file_path)
        if suffix == '.docx':
            return docx_cv(file_path)
        if suffix == '.pdf':
            return pdf_cv(file_path)
        return 'file format not supported!'
    return 'File not found!'

In [None]:
import requests

# Gemini API settings. They are read from the environment so a real key never
# has to be written into the notebook; the placeholder strings are used only
# when the variables are unset. The key is appended directly to the URL, so
# the URL value must end where the key belongs.
# (`os` is imported in the notebook's import cell.)
api_key = os.environ.get('GOOGLE_API_KEY', 'your google api key')
api_url = (os.environ.get('GOOGLE_API_URL', 'your google api URL') + api_key)

def _extract_json_text(txt):
    """Return the part of a model reply most likely to be the JSON object."""
    # Preferred form: a fenced code block, with or without the "json" label.
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", txt, re.S)
    if fenced:
        return fenced.group(1)
    # Otherwise take everything from the first '{' to the last '}', which
    # drops any prose the model wrapped around the object.
    start, end = txt.find('{'), txt.rfind('}')
    if start != -1 and end > start:
        return txt[start:end + 1]
    return txt


def parse_cv(cv_text, timeout=60):
    """Ask the Gemini API to turn raw CV text into a structured dict.

    Args:
        cv_text: Plain text extracted from a CV.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict parsed from the model's JSON reply. The keys 'Name',
        'Contact Information', 'Education', 'Work Experience' and
        'Skills & Certifications' are always present; any the model omitted
        are set to 'N/A'.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        ValueError: If the reply has no text candidate, is not valid JSON,
            or is valid JSON but not an object.
    """
    # The key names listed in the prompt must match the keys filled in below
    # (and read by the printing code), so they are spelled identically here.
    prompt = f"""
Extract structured details from the following CV text:

{cv_text}

Return the result as a JSON object with exactly these top-level keys:
- Name
- Contact Information (email, phone, LinkedIn)
- Education (list of degrees/institutions and years)
- Work Experience (list of job titles, companies, and durations)
- Skills & Certifications

If any field is missing, mark it as "N/A".
"""
    payload = {'contents': [{'parts': [{'text': prompt}]}]}
    headers = {'Content-Type': 'application/json'}
    # Without a timeout requests waits indefinitely on a stalled connection.
    resp = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    resp.raise_for_status()

    try:
        txt = resp.json()['candidates'][0]['content']['parts'][0]['text']
    except (KeyError, IndexError, TypeError) as exc:
        raise ValueError('Gemini response contained no text candidate') from exc

    # extracting response json
    raw = _extract_json_text(txt)

    # parsing json
    try:
        structured = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise ValueError('Failed to parse JSON from Gemini response:\n' + raw) from exc
    if not isinstance(structured, dict):
        # e.g. the model answered with a JSON array or a bare string.
        raise ValueError('Expected a JSON object from Gemini response, got:\n' + raw)

    # ensure all keys exist
    for k in ['Name', 'Contact Information', 'Education', 'Work Experience', 'Skills & Certifications']:
        structured.setdefault(k, 'N/A')
    return structured

In [None]:
def _format_entry(entry):
    """Render one list entry: dicts as 'field: value' pairs, anything else via str()."""
    if isinstance(entry, dict):
        return ", ".join(f"{field}: {val}" for field, val in entry.items())
    return str(entry)


def _print_entry_list(title, entries):
    """Print a titled section whose value is normally a list of entries."""
    print(f"{title}:")
    if isinstance(entries, list):
        for entry in entries:
            print("  • " + _format_entry(entry))
    else:
        # Typically the 'N/A' placeholder.
        print(f"  {entries}")
    print()


def print_output(data: dict):
    """Pretty-print a structured CV dict to standard output.

    Reads the keys 'Name', 'Contact Information', 'Education',
    'Work Experience' and 'Skills & Certifications'. Every key is optional,
    and each value may be the expected container (dict or list) or any other
    value such as the string 'N/A', which is printed as-is.

    Args:
        data: The structured CV information.
    """
    # name (looked up with .get like every other field so a missing key
    # does not abort the printout)
    name = data.get("Name", "N/A")
    print(f"\n=== {name} ===\n")

    # Contact
    ci = data.get("Contact Information", {})
    print("Contact Information:")
    if isinstance(ci, dict):
        for k, v in ci.items():
            print(f"  {k}: {v}")
    else:
        print(f"  {ci}")
    print()

    # education / work experience: list entries may be dicts or plain values
    _print_entry_list("Education", data.get("Education", []))
    _print_entry_list("Work Experience", data.get("Work Experience", []))

    # skills & Certifications
    sc = data.get("Skills & Certifications", {})
    print("Skills & Certifications:")
    if isinstance(sc, dict):
        for cat, items in sc.items():
            print(f"  {cat}:")
            if isinstance(items, list):
                # str() each item so non-string values do not break join().
                print("    " + ", ".join(str(item) for item in items))
            else:
                print(f"    {items}")
    elif isinstance(sc, list):
        # A flat list of skills is shown the same way as a category's items.
        print("  " + ", ".join(str(item) for item in sc))
    else:
        print(f"  {sc}")
    print()

# main block: read a CV path, extract its text, parse it, print the result
if __name__ == "__main__":
    cv_path = input("Enter CV file path: ").strip()
    extracted = process_cv(cv_path)
    # process_cv reports problems by returning one of these two messages
    # instead of CV text; they must not be forwarded to the API.
    if extracted in ('File not found!', 'file format not supported!'):
        print(extracted)
    elif not extracted.strip():
        # e.g. a scanned PDF with no text layer: nothing to send for parsing.
        print('No text could be extracted from the file.')
    else:
        structured = parse_cv(extracted)
        print_output(structured)