In [None]:
!pip install easyocr
!pip install boto3
!pip install transformers
!pip install pdf2image
!pip install langdetect

In [2]:
import easyocr
from pdf2image import convert_from_path
import os
import numpy as np
import boto3
import json
import re
from botocore.exceptions import ClientError
from langdetect.lang_detect_exception import LangDetectException
from langdetect import detect 

In [3]:
def ocr_doc(path, langs=None):
    """
    Run EasyOCR over an image or PDF file and return the recognised text.

    Args:
        path (str or os.PathLike): File to read. A name ending in ``.pdf``
            (any letter case) is converted to images with ``convert_from_path``
            and every image is OCR'd; any other file is passed to EasyOCR as is.
        langs (list[str], optional): Language codes given to ``easyocr.Reader``.
            When omitted, the module-level ``ocr_langs`` variable is used if it
            exists (the way this function was configured originally),
            otherwise ``['en']``.

    Returns:
        str: The recognised text fragments joined with newlines, stripped of
        leading/trailing whitespace. Empty string when nothing was recognised.
    """
    if langs is None:
        # Backward compatibility: callers used to set a global `ocr_langs`
        # before calling; fall back to English if it was never defined.
        langs = globals().get('ocr_langs', ['en'])

    path = os.fspath(path)
    reader = easyocr.Reader(langs)

    # Case-insensitive check so that "scan.PDF" is not sent down the image path.
    if path.lower().endswith('.pdf'):
        ocr_result = []
        for page in convert_from_path(path):
            ocr_result.extend(reader.readtext(np.array(page)))
    else:
        ocr_result = reader.readtext(path)

    # Index 1 of each result entry is the text that gets collected.
    return '\n'.join(res[1] for res in ocr_result).strip()

In [4]:
# Instruction prompt for resume extraction; the resume text is appended after the last line.
# Plain (non-f) string: nothing is interpolated, so the JSON braces need no escaping.
# NOTE(review): the instructions ask for "Not Available" when a field is missing while the
# schema allows null for the URL fields, and item 10 asks for project descriptions and
# durations while the schema types "projects" as a list of strings - confirm the intent.
cv_prompt_template = """
You are an expert at extracting structured information from unstructured text.
Do not make up any information.
Extract the following information from the resume text below:
1. Full Name
2. Email Address
3. Phone Number
4. Location
5. Skills (list of skills)
6. Work Experience (list of job titles and companies)
7. Education (list of degrees and institutions)
8. Certifications (list of certifications)
9. Languages (list of languages spoken)
10. Projects (list of notable projects with descriptions and duration)
11. Summary (a brief summary of the candidate)
12. LinkedIn Profile (URL of the LinkedIn profile)
13. GitHub Profile (URL of the GitHub profile)
14. Portfolio Website (URL of the portfolio website)
15. References (list of references if available)
Format the output as a JSON object with the above fields. If any information is not available, use "Not Available" as the value.
The JSON format must adhere to the following structure:
{
  "full_name": "string",
  "email_address": "string",
  "phone_number": "string",
  "location": "string",
  "summary": "string",
  "linkedin_profile": "URL string or null",
  "github_profile": "URL string or null",
  "portfolio_website": "URL string or null",
  "skills": ["string", "string", "..."],
  "work_experience": [
    {
      "job_title": "string",
      "company": "string",
      "duration": "string",
      "description": "string"
    }
  ],
  "education": [
    {
      "degree": "string",
      "institution": "string",
      "graduation_year": "string"
    }
  ],
  "certifications": ["string", "string", "..."],
  "languages": ["string", "string", "..."],
  "projects": ["string", "string", "..."],
  "references": ["string", "string", "..."]
}
Here is the resume text:
"""

# Instruction prompt for job-description extraction; the job text is appended after the last line.
# Plain (non-f) string: nothing is interpolated, so the JSON braces need no escaping.
# NOTE(review): the instructions ask for "Not Available" when a field is missing while the
# schema allows null for "company_website" - confirm which one the consumer expects.
job_prompt_template = """
You are an expert at extracting structured information from unstructured text.
Do not make up any information.
Extract the following information from the job description text below:
1. Job Title
2. Company Name
3. Location
4. Job Type (e.g., full-time, part-time, remote)
5. Salary Range
6. Responsibilities (list of key responsibilities)
7. Requirements (list of required skills, education, and experience)
8. Benefits (list of benefits offered)
9. Application Instructions (how to apply)
10. Contact Information (email or phone if available)
11. Company Website (URL if available)
12. Posting Date (date the job was posted if available)
13. Required Certifications (list if any)
14. Preferred Languages (list if any)
Format the output as a JSON object with the above fields. If any information is not available, use "Not Available" as the value.
The JSON format must adhere to the following structure:
{
  "job_title": "string",
  "company_name": "string",
  "location": "string",
  "job_type": "string",
  "salary_range": "string",
  "company_website": "URL string or null",
  "posting_date": "string",
  "contact_information": "string",
  "responsibilities": ["string", "string", "..."],
  "requirements": ["string", "string", "..."],
  "benefits": ["string", "string", "..."],
  "application_instructions": "string",
  "required_certifications": ["string", "string", "..."],
  "preferred_languages": ["string", "string", "..."]
}
Here is the job description text:
"""

In [13]:
def _parse_model_json(response_text):
    """Parse a model reply as JSON, tolerating Markdown fences and surrounding prose."""
    cleaned = response_text.strip()
    # Drop an opening ``` / ```json fence and any trailing backticks.
    cleaned = re.sub(r"^```(?:json)?", "", cleaned, flags=re.IGNORECASE).strip()
    cleaned = re.sub(r"`+$", "", cleaned).strip()

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    # The model may wrap the object in prose; retry on the outermost {...} span.
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if 0 <= start < end:
        try:
            return json.loads(cleaned[start:end + 1])
        except json.JSONDecodeError:
            pass

    return {"error": "Failed to parse JSON", "raw_output": cleaned}


def extract_info_with_bedrock(raw_txt, prompt_template, model_arn=None, region_name='ap-southeast-1',
                              max_tokens=2000):
    """
    Generic helper that extracts structured information from text with AWS Bedrock.

    Args:
        raw_txt (str): Input text (CV, job description, etc.).
        prompt_template (str): Prompt template for the CV or the job description;
            the input text is appended to it after a newline.
        model_arn (str): ARN of a Bedrock inference profile, or a Bedrock modelId.
        region_name (str): AWS region where Bedrock is deployed.
        max_tokens (int): Value sent as ``maxTokens`` in the Converse
            ``inferenceConfig``. Raise it if long documents produce cut-off JSON.

    Returns:
        The JSON value parsed from the model reply (normally a dict). On failure
        a dict with an ``"error"`` key is returned instead of raising:
        missing model, blank input, Bedrock call failure, or unparseable reply
        (in which case ``"raw_output"`` holds the cleaned reply text).
    """
    if not model_arn:
        return {"error": "Model ARN is required"}

    # Nothing to extract from; avoid a paid call whose answer could only be invented.
    if not raw_txt or not raw_txt.strip():
        return {"error": "Input text is empty"}

    # Append the raw text to the prompt
    prompt = prompt_template + "\n" + raw_txt

    client = boto3.client('bedrock-runtime', region_name=region_name)

    try:
        response = client.converse(
            modelId=model_arn,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            inferenceConfig={"maxTokens": max_tokens, "temperature": 0.1, "topP": 0.9}
        )

        # Text of the first content block of the reply
        response_text = response["output"]["message"]["content"][0]["text"]
        return _parse_model_json(response_text)

    except Exception as e:
        # Deliberately broad: this helper reports every failure as an error dict.
        return {"error": f"Bedrock invocation failed: {str(e)}"}


In [21]:
def translate_with_bedrock(raw_txt, model_arn=None, region_name='ap-southeast-1', max_tokens=2000):
    """
    Translate text into English with AWS Bedrock.

    Args:
        raw_txt (str): Input text to translate.
        model_arn (str): ARN of a Bedrock inference profile, or a Bedrock modelId.
        region_name (str): AWS region where Bedrock is deployed.
        max_tokens (int): Value sent as ``maxTokens`` in the Converse
            ``inferenceConfig``. Raise it if long texts come back cut off.

    Returns:
        str: The stripped text of the model reply on success. Blank input is
        returned unchanged without calling Bedrock.
        dict: ``{"error": ...}`` when no model is given or the call fails, so
        callers must check the type of the result.
    """
    if not model_arn:
        return {"error": "Model ARN is required"}

    # Nothing to translate; skip the call instead of letting the model improvise.
    if not raw_txt or not raw_txt.strip():
        return raw_txt

    prompt = f'''
    Translate the following text to English:\n{raw_txt}\n
    Do not make up any information.
    '''

    client = boto3.client('bedrock-runtime', region_name=region_name)

    try:
        response = client.converse(
            modelId=model_arn,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            inferenceConfig={"maxTokens": max_tokens, "temperature": 0.1, "topP": 0.9}
        )

        # Text of the first content block of the reply
        return response["output"]["message"]["content"][0]["text"].strip()

    except Exception as e:
        # Deliberately broad: every failure is reported as an error dict.
        return {"error": f"Bedrock invocation failed: {str(e)}"}

In [15]:
def detect_and_translate(raw_txt, model_arn, region_name='ap-southeast-1'):
    """
    Return `raw_txt` in English, translating it with Bedrock when necessary.

    Args:
        raw_txt (str): Text whose language is detected with ``langdetect.detect``.
        model_arn (str): Passed through to ``translate_with_bedrock``.
        region_name (str): Passed through to ``translate_with_bedrock``.

    Returns:
        str: The translation when the detected language is not ``'en'`` and the
        translation succeeded; otherwise `raw_txt` unchanged (already English,
        language detection failed, or translation failed).
    """
    import warnings  # local import keeps this cell independent of the import cell

    try:
        lang = detect(raw_txt)
    except LangDetectException:
        # Language could not be determined: pass the text through.
        return raw_txt

    if lang == 'en':
        return raw_txt

    translated = translate_with_bedrock(raw_txt, model_arn=model_arn, region_name=region_name)
    if not isinstance(translated, str):
        # translate_with_bedrock reports failures as an {"error": ...} dict; do not
        # hand that to callers that expect text.
        warnings.warn(f"Translation failed, using the original text: {translated}")
        return raw_txt
    return translated


In [16]:
# Bedrock inference profile handed to the helper functions as `model_arn`.
# Set the BEDROCK_MODEL_ARN environment variable to use another account, region or model
# without editing the notebook; the previous hard-coded value remains the default.
model_arn = os.environ.get(
    "BEDROCK_MODEL_ARN",
    "arn:aws:bedrock:ap-southeast-1:677020944766:inference-profile/apac.amazon.nova-lite-v1:0",
)

In [19]:
# OCR the sample CV image, extract structured fields with Bedrock, and save them as JSON.
cv_path = "../data/cv/test.jpg"
ocr_langs = ['en']  # read by ocr_doc() as a global, so it must be set before the call
cv_txt = ocr_doc(cv_path)
cv_info = extract_info_with_bedrock(cv_txt, cv_prompt_template, model_arn=model_arn, region_name='ap-southeast-1')
# Explicit UTF-8 and ensure_ascii=False keep non-ASCII characters readable in the file
# instead of \uXXXX escapes, independent of the platform's default encoding.
with open("cv_info.json", "w", encoding="utf-8") as f:
    json.dump(cv_info, f, indent=4, ensure_ascii=False)



In [20]:
# OCR the sample job-description PDF, extract structured fields with Bedrock, and save them as JSON.
# NOTE: ocr_doc() reads the global `ocr_langs`, which is assigned in the CV cell; run that cell first.
jd_path = "../data/cv/sample-job-description.pdf"
jd_text = ocr_doc(jd_path)
jd_info = extract_info_with_bedrock(jd_text, job_prompt_template, model_arn=model_arn)
# Explicit UTF-8 and ensure_ascii=False keep non-ASCII characters readable in the file
# instead of \uXXXX escapes, independent of the platform's default encoding.
with open("jd_info.json", "w", encoding="utf-8") as f:
    json.dump(jd_info, f, indent=4, ensure_ascii=False)

