In [None]:
!pip install easyocr
!pip install boto3
!pip install transformers
!pip install pdf2image

In [67]:
import easyocr
from pdf2image import convert_from_path
import os
import numpy as np
import boto3
import json
import re
from botocore.exceptions import ClientError

In [74]:
def ocr_doc(pdf_path, langs=None):
    """Run EasyOCR over a PDF or image file and return the recognized text.

    Parameters
    ----------
    pdf_path : str or os.PathLike
        Path to the document. A path ending in ``.pdf`` (any letter case) is
        rasterized with ``convert_from_path`` and every page is OCR'd; any
        other path is handed to EasyOCR directly as an image file.
    langs : list of str, optional
        Language codes passed to ``easyocr.Reader``. When omitted, the
        notebook-level global ``ocr_langs`` is used, so existing calls of the
        form ``ocr_doc(path)`` keep working.

    Returns
    -------
    str
        The text of every detection, one per line, in the order EasyOCR
        returned them, with leading/trailing whitespace stripped.
    """
    doc_path = os.fspath(pdf_path)
    reader = easyocr.Reader(ocr_langs if langs is None else langs)

    if doc_path.lower().endswith('.pdf'):
        detections = []
        for page in convert_from_path(doc_path):
            # readtext is given each rendered page as a numpy array.
            detections.extend(reader.readtext(np.array(page)))
    else:
        # Bug fix: this branch used to read an undefined name `path`, which
        # raised NameError on a fresh kernel (or silently OCR'd a stale global).
        detections = reader.readtext(doc_path)

    # Index 1 of each detection is the recognized text
    # (EasyOCR's default output is (bbox, text, confidence)).
    return '\n'.join(det[1] for det in detections).strip()
    

In [119]:
def extract_cv_info_with_bedrock(
    raw_txt,
    model_id="arn:aws:bedrock:ap-southeast-1:677020944766:inference-profile/apac.amazon.nova-lite-v1:0",
    region_name="ap-southeast-1",
):
    """Extract structured resume fields from raw text via the Bedrock Converse API.

    Parameters
    ----------
    raw_txt : str
        Unstructured resume text (for example the output of ``ocr_doc``). It is
        interpolated at the end of the extraction prompt.
    model_id : str, optional
        Model or inference-profile identifier passed to ``converse`` as
        ``modelId``. Defaults to the identifier this notebook was written with.
    region_name : str, optional
        AWS region used to create the ``bedrock-runtime`` client.

    Returns
    -------
    dict
        The JSON object parsed from the model's reply. On failure a dict with a
        single ``"error"`` key is returned instead of raising:
        ``{"error": "Failed to parse JSON"}`` when the reply is not valid JSON,
        or ``{"error": "Bedrock invocation failed: ..."}`` for any other error.
    """
    prompt = f"""
You are an expert at extracting structured information from unstructured text.
Do not make up any information.
Extract the following information from the resume text below:
1. Full Name
2. Email Address
3. Phone Number
4. Location
5. Skills (list of skills)
6. Work Experience (list of job titles and companies)
7. Education (list of degrees and institutions)
8. Certifications (list of certifications)
9. Languages (list of languages spoken)
10. Projects (list of notable projects with descriptions and duration)
11. Summary (a brief summary of the candidate)
12. LinkedIn Profile (URL of the LinkedIn profile)
13. GitHub Profile (URL of the GitHub profile)
14. Portfolio Website (URL of the portfolio website)
15. References (list of references if available)
Format the output as a JSON object with the above fields. If any information is not available, use "Not Available" as the value.
The JSON format must adhere to the following structure:
{{
  "full_name": "string",
  "email_address": "string",
  "phone_number": "string",
  "location": "string",
  "summary": "string",
  "linkedin_profile": "URL string or null",
  "github_profile": "URL string or null",
  "portfolio_website": "URL string or null",
  "skills": ["string", "string", "..."],
  "work_experience": [
    {{
      "job_title": "string",
      "company": "string",
      "duration": "string",
      "description": "string"
    }}
  ],
  "education": [
    {{
      "degree": "string",
      "institution": "string",
      "graduation_year": "string"
    }}
  ],
  "certifications": ["string", "string", "..."],
  "languages": ["string", "string", "..."],
  "projects": ["string", "string", "..."],
  "references": ["string", "string", "..."]
}}
Here is the resume text: {raw_txt}
"""
    try:
        # The client is created inside the try block so that client-creation
        # errors are reported through the same error dict as API failures.
        client = boto3.client('bedrock-runtime', region_name=region_name)

        # Call the Converse API.
        response = client.converse(
            modelId=model_id,
            messages=[
                {
                    "role": "user",
                    "content": [{"text": prompt}]
                }
            ],
            inferenceConfig={
                "maxTokens": 2000,
                "temperature": 0.1,
                "topP": 0.9
            }
        )

        # Pull the text of the first content item out of the reply.
        response_text = response["output"]["message"]["content"][0]["text"].strip()

        # Remove an optional Markdown code fence around the JSON. The patterns
        # are anchored to the very start/end of the reply, so a "```json"
        # that appears inside a JSON string value is left untouched, and a bare
        # "```" opening fence is handled as well.
        response_text = re.sub(r"^`{3,}(?:json)?\s*", "", response_text, flags=re.IGNORECASE)
        response_text = re.sub(r"\s*`{3,}$", "", response_text)

        # Parse the JSON payload from the reply.
        try:
            extracted_json = json.loads(response_text)
        except json.JSONDecodeError:
            extracted_json = {"error": "Failed to parse JSON"}

        return extracted_json

    except Exception as e:
        return {"error": f"Bedrock invocation failed: {str(e)}"}
    

In [125]:
def extract_job_desc_info_with_bedrock(
    raw_txt,
    model_id="arn:aws:bedrock:ap-southeast-1:677020944766:inference-profile/apac.amazon.nova-lite-v1:0",
    region_name="ap-southeast-1",
):
    """Extract structured job-description fields from raw text via the Bedrock Converse API.

    Parameters
    ----------
    raw_txt : str
        Unstructured job-description text (for example the output of
        ``ocr_doc``). It is interpolated at the end of the extraction prompt.
    model_id : str, optional
        Model or inference-profile identifier passed to ``converse`` as
        ``modelId``. Defaults to the identifier this notebook was written with.
    region_name : str, optional
        AWS region used to create the ``bedrock-runtime`` client.

    Returns
    -------
    dict
        The JSON object parsed from the model's reply. On failure a dict with a
        single ``"error"`` key is returned instead of raising:
        ``{"error": "Failed to parse JSON"}`` when the reply is not valid JSON,
        or ``{"error": "Bedrock invocation failed: ..."}`` for any other error.
    """
    prompt = f"""
You are an expert at extracting structured information from unstructured text.
Do not make up any information.
Extract the following information from the job description text below:
1. Job Title
2. Company Name
3. Location
4. Job Type (e.g., full-time, part-time, remote)
5. Salary Range
6. Responsibilities (list of key responsibilities)
7. Requirements (list of required skills, education, and experience)
8. Benefits (list of benefits offered)
9. Application Instructions (how to apply)
10. Contact Information (email or phone if available)
11. Company Website (URL if available)
12. Posting Date (date the job was posted if available)
13. Required Certifications (list if any)
14. Preferred Languages (list if any)
Format the output as a JSON object with the above fields. If any information is not available, use "Not Available" as the value.
The JSON format must adhere to the following structure:
{{
  "job_title": "string",
  "company_name": "string",
  "location": "string",
  "job_type": "string",
  "salary_range": "string",
  "company_website": "URL string or null",
  "posting_date": "string",
  "contact_information": "string",
  "responsibilities": ["string", "string", "..."],
  "requirements": ["string", "string", "..."],
  "benefits": ["string", "string", "..."],
  "application_instructions": "string",
  "required_certifications": ["string", "string", "..."],
  "preferred_languages": ["string", "string", "..."]
}}
Here is the job description text: {raw_txt}
"""
    try:
        # The client is created inside the try block so that client-creation
        # errors are reported through the same error dict as API failures.
        client = boto3.client('bedrock-runtime', region_name=region_name)

        # Call the Converse API.
        response = client.converse(
            modelId=model_id,
            messages=[
                {
                    "role": "user",
                    "content": [{"text": prompt}]
                }
            ],
            inferenceConfig={
                "maxTokens": 2000,
                "temperature": 0.1,
                "topP": 0.9
            }
        )

        # Pull the text of the first content item out of the reply.
        response_text = response["output"]["message"]["content"][0]["text"].strip()

        # Remove an optional Markdown code fence around the JSON. The patterns
        # are anchored to the very start/end of the reply, so a "```json"
        # that appears inside a JSON string value is left untouched, and a bare
        # "```" opening fence is handled as well.
        response_text = re.sub(r"^`{3,}(?:json)?\s*", "", response_text, flags=re.IGNORECASE)
        response_text = re.sub(r"\s*`{3,}$", "", response_text)

        # Parse the JSON payload from the reply.
        try:
            extracted_json = json.loads(response_text)
        except json.JSONDecodeError:
            extracted_json = {"error": "Failed to parse JSON"}

        return extracted_json

    except Exception as e:
        return {"error": f"Bedrock invocation failed: {str(e)}"}

In [118]:
# Sample CV image to OCR (relative path).
cv_path = "../data/cv/test.jpg"
# ocr_doc() reads this global to pick the EasyOCR languages, so it must be
# assigned before the call below.
ocr_langs = ['en']
# Raw OCR text of the CV; consumed by the CV extraction cell.
raw_txt = ocr_doc(cv_path)



In [120]:
# Send the current OCR text in `raw_txt` to Bedrock and print the resulting dict
# (either the parsed CV fields or a dict with an "error" key).
res = extract_cv_info_with_bedrock(raw_txt)
print(res)

{'full_name': 'Remy Bertrand', 'email_address': 'JosephFavreau@gmail.com', 'phone_number': '0485435365', 'location': '15, boulevard Admiral Courbet, 69600 OULUNS, Country', 'summary': 'Dynamic manager with more than X years experience in sales and business development, negotiation, account management. Proven ability to achieve sales targets and significantly increase revenue. Results-oriented, motivated, and focused on customer satisfaction.', 'linkedin_profile': 'Not Available', 'github_profile': 'Not Available', 'portfolio_website': 'Not Available', 'skills': ['Sales management', 'Business development', 'Commercial negotiating', 'Account management', 'Customer service', 'Market analysis', 'Sales forecasting', 'Contract negotiation', 'Team leadership', 'Sales reporting', 'Performance analysis'], 'work_experience': [{'job_title': 'Sales Manager', 'company': 'Company ABC', 'duration': 'January 20XX - present', 'description': 'Lead team of X sales reps, develop and implement sales strate

In [128]:
# Sample job-description PDF to OCR (relative path).
jd_path = "../data/cv/sample-job-description.pdf"
# ocr_doc() reads this global to pick the EasyOCR languages.
ocr_langs = ['en']
# NOTE: this rebinds `raw_txt`, replacing whatever text it held before this
# cell ran; cells that use `raw_txt` see whichever OCR cell ran last.
raw_txt = ocr_doc(jd_path)
print(raw_txt.strip())



Sample Job Description
Job Title:
Human Resources Assistant
Job Description:
This position reports to the Human Resources (HR) director and
interfaces with company managers and HR staff: Company XYZ is
committed to an employee-orientated, high performance culture that
emphasizes empowerment; quality, continuous improvement, and the
recruitment and ongoing development of a superior workforce.
The intern will gain exposure
to these functional areas:
HR Information Systems; Employee relations; Training and development;
Benefits; Compensation; Organization development; Employment
Specific responsibilities:
Employee orientation and training logistics and recordkeeping
Company-wide committee facilitation and participation
Employee safety, welfare, wellness and health reporting
Provide direct support to employees during implementation of HR
services, policies and programs
What skills will the
intern learn:
Active participation in strategic planning process, including
developing goals, objecti

In [129]:
# Send the current OCR text in `raw_txt` to Bedrock and print the resulting dict
# (either the parsed job-description fields or a dict with an "error" key).
res = extract_job_desc_info_with_bedrock(raw_txt)
print(res)

{'job_title': 'Human Resources Assistant', 'company_name': 'Company XYZ', 'location': 'Not Available', 'job_type': 'Not Available', 'salary_range': 'Not Available', 'company_website': 'Not Available', 'posting_date': 'Not Available', 'contact_information': 'Not Available', 'responsibilities': ['Employee orientation and training logistics and recordkeeping', 'Company-wide committee facilitation and participation', 'Employee safety, welfare, wellness and health reporting', 'Provide direct support to employees during implementation of HR services, policies and programs'], 'requirements': ['Proficient with Microsoft Word and Excel', 'General knowledge of employment law and practices', 'Able to maintain a high level of confidentiality', 'Effective oral and written management communication skills'], 'benefits': ['Not Available'], 'application_instructions': 'Not Available', 'required_certifications': ['Not Available'], 'preferred_languages': ['Not Available']}
