> # Extracting Text from PDF 
***
In this section we extract a range of information from the resumes stored in Azure Data Lake Gen2,
then structure the data into a JSON file and store it back in a container in Azure Data Lake Gen2.
The dependencies needed for this notebook:
- **`azure-ai-documentintelligence==1.0.0b4`**
- **`azure-storage-blob`**
- **`azure-ai-formrecognizer`**
- **`groq`**
- **`python-dotenv`**


**01. Mounting the Containers from Azure Data Lake Gen2**

In [0]:
def _mount_if_absent(source, mount_point, config_key):
    """Mount `source` at `mount_point` unless that mount point already exists.

    `dbutils.fs.mount` raises when the mount point is already in use, which
    makes this cell fail every time the notebook is re-run on a cluster where
    the containers are still mounted. Checking `dbutils.fs.mounts()` first
    makes the cell safe to run repeatedly.

    Args:
        source: URI of the Data Lake container to mount.
        mount_point: DBFS path the container should be mounted at.
        config_key: Name of the Spark/Hadoop config entry that carries the
            storage secret (the key of `extra_configs`).
    """
    if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
        # An existing mount is left untouched, even if it points elsewhere.
        print(f"{mount_point} is already mounted; skipping")
        return

    dbutils.fs.mount(
        source=source,
        mount_point=mount_point,
        extra_configs={config_key: dbutils.secrets.get('keyvaultScope', 'DataLakeSecretKey')},
    )


# The source URI, mount point and config key are intentionally left blank in
# this notebook; fill them in for each container before running the cell.
_mount_if_absent(source='', mount_point='', config_key='')
_mount_if_absent(source='', mount_point='', config_key='')

**02. Importing the necessary libraries**

In [0]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.storage.blob import BlobServiceClient

from groq import Groq

from dotenv import load_dotenv
import json
import re
import os

# Load the variables defined in the local `.env` file into the process
# environment so the `os.getenv` lookups in the next cell can find them.
load_dotenv()

**03. Load the secret keys from the `.env` file**

In [0]:
# Credentials come from the process environment; each name is None when the
# corresponding variable is not set.
#   DI_ENDPOINT / DI_KEY -> passed to resume_ocr as the OCR endpoint and key
#   API_KEY              -> passed to extract_information as the Groq API key
DI_ENDPOINT, DI_KEY, API_KEY = (
    os.environ.get(name) for name in ("DI_ENDPOINT", "DI_KEY", "API_KEY")
)

**04. Optical Character Recognition (OCR) on the resumes**


In [0]:
def resume_ocr(file_path, endpoint, key):
    """Analyze one document with Azure Document Intelligence and return its text.

    Args:
        file_path: Path of the document to analyze; it is opened in binary mode.
        endpoint: Endpoint URL of the Document Intelligence resource.
        key: API key for that resource.

    Returns:
        The `content` attribute of the analysis result produced by the
        "prebuilt-document" model.

    Raises:
        OSError: If `file_path` cannot be opened.
        Any exception raised by the Azure SDK while submitting or polling
        the analysis is propagated unchanged.
    """
    # A new client is built on every call; the `with` block closes it (and the
    # HTTP transport it owns) as soon as the analysis is done, instead of
    # leaving one open client behind per processed resume.
    with DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    ) as document_analysis_client, open(file_path, "rb") as document:
        poller = document_analysis_client.begin_analyze_document("prebuilt-document", document)
        # Block until the long-running analysis operation has finished.
        result = poller.result()

    return result.content

**05. Extracting the needed information from the text**

In [0]:
# Matches either a complete JSON string literal or a bare Python-style `None`.
# Consuming string literals first lets the substitution skip over any "None"
# that is part of a quoted value.
_STRING_OR_NONE_RE = re.compile(r'"(?:\\.|[^"\\])*"|\bNone\b')


def _none_to_null(json_like: str) -> str:
    """Replace bare `None` tokens with `null`, leaving quoted strings untouched."""
    return _STRING_OR_NONE_RE.sub(
        lambda match: "null" if match.group(0) == "None" else match.group(0),
        json_like,
    )


def _parse_json_payload(raw_output: str) -> dict:
    """Pull the outermost `{...}` span out of the model output and parse it."""
    start_index = raw_output.find("{")
    end_index = raw_output.rfind("}")
    if start_index == -1 or end_index < start_index:
        # Same exception type json.loads raised here before, but with a
        # message that says what actually went wrong.
        raise json.JSONDecodeError("No JSON object found in model output", raw_output, 0)

    json_string = raw_output[start_index:end_index + 1]
    try:
        return json.loads(json_string)
    except json.JSONDecodeError:
        # The prompt asks for `None` on missing fields, so the model may emit
        # Python-style None. Only bare tokens are rewritten; a value such as
        # "None of the above" is kept verbatim.
        return json.loads(_none_to_null(json_string))


def extract_information(API_KEY, text: str) -> dict:
    """Ask the Groq-hosted LLM to extract structured fields from resume text.

    Args:
        API_KEY: Groq API key used to build the client.
        text: Plain text of the resume (here, the OCR output).

    Returns:
        The JSON object found in the model's answer, parsed into a dict.

    Raises:
        json.JSONDecodeError: If the answer contains no JSON object or the
            object cannot be parsed (a subclass of ValueError).
        Any exception raised by the Groq client is propagated unchanged.
    """
    client = Groq(api_key=API_KEY)
    completion = client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[
            {
                "role": "user",
                "content": f"""
                Extract the following information in a JSON format from this resume:
                {text}
                Information to extract: name, position, email, phone, address, linkedin, summary, skills, education, experience, projects, certifications, languages. 
                If any of these information are not found, replace them with None
                        """
            },
            {
                "role": "assistant",
                "content": ""
            }
        ],
        temperature=1,
        max_tokens=1024,
        top_p=1,
        stream=True,
        stop=None,
    )

    # With stream=True the answer arrives in chunks; a chunk's delta content
    # may be None, hence the `or ""`.
    full_output = "".join(chunk.choices[0].delta.content or "" for chunk in completion)

    return _parse_json_payload(full_output)


**06. Applying the functions to the existing PDF resumes in the `resumecontainer` container in Gen2**

In [0]:
# Collects one parsed record per resume that was processed successfully.
resumes = []

for file_info in dbutils.fs.ls(''):
    # dbutils reports "dbfs:/..." URIs, while resume_ocr opens the file with
    # the built-in open(), so translate to the local "/dbfs/..." path.
    file_path = "/dbfs" + file_info.path.replace('dbfs:/', '/')

    try:
        print(f"Processing file: {file_info.name}")
        ocr_result = resume_ocr(file_path, DI_ENDPOINT, DI_KEY)
        resume = extract_information(API_KEY,ocr_result)
    except Exception as e:
        # Best effort: report the failure and carry on with the next file.
        print(f"Error processing {file_info.name}: {e}")
    else:
        resumes.append(resume)


**07. Save the JSON file containing the cleaned data to the `ocr-data-container` container in Gen2**

In [0]:
# Destination of the JSON file; left blank in this notebook, fill in before running.
output_file_path = ""
try:
    # Resumes routinely contain non-ASCII text (accented names, other
    # languages). Writing UTF-8 explicitly keeps the output independent of the
    # platform's default encoding, and ensure_ascii=False stores those
    # characters as-is instead of as \uXXXX escapes.
    with open(output_file_path, "w", encoding="utf-8") as json_file:
        json.dump(resumes, json_file, indent=4, ensure_ascii=False)
    print(f"Resumes saved to {output_file_path}")
except Exception as e:
    # Best effort: report the failure rather than aborting the notebook.
    print(f"Error saving JSON file: {e}")