### Practical Prompting Examples #2

Sample document taken from the JohnSnowLabs `pdf-deid-dataset` repository: [PDF_Deid_Deidentification_Hard_0.pdf](https://github.com/JohnSnowLabs/pdf-deid-dataset/blob/main/PDF_Original/Hard/PDF_Deid_Deidentification_Hard_0.pdf)

In [None]:
# Instruction text handed to the extraction model via `.setPrompt(...)`.
# The schema below is kept as strictly valid JSON (no trailing comma after the
# last key) so the model is shown a well-formed object to imitate.
prompt = """ You are an information extraction system.

Your task: Extract patient information according to the following schema.

{
    "Patient Name": "string | null",
    "Date of Birth": "string | null",
    "Social Security Number": "string | null",
    "Encounter Participant": "string | null"
}
"""

In [None]:
from pyspark.ml import PipelineModel
import pyspark.sql.functions as f
from sparkocr.transformers import *
from sparkocr.enums import *
from sparkocr.utils import display_images
from sparkocr.dataextraction.visual_prescriptions_recognition import VisualPrescriptionsRecognition

# Stage 1: PDF -> page images. With keepInput disabled the stage's input
# column is not carried into the output (the recorded `show()` output of
# `image_df` has no raw-content column).
pdf_to_img = PdfToImage().setKeepInput(False)

# Stage 2: prompt-driven extraction over each page image, writing the model's
# answer to the "text" column.
ocr = (
    VisualPrescriptionsRecognition()
    .setInputCol("image")
    .setOutputCol("text")
    .setKeepInput(False)
    .setPrompt(prompt)  # `prompt` is already a str; an f-string wrapper adds nothing
    .setMaxNewTokens(4096)  # presumably the generation-length cap -- confirm in the Spark OCR docs
)

# NOTE(review): the notebook's introductory markdown links to
# "PDF_Deid_Deidentification_Hard_0.pdf", but the file loaded here is the
# "Medium_0" sample -- confirm which document is intended.
image_path = "dbfs:/FileStore/pdfs/PDF_Deid_Deidentification_Medium_0.pdf"

# Read the PDF as a single binary row; `spark` is the session provided by the
# notebook environment.
pdf_df = spark.read.format("binaryFile").load(image_path)


In [None]:
# Convert the loaded PDF into image rows, keep at most 8 of them, and cache
# the result so later cells reuse it instead of re-running the conversion.
image_df = (
    pdf_to_img
    .transform(pdf_df)
    .limit(8)
    .cache()
)
image_df.show()

+--------------------+-------------------+------+--------------------+-----------+---------+-------+-----------+
|                path|   modificationTime|length|               image|total_pages|exception|pagenum|documentnum|
+--------------------+-------------------+------+--------------------+-----------+---------+-------+-----------+
|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436|{dbfs:/FileStore/...|          4|         |      1|          0|
|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436|{dbfs:/FileStore/...|          4|         |      0|          0|
|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436|{dbfs:/FileStore/...|          4|         |      3|          0|
|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436|{dbfs:/FileStore/...|          4|         |      2|          0|
+--------------------+-------------------+------+--------------------+-----------+---------+-------+-----------+



In [None]:
# Run the prompt-driven extraction over every page image and cache the output
# so the inspection cells below do not trigger the model again.
# In the recorded run, the row for pagenum 3 came back with an empty `text`.
result = (
    ocr
    .transform(image_df)
    .cache()
)
result.show()

+--------------------+-------------------+------+-----------+---------+-------+-----------+--------------------+
|                path|   modificationTime|length|total_pages|exception|pagenum|documentnum|                text|
+--------------------+-------------------+------+-----------+---------+-------+-----------+--------------------+
|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436|          4|         |      1|          0|{'Patient Name': ...|
|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436|          4|         |      0|          0|{'Patient Name': ...|
|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436|          4|         |      3|          0|                    |
|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436|          4|         |      2|          0|{'Patient Name': ...|
+--------------------+-------------------+------+-----------+---------+-------+-----------+--------------------+



In [None]:
# Print only the extracted text, untruncated, so each page's full answer is visible.
(
    result
    .select("text")
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                                                                                                |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{'Patient Name': 'Susan Frances Martin', 'Date of Birth': '09/03/1