In [2]:
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers.image_utils import load_image


In [4]:
img_path = "20240625_154117.jpg"
image = load_image(img_path)

images = [image]

In [5]:
import requests
import torch
from PIL import Image
from io import BytesIO


# Create inputs
message = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "what is written in the image?"},
        ]
    }
]

prompts = [processor.apply_chat_template([m], add_generation_prompt=True) for m in message]


In [6]:

inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device)

# Generate
generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

OCR_text = generated_texts[0] 

print(generated_texts[0])





User:



what is written in the image?
Assistant: Professor Rajaai Cherkaoui El Moursli member of Hassan II Academy of Science and Technology . <ocr> University Mohammed V Universite Faculte des Sciences Rabat Professor Rajaai Cherkaoui El Moursli Member of Hassan II Academy of Science and Technology Avenue Ibn Batouta. BP 1014 . Agdal.Rabat Tel : + 212 (0) 5 37 77 18 34/35/38 r.cherkaoui@academiesciences.ma Fax : + 212 (0) 5 37 77 42 61 Gsm : +212 661 47 11 85 / 06 62 07 94 00 Site web : www.fsr.ac.ma scholar.um5.ac.ma/rajaa.cherkaoui E-mail : rajaa.cherkaoui@um5.ac.ma r.cherkaoui@academiesciences.ma Gsm : +212 661 47 11 


In [14]:
OCR_text = OCR_text.split("Assistant")[1]
OCR_text

': Professor Rajaai Cherkaoui El Moursli member of Hassan II Academy of Science and Technology . <ocr> University Mohammed V Universite Faculte des Sciences Rabat Professor Rajaai Cherkaoui El Moursli Member of Hassan II Academy of Science and Technology Avenue Ibn Batouta. BP 1014 . Agdal.Rabat Tel : + 212 (0) 5 37 77 18 34/35/38 r.cherkaoui@academiesciences.ma Fax : + 212 (0) 5 37 77 42 61 Gsm : +212 661 47 11 85 / 06 62 07 94 00 Site web : www.fsr.ac.ma scholar.um5.ac.ma/rajaa.cherkaoui E-mail : rajaa.cherkaoui@um5.ac.ma r.cherkaoui@academiesciences.ma Gsm : +212 661 47 11 '

In [7]:


try:
    import groq
    from groq import Groq
except:
    !uv pip install groq

In [8]:
GROQ_API_KEY = "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"

class LLM:

    def __init__(self) -> None:
        
        self.llm  = Groq(
                    api_key=GROQ_API_KEY,
                    )   
        self.response_count : int = 0
        
        self.response_limit : int = 100

    def get_response(self  , prompt : str , ) -> str:
        
        if self.response_count <= self.response_limit:
            
            response  = self.llm.chat.completions.create(
                        messages=[
                            {
                                "role": "user",
                                "content": f"{prompt}",
                            }
                        ],
                        model="llama-3.3-70b-versatile",
                    )
            
            self.response_count += 1
        
        else : response = "rate limit exceedee"
        
        
        return response
        
llm = LLM()

    

In [12]:
structure_prompt = """{
  "type": "person",              //"person" or "organization"
  "name": "Full Name or Organization Name",
  "title": "Job Title or Role",  // e.g., "Professor", "CEO"
  "affiliations": [
    {
      "institution": "Institution Name",
      "department": "Department Name",  // Optional
      "role": "Position/Role",
      "start_date": "YYYY-MM-DD",      // Optional
      "end_date": "YYYY-MM-DD"         // Optional
    }
  ],
  "contact_details": {
    "addresses": [
      {
        "type": "work",               // "work", "home", "branch"
        "street": "123 Main St",
        "city": "City",
        "state": "State/Province",     // Optional
        "postal_code": "12345",
        "country": "Country"
      }
    ],
    "phones": [
      {
        "type": "work",               // "work", "mobile", "fax"
        "number": "+1234567890",
        "extension": "123"            // Optional
      }
    ],
    "emails": [
      {
        "type": "work",               // "work", "personal"
        "address": "name@domain.com"
      }
    ],
    "websites": [
      {
        "type": "official",           // "official", "portfolio", "social"
        "url": "https://..."
      }
    ],
    "social_media": [                 // Optional
      {
        "platform": "linkedin",       // "twitter", "github", etc.
        "url": "https://linkedin.com/..."
      }
    ]
  },
  "tags": ["physics", "academia"],   // Optional: For categorization
  "metadata": {
    "source": "ocr",                 // "manual", "api", "csv_import"
    "confidence": 0.95,              // Optional: OCR confidence score
    "created_at": "YYYY-MM-DD",
    "last_updated": "YYYY-MM-DD"
  },
  "notes": "Additional context..."   // Optional
}
"""



In [10]:
structure = {
  "type": "person",              #"person" or "organization"
  "name": "Full Name or Organization Name",
  "title": "Job Title or Role",  # e.g., "Professor", "CEO"
  "photo_url": "https://...",    # Optional: Link to image/avatar
  "affiliations": [
    {
      "institution": "Institution Name",
      "department": "Department Name",  # Optional
      "role": "Position/Role",
      "start_date": "YYYY-MM-DD",      # Optional
      "end_date": "YYYY-MM-DD"         # Optional
    }
  ],
  "contact_details": {
    "addresses": [
      {
        "type": "work",               # "work", "home", "branch"
        "street": "123 Main St",
        "city": "City",
        "state": "State/Province",     # Optional
        "postal_code": "12345",
        "country": "Country"
      }
    ],
    "phones": [
      {
        "type": "work",               # "work", "mobile", "fax"
        "number": "+1234567890",
        "extension": "123"           # Optional
      }
    ],
    "emails": [
      {
        "type": "work",              # "work", "personal"
        "address": "name@domain.com"
      }
    ],
    "websites": [
      {
        "type": "official",           # "official", "portfolio", "social"
        "url": "https://..."
      }
    ],
    "social_media": [                 # Optional
      {
        "platform": "linkedin",       # "twitter", "github", etc.
        "url": "https://linkedin.com/..."
      }
    ]
  },
  "tags": ["physics", "academia"],   # Optional: For categorization
  "metadata": {
    "source": "ocr",                 # "manual", "api", "csv_import"
    "confidence": 0.95,              # Optional: OCR confidence score
    "created_at": "YYYY-MM-DD",
    "last_updated": "YYYY-MM-DD"
  },
  "notes": "Additional context..."   # Optional
}




In [64]:
prompt = f""" Here's your **LLM system prompt** version for an OCR-to-Structured Contact Card Parsing agent. This is formatted to guide the LLM's behavior and outputs consistently:

---

**System Prompt: OCR-to-Structured Contact Card Parser**

You are an expert information extraction agent. Your task is to **convert OCR-scanned text into a structured contact card** object by extracting and categorizing all visible entities. Follow the exact field definitions and formatting instructions strictly.

---

### **Task**

Parse the given OCR text into a JSON object representing a structured contact card.

---

### **Field Definitions**

* **name** (`string`):
  Full name of the person or organization.
  Format: `"FirstName LastName"` or `"Organization Name"`.
  Example: `"Rajaai Cherkaoui El Moursli"`.

* **title** (`string | null`):
  Job title or role (e.g., `"Professor"`, `"CEO"`). Omit if not clearly present.

* **type** (`string`):
  `"person"` or `"organization"` depending on the subject of the contact card.

* **affiliations** (`list of objects`):
  Each affiliation must include:

  * `institution` (`string`)
  * `department` (`string | null`)
  * `role` (`string | null`)

* **contact\_details** (`object`):

  * `addresses` (`list of objects`):
    Each object must include:

    * `type` (`string`): `"work"`, `"home"`, or `"branch"`
    * `street` (`string`)
    * `city` (`string`)
    * `country` (`string`)
    * `postal_code` (`string | null`)

  * `phones` (`list of objects`):
    Each object must include:

    * `type` (`string`): `"work"`, `"mobile"`, or `"fax"`
    * `number` (`string`): Include country code (e.g., `"+212 661 47 11 85"`)
    * `extension` (`string | null`)

  * `emails` (`list of objects`):
    Each object must include:

    * `type` (`string`): `"work"` or `"personal"`
    * `address` (`string`): Must be valid email format

  * `websites` (`list of objects`):
    Each object must include:

    * `type` (`string`): `"official"`, `"portfolio"`, or `"social"`
    * `url` (`string`): Full URL (e.g., `"https://www.fsr.ac.ma"`)

* **tags** (`list of strings | null`):
  Domain categories, such as `["academia", "physics" , etc]`. Omit if unclear.

* **metadata** (`object`):

  * `source` (`string`): `"ocr"`, `"manual"`, or `"api"`
  * `confidence` (`float | null`): OCR confidence score between `0.0` and `1.0`

---

### **Instructions**

* **Extract only from the provided OCR text**. Never invent or infer data.
* **Normalize** noisy formats (e.g., `"Tel:" → "+212..."`).
* **Prioritize** official contact details if multiple are present.
* **Validate** data: Omit entries that are ambiguous or unreadable.
* **Strict typing**: All data must match exactly the expected JSON schema.
* **Omit** any field that has no visible or validated data.

---

### **Output Format**

Respond with a **single JSON object** following the structure and types above. Do not include explanatory text, markdown, or commentary — just valid, formatted JSON.


OCR CONTACT INFORMATION = {OCR_text}

STRUCTURED_JSON : 
"""

  """


In [22]:
response = llm.get_response(prompt)

response.choices

[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='```json\n{\n  "name": "Rajaai Cherkaoui El Moursli",\n  "title": "Professor",\n  "type": "person",\n  "affiliations": [\n    {\n      "institution": "Hassan II Academy of Science and Technology",\n      "department": null,\n      "role": "Member"\n    },\n    {\n      "institution": "University Mohammed V",\n      "department": "Faculte des Sciences",\n      "role": "Professor"\n    }\n  ],\n  "contact_details": {\n    "addresses": [\n      {\n        "type": "work",\n        "street": "Avenue Ibn Batouta",\n        "city": "Rabat",\n        "country": null,\n        "postal_code": "BP 1014"\n      }\n    ],\n    "phones": [\n      {\n        "type": "work",\n        "number": "+212 537 77 18 34",\n        "extension": null\n      },\n      {\n        "type": "work",\n        "number": "+212 537 77 18 35",\n        "extension": null\n      },\n      {\n        "type": "work",\n        "number":

In [24]:
output = response.choices[0].message.content

In [29]:
cleaned = output.replace("\n" , "")
cleaned = cleaned.replace("json" , "")
cleaned = cleaned.replace("```" , "")

In [30]:
cleaned 

'{  "name": "Rajaai Cherkaoui El Moursli",  "title": "Professor",  "type": "person",  "affiliations": [    {      "institution": "Hassan II Academy of Science and Technology",      "department": null,      "role": "Member"    },    {      "institution": "University Mohammed V",      "department": "Faculte des Sciences",      "role": "Professor"    }  ],  "contact_details": {    "addresses": [      {        "type": "work",        "street": "Avenue Ibn Batouta",        "city": "Rabat",        "country": null,        "postal_code": "BP 1014"      }    ],    "phones": [      {        "type": "work",        "number": "+212 537 77 18 34",        "extension": null      },      {        "type": "work",        "number": "+212 537 77 18 35",        "extension": null      },      {        "type": "work",        "number": "+212 537 77 18 38",        "extension": null      },      {        "type": "fax",        "number": "+212 537 77 42 61",        "extension": null      },      {        "type": "m

In [31]:
import json
from typing import Dict, Any

def json_string_to_dict(json_str: str) -> Dict[str, Any]:
    """Converts a JSON string to a Python dictionary with validation."""
    try:
        # Convert JSON string to dictionary
        contact_dict = json.loads(json_str)
        
        # Validate required fields
        required_fields = ['name', 'type', 'contact_details']
        for field in required_fields:
            if field not in contact_dict:
                raise ValueError(f"Missing required field: {field}")
        
        # Validate type
        if contact_dict['type'] not in ['person', 'organization']:
            raise ValueError("Type must be either 'person' or 'organization'")
            
        return contact_dict
    
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format: {str(e)}")

contact_dict = json_string_to_dict(cleaned)
print(contact_dict)

{'name': 'Rajaai Cherkaoui El Moursli', 'title': 'Professor', 'type': 'person', 'affiliations': [{'institution': 'Hassan II Academy of Science and Technology', 'department': None, 'role': 'Member'}, {'institution': 'University Mohammed V', 'department': 'Faculte des Sciences', 'role': 'Professor'}], 'contact_details': {'addresses': [{'type': 'work', 'street': 'Avenue Ibn Batouta', 'city': 'Rabat', 'country': None, 'postal_code': 'BP 1014'}], 'phones': [{'type': 'work', 'number': '+212 537 77 18 34', 'extension': None}, {'type': 'work', 'number': '+212 537 77 18 35', 'extension': None}, {'type': 'work', 'number': '+212 537 77 18 38', 'extension': None}, {'type': 'fax', 'number': '+212 537 77 42 61', 'extension': None}, {'type': 'mobile', 'number': '+212 661 47 11 85', 'extension': None}, {'type': 'mobile', 'number': '+212 662 07 94 00', 'extension': None}], 'emails': [{'type': 'work', 'address': 'r.cherkaoui@academiesciences.ma'}, {'type': 'work', 'address': 'rajaa.cherkaoui@um5.ac.ma'}

In [32]:
import pprint

In [34]:
pprint.pprint(contact_dict)

{'affiliations': [{'department': None,
                   'institution': 'Hassan II Academy of Science and Technology',
                   'role': 'Member'},
                  {'department': 'Faculte des Sciences',
                   'institution': 'University Mohammed V',
                   'role': 'Professor'}],
 'contact_details': {'addresses': [{'city': 'Rabat',
                                    'country': None,
                                    'postal_code': 'BP 1014',
                                    'street': 'Avenue Ibn Batouta',
                                    'type': 'work'}],
                     'emails': [{'address': 'r.cherkaoui@academiesciences.ma',
                                 'type': 'work'},
                                {'address': 'rajaa.cherkaoui@um5.ac.ma',
                                 'type': 'work'}],
                     'phones': [{'extension': None,
                                 'number': '+212 537 77 18 34',
                         

In [59]:
import time 

def generate_metadata(image_path):
    metadata = {}
    
    metadata["creation_time"] = datetime.datetime.now().isoformat()
    metadata["source"] = image_path

    return metadata
    

In [60]:
metadata = generate_metadata(img_path)

In [62]:
contact_dict['metadata'].update(metadata)

In [63]:
pprint.pprint(contact_dict)

{'affiliations': [{'department': None,
                   'institution': 'Hassan II Academy of Science and Technology',
                   'role': 'Member'},
                  {'department': 'Faculte des Sciences',
                   'institution': 'University Mohammed V',
                   'role': 'Professor'}],
 'contact_details': {'addresses': [{'city': 'Rabat',
                                    'country': None,
                                    'postal_code': 'BP 1014',
                                    'street': 'Avenue Ibn Batouta',
                                    'type': 'work'}],
                     'emails': [{'address': 'r.cherkaoui@academiesciences.ma',
                                 'type': 'work'},
                                {'address': 'rajaa.cherkaoui@um5.ac.ma',
                                 'type': 'work'}],
                     'phones': [{'extension': None,
                                 'number': '+212 537 77 18 34',
                         

## adding background info

In [65]:
background_info = {

    'company_details' : None,
    'individual_detail' :None
    
}