In [1]:
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from transformers.image_utils import load_image


In [3]:
img_path = "20240625_154117.jpg"
image = load_image(img_path)

images = [image]

In [4]:
import requests
import torch
from PIL import Image
from io import BytesIO


# Create inputs
message = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "what is written in the image?"},
        ]
    }
]

prompts = [processor.apply_chat_template([m], add_generation_prompt=True) for m in message]


In [5]:

inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device)

# Generate
generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

OCR_text = generated_texts[0] 

print(generated_texts[0])





User:



what is written in the image?
Assistant: Professor Rajaai Cherkaoui El Moursli member of Hassan II Academy of Science and Technology . <ocr> University Mohammed V Universite Faculte des Sciences Rabat Professor Rajaai Cherkaoui El Moursli Member of Hassan II Academy of Science and Technology Avenue Ibn Batouta. BP 1014 . Agdal.Rabat Tel : + 212 (0) 5 37 77 18 34/35/38 r.cherkaoui@academiesciences.ma Fax : + 212 (0) 5 37 77 42 61 Gsm : +212 661 47 11 85 / 06 62 07 94 00 Site web : www.fsr.ac.ma scholar.um5.ac.ma/rajaa.cherkaoui E-mail : rajaa.cherkaoui@um5.ac.ma r.cherkaoui@academiesciences.ma Gsm : +212 661 47 11 


In [6]:
OCR_text = OCR_text.split("Assistant")[1]
OCR_text

': Professor Rajaai Cherkaoui El Moursli member of Hassan II Academy of Science and Technology . <ocr> University Mohammed V Universite Faculte des Sciences Rabat Professor Rajaai Cherkaoui El Moursli Member of Hassan II Academy of Science and Technology Avenue Ibn Batouta. BP 1014 . Agdal.Rabat Tel : + 212 (0) 5 37 77 18 34/35/38 r.cherkaoui@academiesciences.ma Fax : + 212 (0) 5 37 77 42 61 Gsm : +212 661 47 11 85 / 06 62 07 94 00 Site web : www.fsr.ac.ma scholar.um5.ac.ma/rajaa.cherkaoui E-mail : rajaa.cherkaoui@um5.ac.ma r.cherkaoui@academiesciences.ma Gsm : +212 661 47 11 '

In [7]:


try:
    import groq
    from groq import Groq
except:
    !uv pip install groq

In [8]:
GROQ_API_KEY = "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"

class LLM:

    def __init__(self) -> None:
        
        self.llm  = Groq(
                    api_key=GROQ_API_KEY,
                    )   
        self.response_count : int = 0
        
        self.response_limit : int = 100

    def get_response(self  , prompt : str , ) -> str:
        
        if self.response_count <= self.response_limit:
            
            response  = self.llm.chat.completions.create(
                        messages=[
                            {
                                "role": "user",
                                "content": f"{prompt}",
                            }
                        ],
                        model="llama-3.3-70b-versatile",
                    )
            
            self.response_count += 1
        
        else : response = "rate limit exceedee"
        
        
        return response
        
llm = LLM()

    

In [9]:
structure_prompt = """{
  "type": "person",              //"person" or "organization"
  "name": "Full Name or Organization Name",
  "title": "Job Title or Role",  // e.g., "Professor", "CEO"
  "affiliations": [
    {
      "institution": "Institution Name",
      "department": "Department Name",  // Optional
      "role": "Position/Role",
      "start_date": "YYYY-MM-DD",      // Optional
      "end_date": "YYYY-MM-DD"         // Optional
    }
  ],
  "contact_details": {
    "addresses": [
      {
        "type": "work",               // "work", "home", "branch"
        "street": "123 Main St",
        "city": "City",
        "state": "State/Province",     // Optional
        "postal_code": "12345",
        "country": "Country"
      }
    ],
    "phones": [
      {
        "type": "work",               // "work", "mobile", "fax"
        "number": "+1234567890",
        "extension": "123"            // Optional
      }
    ],
    "emails": [
      {
        "type": "work",               // "work", "personal"
        "address": "name@domain.com"
      }
    ],
    "websites": [
      {
        "type": "official",           // "official", "portfolio", "social"
        "url": "https://..."
      }
    ],
    "social_media": [                 // Optional
      {
        "platform": "linkedin",       // "twitter", "github", etc.
        "url": "https://linkedin.com/..."
      }
    ]
  },
  "tags": ["physics", "academia"],   // Optional: For categorization
  "metadata": {
    "source": "ocr",                 // "manual", "api", "csv_import"
    "confidence": 0.95,              // Optional: OCR confidence score
    "created_at": "YYYY-MM-DD",
    "last_updated": "YYYY-MM-DD"
  },
  "notes": "Additional context..."   // Optional
}
"""



In [10]:
structure = {
  "type": "person",              #"person" or "organization"
  "name": "Full Name or Organization Name",
  "title": "Job Title or Role",  # e.g., "Professor", "CEO"
  "photo_url": "https://...",    # Optional: Link to image/avatar
  "affiliations": [
    {
      "institution": "Institution Name",
      "department": "Department Name",  # Optional
      "role": "Position/Role",
      "start_date": "YYYY-MM-DD",      # Optional
      "end_date": "YYYY-MM-DD"         # Optional
    }
  ],
  "contact_details": {
    "addresses": [
      {
        "type": "work",               # "work", "home", "branch"
        "street": "123 Main St",
        "city": "City",
        "state": "State/Province",     # Optional
        "postal_code": "12345",
        "country": "Country"
      }
    ],
    "phones": [
      {
        "type": "work",               # "work", "mobile", "fax"
        "number": "+1234567890",
        "extension": "123"           # Optional
      }
    ],
    "emails": [
      {
        "type": "work",              # "work", "personal"
        "address": "name@domain.com"
      }
    ],
    "websites": [
      {
        "type": "official",           # "official", "portfolio", "social"
        "url": "https://..."
      }
    ],
    "social_media": [                 # Optional
      {
        "platform": "linkedin",       # "twitter", "github", etc.
        "url": "https://linkedin.com/..."
      }
    ]
  },
  "tags": ["physics", "academia"],   # Optional: For categorization
  "metadata": {
    "source": "ocr",                 # "manual", "api", "csv_import"
    "confidence": 0.95,              # Optional: OCR confidence score
    "created_at": "YYYY-MM-DD",
    "last_updated": "YYYY-MM-DD"
  },
  "notes": "Additional context..."   # Optional
}




In [11]:
prompt = f""" Here's your **LLM system prompt** version for an OCR-to-Structured Contact Card Parsing agent. This is formatted to guide the LLM's behavior and outputs consistently:

---

**System Prompt: OCR-to-Structured Contact Card Parser**

You are an expert information extraction agent. Your task is to **convert OCR-scanned text into a structured contact card** object by extracting and categorizing all visible entities. Follow the exact field definitions and formatting instructions strictly.

---

### **Task**

Parse the given OCR text into a JSON object representing a structured contact card.

---

### **Field Definitions**

* **name** (`string`):
  Full name of the person or organization.
  Format: `"FirstName LastName"` or `"Organization Name"`.
  Example: `"Rajaai Cherkaoui El Moursli"`.

* **title** (`string | null`):
  Job title or role (e.g., `"Professor"`, `"CEO"`). Omit if not clearly present.

* **type** (`string`):
  `"person"` or `"organization"` depending on the subject of the contact card.

* **affiliations** (`list of objects`):
  Each affiliation must include:

  * `institution` (`string`)
  * `department` (`string | null`)
  * `role` (`string | null`)

* **contact\_details** (`object`):

  * `addresses` (`list of objects`):
    Each object must include:

    * `type` (`string`): `"work"`, `"home"`, or `"branch"`
    * `street` (`string`)
    * `city` (`string`)
    * `country` (`string`)
    * `postal_code` (`string | null`)

  * `phones` (`list of objects`):
    Each object must include:

    * `type` (`string`): `"work"`, `"mobile"`, or `"fax"`
    * `number` (`string`): Include country code (e.g., `"+212 661 47 11 85"`)
    * `extension` (`string | null`)

  * `emails` (`list of objects`):
    Each object must include:

    * `type` (`string`): `"work"` or `"personal"`
    * `address` (`string`): Must be valid email format

  * `websites` (`list of objects`):
    Each object must include:

    * `type` (`string`): `"official"`, `"portfolio"`, or `"social"`
    * `url` (`string`): Full URL (e.g., `"https://www.fsr.ac.ma"`)

* **tags** (`list of strings | null`):
  Domain categories, such as `["academia", "physics" , etc]`. Omit if unclear.

* **metadata** (`object`):

  * `source` (`string`): `"ocr"`, `"manual"`, or `"api"`
  * `confidence` (`float | null`): OCR confidence score between `0.0` and `1.0`

---

### **Instructions**

* **Extract only from the provided OCR text**. Never invent or infer data.
* **Normalize** noisy formats (e.g., `"Tel:" → "+212..."`).
* **Prioritize** official contact details if multiple are present.
* **Validate** data: Omit entries that are ambiguous or unreadable.
* **Strict typing**: All data must match exactly the expected JSON schema.
* **Omit** any field that has no visible or validated data.

---

### **Output Format**

Respond with a **single JSON object** following the structure and types above. Do not include explanatory text, markdown, or commentary — just valid, formatted JSON.


OCR CONTACT INFORMATION = {OCR_text}

STRUCTURED_JSON : 
"""

  """


In [12]:
response = llm.get_response(prompt)

response.choices

[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='```json\n{\n  "name": "Rajaai Cherkaoui El Moursli",\n  "title": "Professor",\n  "type": "person",\n  "affiliations": [\n    {\n      "institution": "Hassan II Academy of Science and Technology",\n      "department": null,\n      "role": "Member"\n    },\n    {\n      "institution": "University Mohammed V",\n      "department": "Faculte des Sciences Rabat",\n      "role": "Professor"\n    }\n  ],\n  "contact_details": {\n    "addresses": [\n      {\n        "type": "work",\n        "street": "Avenue Ibn Batouta",\n        "city": "Rabat",\n        "country": null,\n        "postal_code": "BP 1014"\n      }\n    ],\n    "phones": [\n      {\n        "type": "work",\n        "number": "+212 537 77 18 34",\n        "extension": null\n      },\n      {\n        "type": "work",\n        "number": "+212 537 77 18 35",\n        "extension": null\n      },\n      {\n        "type": "work",\n        "nu

In [13]:
output = response.choices[0].message.content

In [14]:
cleaned = output.replace("\n" , "")
cleaned = cleaned.replace("json" , "")
cleaned = cleaned.replace("```" , "")

In [15]:
cleaned 

'{  "name": "Rajaai Cherkaoui El Moursli",  "title": "Professor",  "type": "person",  "affiliations": [    {      "institution": "Hassan II Academy of Science and Technology",      "department": null,      "role": "Member"    },    {      "institution": "University Mohammed V",      "department": "Faculte des Sciences Rabat",      "role": "Professor"    }  ],  "contact_details": {    "addresses": [      {        "type": "work",        "street": "Avenue Ibn Batouta",        "city": "Rabat",        "country": null,        "postal_code": "BP 1014"      }    ],    "phones": [      {        "type": "work",        "number": "+212 537 77 18 34",        "extension": null      },      {        "type": "work",        "number": "+212 537 77 18 35",        "extension": null      },      {        "type": "work",        "number": "+212 537 77 18 38",        "extension": null      },      {        "type": "mobile",        "number": "+212 661 47 11 85",        "extension": null      },      {        "

In [16]:
import json
from typing import Dict, Any

def json_string_to_dict(json_str: str) -> Dict[str, Any]:
    """Converts a JSON string to a Python dictionary with validation."""
    try:
        # Convert JSON string to dictionary
        contact_dict = json.loads(json_str)
        
        # Validate required fields
        required_fields = ['name', 'type', 'contact_details']
        for field in required_fields:
            if field not in contact_dict:
                raise ValueError(f"Missing required field: {field}")
        
        # Validate type
        if contact_dict['type'] not in ['person', 'organization']:
            raise ValueError("Type must be either 'person' or 'organization'")
            
        return contact_dict
    
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format: {str(e)}")

contact_dict = json_string_to_dict(cleaned)
print(contact_dict)

{'name': 'Rajaai Cherkaoui El Moursli', 'title': 'Professor', 'type': 'person', 'affiliations': [{'institution': 'Hassan II Academy of Science and Technology', 'department': None, 'role': 'Member'}, {'institution': 'University Mohammed V', 'department': 'Faculte des Sciences Rabat', 'role': 'Professor'}], 'contact_details': {'addresses': [{'type': 'work', 'street': 'Avenue Ibn Batouta', 'city': 'Rabat', 'country': None, 'postal_code': 'BP 1014'}], 'phones': [{'type': 'work', 'number': '+212 537 77 18 34', 'extension': None}, {'type': 'work', 'number': '+212 537 77 18 35', 'extension': None}, {'type': 'work', 'number': '+212 537 77 18 38', 'extension': None}, {'type': 'mobile', 'number': '+212 661 47 11 85', 'extension': None}, {'type': 'mobile', 'number': '+212 662 07 94 00', 'extension': None}, {'type': 'fax', 'number': '+212 537 77 42 61', 'extension': None}], 'emails': [{'type': 'work', 'address': 'r.cherkaoui@academiesciences.ma'}, {'type': 'work', 'address': 'rajaa.cherkaoui@um5.a

In [17]:
import pprint

In [18]:
pprint.pprint(contact_dict)

{'affiliations': [{'department': None,
                   'institution': 'Hassan II Academy of Science and Technology',
                   'role': 'Member'},
                  {'department': 'Faculte des Sciences Rabat',
                   'institution': 'University Mohammed V',
                   'role': 'Professor'}],
 'contact_details': {'addresses': [{'city': 'Rabat',
                                    'country': None,
                                    'postal_code': 'BP 1014',
                                    'street': 'Avenue Ibn Batouta',
                                    'type': 'work'}],
                     'emails': [{'address': 'r.cherkaoui@academiesciences.ma',
                                 'type': 'work'},
                                {'address': 'rajaa.cherkaoui@um5.ac.ma',
                                 'type': 'work'}],
                     'phones': [{'extension': None,
                                 'number': '+212 537 77 18 34',
                   

In [19]:
import datetime 

def generate_metadata(image_path):
    metadata = {}
    
    metadata["creation_time"] = datetime.datetime.now().isoformat()
    metadata["source"] = image_path

    return metadata
    

In [20]:
metadata = generate_metadata(img_path)

In [21]:
contact_dict['metadata'].update(metadata)

In [22]:
pprint.pprint(contact_dict)

{'affiliations': [{'department': None,
                   'institution': 'Hassan II Academy of Science and Technology',
                   'role': 'Member'},
                  {'department': 'Faculte des Sciences Rabat',
                   'institution': 'University Mohammed V',
                   'role': 'Professor'}],
 'contact_details': {'addresses': [{'city': 'Rabat',
                                    'country': None,
                                    'postal_code': 'BP 1014',
                                    'street': 'Avenue Ibn Batouta',
                                    'type': 'work'}],
                     'emails': [{'address': 'r.cherkaoui@academiesciences.ma',
                                 'type': 'work'},
                                {'address': 'rajaa.cherkaoui@um5.ac.ma',
                                 'type': 'work'}],
                     'phones': [{'extension': None,
                                 'number': '+212 537 77 18 34',
                   

In [23]:
import requests
import torch
from PIL import Image
from io import BytesIO


# Create inputs
message = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "what is written in the image?"},
        ]
    }
]

prompts = [processor.apply_chat_template([m], add_generation_prompt=True) for m in message]


## adding background info

In [24]:
background_info = {

    'company_details' : None,
    'individual_detail' :None
    
}

In [25]:
import requests

url = "https://api.tavily.com/search"





def query(query = None):
    


    payload = {
        "query": query,
        "topic": "general",
        "search_depth": "basic",
        "chunks_per_source": 3,
        "max_results": 1,
        "time_range": None,
        "days": 7,
        "include_answer": True,
        "include_raw_content": True,
        "include_images": False,
        "include_image_descriptions": False,
        "include_domains": [],
        "exclude_domains": [],
        "country": None
    }
    headers = {
        "Authorization": "Bearer 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",
        "Content-Type": "application/json"
    }

    response = requests.request("POST", url, json=payload, headers=headers)

    print(response.text)

    return response



In [26]:

query("who is Rajaai Cherkaoui El Moursli from Hassan II Academy of Science and Technology")


{"query":"who is Rajaai Cherkaoui El Moursli from Hassan II Academy of Science and Technology","follow_up_questions":null,"answer":"Rajaai Cherkaoui El Moursli is a professor of physics and a member of the Hassan II Academy of Science and Technology. She earned her PhD in France and Thèse d'Etat in Morocco. She has held leadership roles in nuclear physics and academia.","images":[],"results":[{"title":"Cherkaoui El Moursli, Rajaa - TWAS","url":"https://twas.org/directory/cherkaoui-el-moursli-rajaa","content":"Cherkaoui El Moursli (PhD, 1982, France and Thèse d'Etat, 1990, Morocco) is Professor of Physics, Faculty of Science, University Mohammed V, Rabat. She was Director of the Nuclear Physics Laboratory and Vice President of the University. She is a member of: the Hassan II Academy of Sciences and Technology; the African Academy of Sciences","score":0.8969848,"raw_content":"![](https://www.facebook.com/tr?id=226353719903618&ev=PageView&noscript=1)\n![twas-logo](/themes/custom/twas/ima

<Response [200]>

In [27]:
individial_detail_query_prompt = f"""


You are a search query generation assistant. Your task is to generate a short and precise web search query to learn more about the following individual based on the provided structured data:

Data:{contact_dict}

Generate a web search query that would help a search engine find authoritative pages about this person (Wikipedia, university page, research profile, etc.).
use their name , affliation , some other details you deem necesasary for the engine to loook for this individual

Output format:
Search query: <query here>



"""

In [28]:
pprint.pprint(individial_detail_query_prompt)

('\n'
 '\n'
 '\n'
 'You are a search query generation assistant. Your task is to generate a '
 'short and precise web search query to learn more about the following '
 'individual based on the provided structured data:\n'
 '\n'
 "Data:{'name': 'Rajaai Cherkaoui El Moursli', 'title': 'Professor', 'type': "
 "'person', 'affiliations': [{'institution': 'Hassan II Academy of Science and "
 "Technology', 'department': None, 'role': 'Member'}, {'institution': "
 "'University Mohammed V', 'department': 'Faculte des Sciences Rabat', 'role': "
 "'Professor'}], 'contact_details': {'addresses': [{'type': 'work', 'street': "
 "'Avenue Ibn Batouta', 'city': 'Rabat', 'country': None, 'postal_code': 'BP "
 "1014'}], 'phones': [{'type': 'work', 'number': '+212 537 77 18 34', "
 "'extension': None}, {'type': 'work', 'number': '+212 537 77 18 35', "
 "'extension': None}, {'type': 'work', 'number': '+212 537 77 18 38', "
 "'extension': None}, {'type': 'mobile', 'number': '+212 661 47 11 85', "
 "'extensi

In [29]:
individual_query = llm.get_response(prompt= individial_detail_query_prompt)

In [30]:
individual_query = individual_query.choices[0].message.content

In [31]:
individual_details = query(query= individual_query)

{"query":"Search query: \"Rajaai Cherkaoui El Moursli\" Professor University Mohammed V Morocco","follow_up_questions":null,"answer":"Rajaai Cherkaoui El Moursli is a professor at University Mohammed V in Morocco. He holds a significant academic position there. His expertise is in social sciences.","images":[],"results":[],"response_time":3.11}


In [32]:
pprint.pprint(individual_details.text)

('{"query":"Search query: \\"Rajaai Cherkaoui El Moursli\\" Professor '
 'University Mohammed V Morocco","follow_up_questions":null,"answer":"Rajaai '
 'Cherkaoui El Moursli is a professor at University Mohammed V in Morocco. He '
 'holds a significant academic position there. His expertise is in social '
 'sciences.","images":[],"results":[],"response_time":3.11}')


In [33]:
individual_details.text

'{"query":"Search query: \\"Rajaai Cherkaoui El Moursli\\" Professor University Mohammed V Morocco","follow_up_questions":null,"answer":"Rajaai Cherkaoui El Moursli is a professor at University Mohammed V in Morocco. He holds a significant academic position there. His expertise is in social sciences.","images":[],"results":[],"response_time":3.11}'

In [34]:
response = json.loads(individual_details.text)
response.keys()

dict_keys(['query', 'follow_up_questions', 'answer', 'images', 'results', 'response_time'])

In [35]:
# response['results'][0]['content']

individual_detail_response = response['answer']

In [36]:
company_detail_query_prompt = f"""


You are a search query generation assistant. Your task is to generate a short and precise web search query to learn more about the following institution based on the provided structured data:

Data:{contact_dict}

Generate a web search query that would help a search engine find authoritative pages about this institution/company (Wikipedia, university page, research profile, etc.).
use the name given  , affliation , location , or some other details you deem necesasary for the engine 
add what the onstitution does , sells etc
Output format:
Search query: <query here>



"""

In [37]:
company_query = llm.get_response(prompt= company_detail_query_prompt)

In [38]:
company_query = company_query.choices[0].message.content
company_query

'Search query: "Hassan II Academy of Science and Technology" OR "University Mohammed V" Rabat professor Rajaai Cherkaoui El Moursli science research'

In [39]:
company_detail = query(query=company_query)

{"query":"Search query: \"Hassan II Academy of Science and Technology\" OR \"University Mohammed V\" Rabat professor Rajaai Cherkaoui El Moursli science research","follow_up_questions":null,"answer":"Professor Rajaai Cherkaoui El Moursli is a notable scientist at Hassan II Academy of Science and Technology and University Mohammed V in Rabat. His research focuses on advanced scientific fields. No specific retrieved data is available.","images":[],"results":[],"response_time":3.82}


In [40]:
response = json.loads(company_detail.text)

In [41]:
company_detail_response = response['answer']

In [42]:
company_detail_response

'Professor Rajaai Cherkaoui El Moursli is a notable scientist at Hassan II Academy of Science and Technology and University Mohammed V in Rabat. His research focuses on advanced scientific fields. No specific retrieved data is available.'

In [43]:
background_info = {

    'company_details' : None,
    'individual_detail' :None
    
}
background_info['company_details'] = company_detail_response
background_info['individual_detail'] = individual_detail_response

pprint.pprint(background_info)

{'company_details': 'Professor Rajaai Cherkaoui El Moursli is a notable '
                    'scientist at Hassan II Academy of Science and Technology '
                    'and University Mohammed V in Rabat. His research focuses '
                    'on advanced scientific fields. No specific retrieved data '
                    'is available.',
 'individual_detail': 'Rajaai Cherkaoui El Moursli is a professor at '
                      'University Mohammed V in Morocco. He holds a '
                      'significant academic position there. His expertise is '
                      'in social sciences.'}


In [44]:
contact_dict['background_info'] = background_info

In [45]:
contact_dict

{'name': 'Rajaai Cherkaoui El Moursli',
 'title': 'Professor',
 'type': 'person',
 'affiliations': [{'institution': 'Hassan II Academy of Science and Technology',
   'department': None,
   'role': 'Member'},
  {'institution': 'University Mohammed V',
   'department': 'Faculte des Sciences Rabat',
   'role': 'Professor'}],
 'contact_details': {'addresses': [{'type': 'work',
    'street': 'Avenue Ibn Batouta',
    'city': 'Rabat',
    'country': None,
    'postal_code': 'BP 1014'}],
  'phones': [{'type': 'work',
    'number': '+212 537 77 18 34',
    'extension': None},
   {'type': 'work', 'number': '+212 537 77 18 35', 'extension': None},
   {'type': 'work', 'number': '+212 537 77 18 38', 'extension': None},
   {'type': 'mobile', 'number': '+212 661 47 11 85', 'extension': None},
   {'type': 'mobile', 'number': '+212 662 07 94 00', 'extension': None},
   {'type': 'fax', 'number': '+212 537 77 42 61', 'extension': None}],
  'emails': [{'type': 'work', 'address': 'r.cherkaoui@academiescie

# RAG system


In [58]:
try:
    import langchain
    import langchain_community
    import sentence_transformers
    import faiss_cpu
except:
    !uv pip install langchain langchain-community sentence-transformers faiss-cpu

[2K[37m⠙[0m [2msentence-transformers==4.1.0                                                  [0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K[2mResolved [1m84 packages[0m [2min 422ms[0m[0m                                        [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m     0 B/29.84 MiB           [1A
[2K[1A[37m⠹[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 16.00 KiB/29.84 MiB         [1A
[2K[1A[37m⠹[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 16.00 KiB/29.84 MiB         [1A
[2K[1A[37m⠹[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 32.00 KiB/29.84 MiB         [1A
[2K[1A[37m⠹[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 48.00 KiB/29.84 MiB         [1A
[2K[1A[37m⠹[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 60.04 KiB/29.84 MiB         [1A
[2K[1A[37m⠹[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 76.04 KiB/29.84 MiB         [1A
[2K[1A[37m⠹[0m [2mPre

In [61]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
import json
import os

# 1. Embedder
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Load or Create FAISS store
if os.path.exists("faiss_index"):
    vectordb = FAISS.load_local("faiss_index", embedder)
else:
    dummy_doc = Document(page_content="dummy", metadata={"source": "init"})
    vectordb = FAISS.from_documents([dummy_doc], embedder)

# 3. Convert dict to Document
def dict_to_doc(d: dict):
    content = json.dumps(d, indent=2)  # Or extract only key parts if needed
    metadata = d.get("metadata", {})
    return Document(page_content=content, metadata=metadata)

# 4. Add one or many
def add_to_vectorstore(data_list):
    docs = [dict_to_doc(d) for d in data_list]
    vectordb.add_documents(docs)
    vectordb.save_local("faiss_index")  # Persist

# Example usage
  # the dict you posted
# add_to_vectorstore([your_dict])


In [62]:

data_list = [contact_dict]
add_to_vectorstore(data_list)

In [63]:
query = "Who is Rajaai Cherkaoui El Moursli?"
results = vectordb.similarity_search(query, k=3)  # `k` is number of top docs


In [64]:
results

[Document(id='1df1c198-fb86-4573-838d-c9e492321ada', metadata={'source': '20240625_154117.jpg', 'confidence': None, 'creation_time': '2025-06-15T21:33:57.833897'}, page_content='{\n  "name": "Rajaai Cherkaoui El Moursli",\n  "title": "Professor",\n  "type": "person",\n  "affiliations": [\n    {\n      "institution": "Hassan II Academy of Science and Technology",\n      "department": null,\n      "role": "Member"\n    },\n    {\n      "institution": "University Mohammed V",\n      "department": "Faculte des Sciences Rabat",\n      "role": "Professor"\n    }\n  ],\n  "contact_details": {\n    "addresses": [\n      {\n        "type": "work",\n        "street": "Avenue Ibn Batouta",\n        "city": "Rabat",\n        "country": null,\n        "postal_code": "BP 1014"\n      }\n    ],\n    "phones": [\n      {\n        "type": "work",\n        "number": "+212 537 77 18 34",\n        "extension": null\n      },\n      {\n        "type": "work",\n        "number": "+212 537 77 18 35",\n      