# Document Indexing Pipeline: OpenSearch, spaCy, and geopy


## Install Dependencies

In [12]:
# Install required packages
!pip install opensearch-py spacy dateparser geopy bs4 lxml tqdm

# Download spaCy English model
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 1.5 MB/s eta 0:00:09
     - -------------------------------------- 0.5/12.8 MB 1.5 MB/s eta 0:00:09
     --- ------------------------------------ 1.0/12.8 MB 1.2 MB/s eta 0:00:11
     --- ------------------------------------ 1.0/12.8 MB 1.2 MB/s eta 0:00:11
     ---- ----------------------------------- 1.3/12.8 MB 1.1 MB/s eta 0:00:11
     ---- ----------------------------------- 1.6/12.8 MB 1.2 MB/s eta 0:00:10
     ----- ---------------------------------- 1.8/12.8 MB 1.1 MB/s eta 0:00:10
     ------ --------------------------------- 2.1/12.8 

## Connect to OpenSearch

In [21]:
# Open a client session against the locally running OpenSearch cluster
from opensearchpy import OpenSearch

# Plain HTTP on localhost:9200: SSL and certificate verification are switched
# off, and HTTP compression is enabled.
client = OpenSearch(
    hosts=[dict(host="localhost", port=9200)],
    use_ssl=False,
    verify_certs=False,
    http_compress=True,
)

# Connectivity check: print the cluster's info response
print(client.info())


{'name': 'opensearch-node1', 'cluster_name': 'opensearch-cluster', 'cluster_uuid': 'WHW2Xt5vQfqBA9trb4dorg', 'version': {'distribution': 'opensearch', 'number': '3.3.2', 'build_type': 'tar', 'build_hash': '6564992150e26aaa62d4522a220dfff5188aeb88', 'build_date': '2025-10-29T22:24:07.450919802Z', 'build_snapshot': False, 'lucene_version': '10.3.1', 'minimum_wire_compatibility_version': '2.19.0', 'minimum_index_compatibility_version': '2.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}


In [14]:
from bs4 import BeautifulSoup
import os, re
import spacy
import dateparser
from opensearchpy import OpenSearch
from geopy.geocoders import Nominatim
from opensearchpy.helpers import bulk


## Create Index With Correct Mappings + Analyzers

In [24]:
index_name = "index"

# Token filter referenced by the analyzer below: edge n-grams of 3 to 20 characters
_autocomplete_filter = {"type": "edge_ngram", "min_gram": 3, "max_gram": 20}

# Custom analyzer: standard tokenizer, then lowercase, then the edge n-gram filter
_autocomplete_analyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["lowercase", "autocomplete_filter"],
}

# Field definitions for the index, in the order they are sent to the cluster
_field_properties = {
    # Title uses the autocomplete analyzer, with "standard" as its search analyzer
    "title": {
        "type": "text",
        "analyzer": "autocomplete_analyzer",
        "search_analyzer": "standard",
    },
    "content": {"type": "text", "analyzer": "standard"},
    # Authors are nested objects with name parts as text and email as keyword
    "authors": {
        "type": "nested",
        "properties": {
            "first_name": {"type": "text"},
            "last_name": {"type": "text"},
            "email": {"type": "keyword"},
        },
    },
    "date": {"type": "date"},
    "geopoint": {"type": "geo_point"},
    # Array of extracted dates
    "temporal_expressions": {"type": "date"},
    # Array of extracted locations
    "georeferences": {"type": "geo_point"},
}

# Full index body: single shard, no replicas, plus the analysis chain and fields
mapping = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "analyzer": {"autocomplete_analyzer": _autocomplete_analyzer},
            "filter": {"autocomplete_filter": _autocomplete_filter},
        },
    },
    "mappings": {"properties": _field_properties},
}

# Create the index only when the cluster does not already have one by this name
if client.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists")
else:
    client.indices.create(index=index_name, body=mapping)
    print(f"Index '{index_name}' created successfully")



Index 'index' created successfully


In [25]:
import spacy
from dateparser import parse as parse_date
from geopy.geocoders import Nominatim

# Load the small English spaCy pipeline; its entity labels (DATE, GPE, LOC)
# are what the extraction helpers below filter on.
nlp = spacy.load("en_core_web_sm")

# Seconds to wait for a geocoding response. geopy's default is 1 second, which
# the public Nominatim service often exceeds (surfacing as GeocoderTimedOut).
GEOCODER_TIMEOUT_SECONDS = 10

# Nominatim geocoder client, identified by a custom user agent
geolocator = Nominatim(
    user_agent="smart_doc_system",
    timeout=GEOCODER_TIMEOUT_SECONDS,
)

def extract_temporal(text):
    """Collect parsed dates for every DATE entity spaCy finds in ``text``.

    Args:
        text: Raw text to run through the module-level ``nlp`` pipeline.

    Returns:
        list: The values returned by ``parse_date`` for each DATE entity, in
        document order; entities for which ``parse_date`` returns a falsy
        value are left out.
    """
    date_mentions = (ent.text for ent in nlp(text).ents if ent.label_ == "DATE")
    # Each mention is parsed exactly once; unparseable ones are filtered out below
    parsed_mentions = (parse_date(mention) for mention in date_mentions)
    return [moment for moment in parsed_mentions if moment]

def extract_georeferences(text):
    """Geocode the place names spaCy finds in ``text``.

    Every entity labelled ``GPE`` or ``LOC`` is looked up with the module-level
    ``geolocator``. Each distinct place name is sent to the geocoder at most
    once per call, and a geocoder failure for one name is logged and skipped
    instead of aborting the whole extraction.

    Args:
        text: Raw text to run through the module-level ``nlp`` pipeline.

    Returns:
        list[dict]: One ``{"lat": ..., "lon": ...}`` dict per resolved entity
        mention, in document order. Mentions the geocoder cannot resolve (or
        whose lookup failed) are omitted.
    """
    # Imported locally so this helper carries its own dependencies and works
    # regardless of which import cells have been run.
    import logging
    from geopy.exc import GeopyError

    doc = nlp(text)
    lookups = {}  # place name -> geocoder result (or None), memoised for this call
    coords = []
    for ent in doc.ents:
        if ent.label_ not in ("GPE", "LOC"):
            continue
        place = ent.text
        if place not in lookups:
            try:
                lookups[place] = geolocator.geocode(place)
            except GeopyError as exc:
                # Timeouts / service errors: skip this place, keep the rest
                logging.getLogger(__name__).warning(
                    "Geocoding %r failed: %s", place, exc
                )
                lookups[place] = None
        location = lookups[place]
        if location:
            coords.append({"lat": location.latitude, "lon": location.longitude})
    return coords

# Example indexing function
def index_document(doc_id, title, content, authors, date=None, geopoint=None):
    """Enrich one document with extracted dates and locations, then index it.

    Args:
        doc_id: Identifier passed to ``client.index`` as ``id``.
        title: Value stored in the ``title`` field.
        content: Value stored in ``content``; also the text that
            ``extract_temporal`` and ``extract_georeferences`` are run on.
        authors: Value stored in the ``authors`` field.
        date: Optional value for ``date``. When falsy, the first extracted
            temporal expression is used, or ``None`` if there is none.
        geopoint: Optional value for ``geopoint``. When falsy, the first
            extracted georeference is used, or ``None`` if there is none.

    Returns:
        None. The document is written to ``index_name`` via the module-level
        ``client``.
    """
    found_dates = extract_temporal(content)
    found_places = extract_georeferences(content)

    # Caller-supplied values win; otherwise fall back to the first extracted one
    primary_date = date
    if not primary_date:
        primary_date = found_dates[0] if found_dates else None

    primary_point = geopoint
    if not primary_point:
        primary_point = found_places[0] if found_places else None

    client.index(
        index=index_name,
        id=doc_id,
        body={
            "title": title,
            "content": content,
            "authors": authors,
            "date": primary_date,
            "geopoint": primary_point,
            "temporal_expressions": found_dates,
            "georeferences": found_places,
        },
    )


## Load & Parse Documents (SGML / HTML)