In [74]:
from elasticsearch import Elasticsearch
import elasticsearch
import json
from sentence_transformers import SentenceTransformer
from elasticsearch import helpers
from opensearchpy.helpers.actions import parallel_bulk,bulk
import numpy as np
print(elasticsearch.__version__)


(7, 10, 1)


In [95]:
# Créer une instance Elasticsearch
es = Elasticsearch(
    ["http://172.22.30.5:9200/"],
    verify_certs=False
)

In [76]:
index_name = "pubmed_2024"
# Configurer les paramètres de l'index
index_settings = {
  "settings": {
    "number_of_shards": 2,
    "number_of_replicas": 1,
    "index": {
                "number_of_shards": 1,
                "elastiknn": True,
            }
  },
  "mappings": {
      "medical": {
    "dynamic": "true",
    "_source": {
      "enabled": "true"
    },
    "properties": {
      "article_title": {
        "type": "text"
      },
      "id": {
  "type": "long"
},
      "article_abstract": {
        "type": "text"
      },
        "pub_date": {
                "type": "date",
                "format": "yyyy-MM-dd",
                "fields": {
                    "year": {"type": "keyword"},
                    "month": {"type": "keyword"},
                    "day": {"type": "keyword"}
                }
            },
      "text_vector": {
        "type": "dense_vector",
        "dims": 768
      }
    }
      }
  }
}

In [77]:
def create_or_refresh_index(es, index_name, index_settings):
    if es.indices.exists(index=index_name):
        print(f"Suppression de l'index existant : {index_name}")
        es.indices.delete(index=index_name)

    print(f"Création de l'index : {index_name}")
    es.indices.create(index=index_name, body=index_settings)

In [78]:
# create_or_refresh_index(es, index_name, index_settings)

In [79]:
def create_document(doc):
    year = doc['pub_date'].get('year', '')
    month = doc['pub_date'].get('month', '')
    day = doc['pub_date'].get('day', '')
    return {
        'article_title': doc.get('article_title',''),
        'article_abstract': doc.get('article_abstract',''),
        "pub_date": f"{year}-{month.zfill(2)}-{day.zfill(2)}"
    }

In [80]:
# Importer un JSON

with open("pubmed_article_december-2023.json") as f:
    data = json.load(f)

In [81]:
docs = data
id = 0
for doc in docs :
    docs_to_be_indexed = create_document(doc)
    response = es.index(index=index_name, body=docs_to_be_indexed, doc_type='_doc', id=id) # Pour indexer des documents
    id += 1
    print(response)



{'_index': 'pubmed_2024', '_type': '_doc', '_id': '0', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
{'_index': 'pubmed_2024', '_type': '_doc', '_id': '1', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
{'_index': 'pubmed_2024', '_type': '_doc', '_id': '2', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}
{'_index': 'pubmed_2024', '_type': '_doc', '_id': '3', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}
{'_index': 'pubmed_2024', '_type': '_doc', '_id': '4', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 4, '_primary_term': 1}
{'_index': 'pubmed_2024', '_type': '_doc', '_id': '5', '_version': 1, 'result': 'created', '_shards'

In [82]:
docs_to_be_indexed

{'article_title': 'Periodontitis and risk of cancer: Mechanistic evidence.',
 'article_abstract': 'This review aims to critically analyze the pathways of interaction and the pathogenic mechanisms linking periodontitis and oral bacteria with the initiation/progression of cancer at different body compartments. A higher risk of head and neck cancer has been consistently associated with periodontitis. This relationship has been explained by the local promotion of dysbiosis, chronic inflammation, immune evasion, and direct (epi)genetic damage to epithelial cells by periodontal pathobionts and their toxins. Epidemiological reports have also studied a possible link between periodontitis and the incidence of other malignancies at distant sites, such as lung, breast, prostate, and digestive tract cancers. Mechanistically, different pathways have been involved, including the induction of a chronic systemic inflammatory state and the spreading of oral pathobionts with carcinogenic potential. Inde

In [96]:
query = {
	"query": {
"bool": {
		"must": {
			"fuzzy": {
				"article_abstract": {"value": "halth", "fuzziness": 1}
			}
},
		"must_not": {
			"match": {
				"article_title": "brain"
			}
		},
		"should": {
			"match": {
				"article_title": "liberation.com"
			}
		}
	}
}
}

In [84]:
res = es.search(index=index_name, body=query)
print('\n******* DOCUMENTS ***************')
# Parcourir les résultats et afficher le contenu des documents
for hit in res['hits']['hits']:
    document = hit['_source']
    print(document)


******* DOCUMENTS ***************
{'article_title': "African Immigrant Women's Maternal Health Experiences in Clarkston, Georgia: A Qualitative Study.", 'article_abstract': 'The maternal health experiences of African immigrant women, their utilization of health care services, and the effects on maternal health have received limited attention in research. This research explored the maternal health experiences of African immigrant women residing in Clarkston, Georgia, and their use of health services.', 'pub_date': '2023-12-12'}
{'article_title': "Addressing the SUD training gap: Two pilot feasibility studies in the Department of Veteran's Affairs Health Care System.", 'article_abstract': 'Substance use disorders (SUDs) are an ongoing public health crisis in the United States. A large body of research indicates an urgent need for increased training in SUD research and treatment for trainees in mental health service disciplines. The VA Health Care System is well positioned, as the larges

In [85]:
res

{'took': 1155,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 928, 'relation': 'eq'},
  'max_score': 3.5023613,
  'hits': [{'_index': 'pubmed_2024',
    '_type': '_doc',
    '_id': '373',
    '_score': 3.5023613,
    '_ignored': ['article_abstract.keyword'],
    '_source': {'article_title': "African Immigrant Women's Maternal Health Experiences in Clarkston, Georgia: A Qualitative Study.",
     'article_abstract': 'The maternal health experiences of African immigrant women, their utilization of health care services, and the effects on maternal health have received limited attention in research. This research explored the maternal health experiences of African immigrant women residing in Clarkston, Georgia, and their use of health services.',
     'pub_date': '2023-12-12'}},
   {'_index': 'pubmed_2024',
    '_type': '_doc',
    '_id': '4916',
    '_score': 3.4712427,
    '_ignored': ['article_abstract.keyword'],
  

In [97]:
document = {
    "title": "Voiture hybride et voiture électrique : quelle sont les différences ?",
    "source": "challenges.fr",
    "date": "2022-11-30"
}
document_2 =  {
    "title": "éthique de l'intelligence artificielle",
    "source": "technology.com",
    "date": "2021-05-21"
}
document_3 = {
    "title": "Allemagne et énergie électrique",
    "source": "futur.com",
    "date": "2023-05-21"
}
document_4 = {
    "title": "La plage de Mai",
    "source": "liberation.com",
    "date": "2023-05-21"
}

In [98]:
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [88]:
###### with neural search
def create_document_emb(doc):
    embeddings = sbert_model.encode([doc.get('title', '')])[0]
    embeddings = [float(value) for value in embeddings]
    return {
    'source': doc.get('source',''),
    'date': doc.get('date',''),
    'title': doc['title'],
    'text_vector': list(embeddings)
}

In [89]:
docs   = [document, document_2, document_3, document_4]

list_docs = []
for doc in docs:
    docs_to_be_indexed = create_document_emb(doc)
    print(docs_to_be_indexed)
    list_docs.append(docs_to_be_indexed)

{'source': 'challenges.fr', 'date': '2022-11-30', 'title': 'Voiture hybride et voiture électrique : quelle sont les différences ?', 'text_vector': [-0.6531800627708435, -0.7892552018165588, 0.9064341187477112, 0.44022712111473083, 0.6747838854789734, -0.07035467773675919, -0.15778547525405884, 0.2515799403190613, 0.755397379398346, -0.4582359492778778, -0.33394643664360046, 0.8578912615776062, -0.33558782935142517, 0.7871348857879639, -0.2986924350261688, 0.6185118556022644, -0.8708463311195374, -0.3100438714027405, -0.0142916115000844, -0.001421110238879919, 0.10587534308433533, 0.5273206830024719, -0.4504958987236023, 0.061020102351903915, 0.47635766863822937, -0.4765840470790863, 0.4793623983860016, -1.153828740119934, -0.6109431982040405, 0.5583295226097107, 0.27552616596221924, 1.0591999292373657, -0.2828276753425598, -0.7385966777801514, -0.3196946382522583, 0.5391615033149719, -0.47649070620536804, -0.010746762156486511, -0.052733030170202255, -0.4603654146194458, 0.657118380069

In [99]:
index_name = "alternance_2024"
index_settings = {
  "settings": {
    "number_of_shards": 2,
    "number_of_replicas": 1,
    "index": {
                "number_of_shards": 1,
                "elastiknn": True
            }
  },
  "mappings": {
    "dynamic": "true",
    "_source": {
      "enabled": "true"
    },
    "properties": {
      "title": {
        "type": "text"
      },
      "id": {
         "type": "long"
      },
      "text_vector": {
        "type": "dense_vector",
        "dims": 768
      }
    }
  }
}


In [101]:
es.indices.create(index = index_name, body=index_settings)


RequestError: RequestError(400, 'illegal_argument_exception', 'unknown setting [index.elastiknn] please check that any required plugins are installed, or check the breaking changes documentation for removed settings')

In [None]:
def index_documents(index, doc_src):
    def bulk_docs(doc_src):
        i = 1
        for doc in doc_src:
            add_cmd = {"_index": index,
                       "_id": "doc_"+str(i),
                       "_source": doc}
            i+= 1
            print(add_cmd)
            yield add_cmd
    resp = bulk(es, bulk_docs(doc_src), chunk_size=1000)
    es.indices.refresh(index=index_name)

In [None]:
index_documents(index_name, list_docs)

{'_index': 'alternance_2024', '_id': 'doc_1', '_source': {'source': 'challenges.fr', 'date': '2022-11-30', 'title': 'Voiture hybride et voiture électrique : quelle sont les différences ?', 'text_vector': [-0.6531800627708435, -0.7892552018165588, 0.9064341187477112, 0.44022712111473083, 0.6747838854789734, -0.07035467773675919, -0.15778547525405884, 0.2515799403190613, 0.755397379398346, -0.4582359492778778, -0.33394643664360046, 0.8578912615776062, -0.33558782935142517, 0.7871348857879639, -0.2986924350261688, 0.6185118556022644, -0.8708463311195374, -0.3100438714027405, -0.0142916115000844, -0.001421110238879919, 0.10587534308433533, 0.5273206830024719, -0.4504958987236023, 0.061020102351903915, 0.47635766863822937, -0.4765840470790863, 0.4793623983860016, -1.153828740119934, -0.6109431982040405, 0.5583295226097107, 0.27552616596221924, 1.0591999292373657, -0.2828276753425598, -0.7385966777801514, -0.3196946382522583, 0.5391615033149719, -0.47649070620536804, -0.010746762156486511, -

In [None]:
#
query = "véhicule électrique"
query_vector = sbert_model.encode([query])[0]
query_vector = [float(value) for value in query_vector]

In [None]:
script_query = {
    "script_score": {
        "query": {"match_all": {}},
        "script": {
            "source": "cosineSimilarity(params['query_vector'], 'text_vector') + 1.0", # painless
            "params": {"query_vector": query_vector}
        }
    }
}

In [None]:
response = es.search(
    index=index_name,
    body={
        "size": 10,
        "query": script_query
    }
)
for hit in response['hits']['hits']:
    doc = hit['_source'] 
    print(doc)

RequestError: RequestError(400, 'search_phase_execution_exception', 'runtime error')