![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/45.VectorDB_and_PostProcessor_for_RAG_Generative_AI.ipynb)

# VectorDB, VectorDBPostProcessor and ContextSplitAssembler

# Setup



In [None]:
import json
import os

from google.colab import files

# Ask for the license file only when it is not already in the working directory;
# the uploaded file is renamed so that later runs find it under a fixed name.
if not os.path.exists('spark_jsl.json'):
    license_keys = files.upload()
    os.rename(next(iter(license_keys)), 'spark_jsl.json')

with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Publish every license entry twice: as a notebook variable (so `$NAME`
# expands in the `!` shell lines below) and as an environment variable.
locals().update(license_keys)
os.environ.update(license_keys)

In [None]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.5.1  spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
#! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET
! pip install --upgrade -q spark-nlp-jsl==5.4.0  --extra-index-url https://pypi.johnsnowlabs.com/5.4.0-db9b4c2dee65db7fa6100bcc05a0f609a2e1c396

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

In [None]:
import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.util import *
from sparknlp.common import *
from sparknlp.annotator import *
from sparknlp_jsl.base import *
from sparknlp_jsl.annotator import *

import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel

# Start the Spark session with the secret read from the license file rather
# than a credential hardcoded in the notebook. `use_vectordb=True` is kept
# from the original call.
spark = sparknlp_jsl.start(secret = license_keys["SECRET"], use_vectordb=True)

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Last expression of the cell: display the session object.
spark

Spark NLP Version : 5.4.0
Spark NLP_JSL Version : 5.4.0


# DATAFRAME

In [None]:
# Quietly download the sample CSV (pubmed_diabetes_1000_meta.csv) into the working directory.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/healthcare-nlp/data/pubmed_diabetes_1000_meta.csv

In [None]:
from pyspark.sql.types import IntegerType, StringType, StructType, StructField

# Every CSV column is declared as a nullable string field.
schema = StructType([
    StructField(column_name, StringType(), nullable = True)
    for column_name in (
        "row_id",
        "pubmed_id",
        "abstract",
        "pubdate",
        "title",
        "fulljournalname",
    )
])

In [None]:
# Read the CSV with the explicit schema (multi-line fields allowed), expose the
# abstract under the column name "text", and keep 100 rows.
df = (
    spark.read
    .option("header", "true")
    .option("multiLine", "true")
    .schema(schema)
    .csv(path = "./pubmed_diabetes_1000_meta.csv")
    .withColumnRenamed("abstract", "text")
    .limit(100)
)

print(f"count: {df.count()}")
print(f"columns: {df.columns}")

count: 100
columns: ['row_id', 'pubmed_id', 'text', 'pubdate', 'title', 'fulljournalname']


In [None]:
# Display the schema of the loaded DataFrame ("abstract" now appears as "text").
df.schema

StructType([StructField('row_id', StringType(), True), StructField('pubmed_id', StringType(), True), StructField('text', StringType(), True), StructField('pubdate', StringType(), True), StructField('title', StringType(), True), StructField('fulljournalname', StringType(), True)])

In [None]:
# Preview 20 rows, cutting each cell at 100 characters.
df.show(n=20, truncate=100)

+------+----------+----------------------------------------------------------------------------------------------------+-----------+----------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|row_id| pubmed_id|                                                                                                text|    pubdate|                                                                                               title|                                                                      fulljournalname|
+------+----------+----------------------------------------------------------------------------------------------------+-----------+----------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|     0|PMC3312572|The pituitary gland a

# Pipeline Stages


In [None]:
# Stage 1: wrap the raw "text" column into a document annotation.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document with the regex patterns below; with
# setExplodeSplits(True) the splits are exploded out of their source row.
splitter = (
    InternalDocumentSplitter()
    .setInputCols("document")
    .setOutputCol("split")
    .setSplitMode("regex")
    .setSplitPatterns(["\n", "\r\n", "\n\n"])
    .setExplodeSplits(True)
    .setEnableSentenceIncrement(True)
)

# Stage 3: sentence embeddings computed on each split.
embeddings = (
    BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli_onnx", "en", "clinical/models")
    .setInputCols(["split"])
    .setOutputCol("embeddings")
)

embedding_pipeline = Pipeline().setStages([document, splitter, embeddings])

sbiobert_base_cased_mli_onnx download started this may take some time.
Approximate size to download 384.4 MB
[OK!]


# Save Base DF

In [None]:
%%time
embedding_pipeline.fit(df).transform(df)\
    .write.mode("overwrite")\
    .parquet("./pubmed_df.parquet")

CPU times: user 34.4 ms, sys: 11.4 ms, total: 45.9 ms
Wall time: 2min 15s


In [None]:
# Reload the embedded DataFrame that was written to parquet above.
baseDF = spark.read.parquet("./pubmed_df.parquet")

`InternalDocumentSplitter` split each document on line-break characters.
Because `setExplodeSplits(True)` was set, the saved DataFrame has more rows than the original 100 documents.

In [None]:
# Preview 20 rows (cells cut at 100 characters), then display the row count.
baseDF.show(n=20, truncate=100)
baseDF.count()

+------+----------+----------------------------------------------------------------------------------------------------+-----------+----------------------------------------------------------------------------------------------------+------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|row_id| pubmed_id|                                                                                                text|    pubdate|                                                                                               title|                     fulljournalname|                                                                                            document|                                                       

243

# VectorDB Stage

## Train a new VectorDB model

In [None]:
# Fit a VectorDB model on the embedded splits. "pubmed_id" is the row
# identifier and the three metadata columns are stored with each entry.
vector_db = (
    VectorDBApproach()
    .setInputCols("embeddings", "split")
    .setOutputCol("vector_db")
    .setMetaDataFields(["fulljournalname", "title", "pubdate"])
    .setTopK(40)
    .setIdentifierCol("pubmed_id")
    .fit(baseDF)
)

Functions:
- setMetaDataFields: Columns in the dataframe from which to create partitions
- setExcludeSelf: whether to include the row identifier as a candidate neighbor
- setSimilarityThreshold: do not return neighbors further away than this distance
- setNumReplicas: number of index replicas to create when querying
- setIdentifierCol: column name for the row identifier
- setDistanceFunction: distance function to use
- setTopK: number of neighbors to find

## Save a new VectorDB model

In [None]:
# Persist the fitted VectorDB model, overwriting any model already saved at this path.
vector_db.write().overwrite().save("./models/new_vector_db_model")

## Load a saved model

In [None]:
# Query-time pipeline: a query is embedded as one whole document (no splitter),
# so the embeddings stage reads from "document" rather than "split".
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

embeddings = BertSentenceEmbeddings\
    .pretrained("sbiobert_base_cased_mli_onnx", "en", "clinical/models") \
    .setInputCols(["document"]) \
    .setOutputCol("embeddings")

# Load the VectorDB model saved above and ask for 10 neighbors per query.
# (No trailing line-continuation after the last setter: a dangling backslash
# would silently join whatever is written on the next line.)
vector_db_loaded = VectorDBModel.load("./models/new_vector_db_model")\
    .setInputCols("embeddings", "document")\
    .setOutputCol("vector_db")\
    .setTopK(10)

# The embeddings stage is already configured to read "document", so it is
# passed as-is instead of being re-configured inside the stage list.
pipeline = Pipeline().setStages([
    document,
    embeddings,
    vector_db_loaded
])

# Fit on a one-row placeholder DataFrame whose column is named "text", the
# column DocumentAssembler reads.
pipeline_model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))

sbiobert_base_cased_mli_onnx download started this may take some time.
Approximate size to download 384.4 MB
[OK!]


## QUERY

In [None]:
query_1 = "What are the causes of diabetes?"
query_2 = "relationship between diabetes and obesity"

# One single-column row per query.
queries = [[query] for query in (query_1, query_2)]

query_df = spark.createDataFrame(queries).toDF("text")
query_df.show(truncate=False)

+-----------------------------------------+
|text                                     |
+-----------------------------------------+
|What are the causes of diabetes?         |
|relationship between diabetes and obesity|
+-----------------------------------------+



In [None]:
# Run both queries through the query pipeline and show each query next to
# its raw "vector_db" annotations.
result_df = pipeline_model.transform(query_df)
result_df.select("text", "vector_db").show(truncate=False)

+-----------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# One output row per returned annotation, paired with its query text.
result_df.select(
    "text",
    F.explode("vector_db").alias("vector_db_annotations"),
).show(truncate=False)

+-----------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Check the metadata

In [None]:
# One output row per annotation, showing only that annotation's metadata.
result_df.select(
    F.explode("vector_db.metadata").alias("vector_db_annotation_metadata")
).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Check the similarity

In [None]:
# Look up the indexed split text stored for one specific article id.
(
    baseDF
    .select("pubmed_id", "split.result")
    .where(F.col("pubmed_id") == "PMC7730786")
    .show(truncate=False)
)

+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|pubmed_id |result                                                                                                                                                                                                                                                                                                                                     

In [None]:
# Same query as before, now with a "fulljournalname" column supplied next to the text.
spark_df = spark.createDataFrame(
    [[query_1, "Italian Journal of Pediatrics"]],
    ["text", "fulljournalname"],
)

(
    pipeline_model
    .transform(spark_df)
    .select(F.explode("vector_db.metadata").alias("vector_db_annotation_metadata"))
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Repeat the query with a placeholder "fulljournalname" value.
spark_df = spark.createDataFrame(
    [[query_1, "XXXXXXX"]],
    ["text", "fulljournalname"],
)

(
    pipeline_model
    .transform(spark_df)
    .select(F.explode("vector_db.metadata").alias("vector_db_annotation_metadata"))
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Check the setTopK()

In [None]:
# The VectorDB model is the last pipeline stage; lower its TopK to 5.
vector_db_stage = pipeline_model.stages[-1]
vector_db_stage.setTopK(5)

spark_df = spark.createDataFrame(
    [[query_1, "Italian Journal of Pediatrics"]],
    ["text", "fulljournalname"],
)

(
    pipeline_model
    .transform(spark_df)
    .select(F.explode("vector_db.metadata").alias("vector_db_annotation_metadata"))
    .show(truncate=False)
)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|vector_db_annotation_metadata                                                                                                                                                                                           

In [None]:
# One row per retrieved text, next to the query that produced it.
result_df.select(
    "text",
    F.explode("vector_db.result").alias("results"),
).show(truncate=100)

+-----------------------------------------+----------------------------------------------------------------------------------------------------+
|                                     text|                                                                                             results|
+-----------------------------------------+----------------------------------------------------------------------------------------------------+
|         What are the causes of diabetes?|Current evidence suggests that the information needs of people with diabetes mellitus differ acro...|
|         What are the causes of diabetes?|Background: Diabetes is referred to a group of diseases characterized by high glucose levels in b...|
|         What are the causes of diabetes?|Adiponectin and pro‐inflammatory cytokines are associated with type 2 diabetes mellitus, and migh...|
|         What are the causes of diabetes?|Excessive glucocorticoid secretion has been associated with type 2 diabetes mellitus (T

In [None]:
# Flatten the "vector_db" annotations into plain columns with readable aliases.
flattener = (
    Flattener()
    .setInputCols("vector_db")
    .setExplodeSelectedFields({
        "vector_db": [
            "metadata.target_text as Questions",
            "result as Text",
            "metadata.distance as Distance",
            "metadata.pubdate as Date",
            "metadata.code as ID",
            "metadata.title as Title",
        ]
    })
)

flattener.transform(result_df).show(truncate=100)

+-----------------------------------------+----------------------------------------------------------------------------------------------------+----------+-----------+-----------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                Questions|                                                                                                Text|  Distance|       Date|                                             ID|                                                                                               Title|
+-----------------------------------------+----------------------------------------------------------------------------------------------------+----------+-----------+-----------------------------------------------+----------------------------------------------------------------------------------------------------+
|         What are the causes of diabetes?|Curren

In [None]:
# Same flattening without aliases: the output columns get their default names.
flattener = (
    Flattener()
    .setInputCols("vector_db")
    .setExplodeSelectedFields({
        "vector_db": [
            "metadata.token",
            "metadata.distance",
            "metadata.pubdate",
            "metadata.code",
            "metadata.title",
        ]
    })
)

flattener.transform(result_df).show(truncate=False)

+-----------------------------------------+---------------------------+--------------------------+-----------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
|vector_db_metadata_token                 |vector_db_metadata_distance|vector_db_metadata_pubdate|vector_db_metadata_code                        |vector_db_metadata_title                                                                                                                      |
+-----------------------------------------+---------------------------+--------------------------+-----------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
|What are the causes of diabetes?         |0.25732908                 |2020 Dec 10               |PMC7730786-55f338b0-af3b-4e62-91

## Check setSimilarityThreshold

In [None]:
# Set the similarity threshold on the VectorDB stage (last pipeline stage)
# to 0.30, then re-run the queries.
pipeline_model.stages[-1].setSimilarityThreshold(0.30)
result_df = pipeline_model.transform(query_df)

flattener = (
    Flattener()
    .setInputCols("vector_db")
    .setExplodeSelectedFields({
        "vector_db": [
            "metadata.token",
            "metadata.distance",
            "metadata.code",
            "metadata.title",
        ]
    })
)

flattener.transform(result_df).show(truncate=False)

+-----------------------------------------+---------------------------+-----------------------------------------------+-------------------------------------------------------------------------------------------------------------+
|vector_db_metadata_token                 |vector_db_metadata_distance|vector_db_metadata_code                        |vector_db_metadata_title                                                                                     |
+-----------------------------------------+---------------------------+-----------------------------------------------+-------------------------------------------------------------------------------------------------------------+
|What are the causes of diabetes?         |0.25732908                 |PMC7730786-55f338b0-af3b-4e62-9195-a2149242b318|Different information needs in subgroups of people with diabetes mellitus: a latent class analysis           |
|relationship between diabetes and obesity|0.18294283                 |PMC541548

In [None]:
# Raise the similarity threshold on the VectorDB stage to 0.35 and re-run
# the same queries for comparison.
pipeline_model.stages[-1].setSimilarityThreshold(0.35)
result_df = pipeline_model.transform(query_df)

flattener = (
    Flattener()
    .setInputCols("vector_db")
    .setExplodeSelectedFields({
        "vector_db": [
            "metadata.token",
            "metadata.distance",
            "metadata.code",
            "metadata.title",
        ]
    })
)

flattener.transform(result_df).show(truncate=False)

+-----------------------------------------+---------------------------+-----------------------------------------------+-------------------------------------------------------------------------------------------------------------+
|vector_db_metadata_token                 |vector_db_metadata_distance|vector_db_metadata_code                        |vector_db_metadata_title                                                                                     |
+-----------------------------------------+---------------------------+-----------------------------------------------+-------------------------------------------------------------------------------------------------------------+
|What are the causes of diabetes?         |0.25732908                 |PMC7730786-55f338b0-af3b-4e62-9195-a2149242b318|Different information needs in subgroups of people with diabetes mellitus: a latent class analysis           |
|What are the causes of diabetes?         |0.32312741                 |PMC714603

# VectorDBPostProcessor

## Save VectorDB DF

In [None]:
# Query used for the VectorDBPostProcessor examples.
query2 = "What are the effects of diabetes on the body?"

In [None]:
# Reconfigure the VectorDB stage (threshold 0.99, TopK 20), then run query2
# and store only the "vector_db" column as parquet.
pipeline_model.stages[-1].setSimilarityThreshold(0.99).setTopK(20)

(
    pipeline_model
    .transform(spark.createDataFrame([[query2]]).toDF("text"))
    .select("vector_db")
    .write
    .mode("overwrite")
    .parquet("/content/models/vector_df")
)

## Read Vector DF

In [None]:
# Reload the stored VectorDB output for query2.
vector_df = spark.read.parquet("/content/models/vector_df")

In [None]:
# One row per stored annotation.
vector_df.select(F.explode("vector_db").alias("vector_db")).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Flatten selected metadata fields of the stored annotations into columns.
(
    Flattener()
    .setInputCols("vector_db")
    .setExplodeSelectedFields({
        "vector_db": [
            "metadata.distance",
            "metadata.pubdate",
            "metadata.title",
            "metadata.fulljournalname",
        ]
    })
    .transform(vector_df)
    .show(truncate=False)
)

+---------------------------+--------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|vector_db_metadata_distance|vector_db_metadata_pubdate|vector_db_metadata_title                                                                                                                             |vector_db_metadata_fulljournalname                                                   |
+---------------------------+--------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|0.20076137                 |2020 Dec 10               |Different information needs in subgroups of people with diabetes 

## VectorDBPostProcessor Initialize

VectorDBPostProcessor
````
Parameters
----------
filterBy: str
    The filterBy parameter is used to select and prioritize filter options.
    Options: `metadata` and `diversity_by_threshold`.
    Options can be given as a comma-separated string such as "metadata, diversity_by_threshold". The filters are applied to the annotations in the order given.
    - `metadata`: Filter by metadata fields. The `metadataCriteria` parameter should be set.
    - `diversity_by_threshold`: Filter by diversity threshold. Filter by the distance between the sorted annotations.
    When `diversity_by_threshold` option is set, `diversityThreshold` parameter should be used to set the threshold.
    Default: `metadata`
sortBy: str
    The sortBy parameter is used to select sorting option.
    Options: `ascending`, `descending`, `lost_in_the_middle`, `diversity`.
    - `ascending`: Sort by ascending order of distance.
    - `descending`: Sort by descending order of distance.
    - `lost_in_the_middle`: Sort by lost in the middle ranker. Let's say we have 5 annotations with distances [1, 2, 3, 4, 5]. The lost in the middle ranker will sort them as [1, 3, 5, 4, 2].
    - `diversity`: Sort by diversity ranker. The annotations are sorted by distance and the first annotation is selected; each subsequent annotation is the one with the maximum average distance from the annotations already selected.
    Default: `ascending`
caseSensitive: bool
    Whether the criteria of the string operators are case sensitive or not.
    For example, if set to False, the operator "equals" will match "John" with "john".
    Default: False
diversityThreshold: float
    The diversityThreshold parameter is used to set the threshold for the `diversity_by_threshold` filter.
    The `diversity_by_threshold` filter selects the annotations by the distance between the sorted annotations.
    Default: 0.01
maxTopKAfterFiltering: int
    The maxTopKAfterFiltering parameter is used to set the maximum number of annotations to return after filtering.
    If the number of annotations after filtering is greater than maxTopKAfterFiltering, the top maxTopKAfterFiltering annotations are selected.
    Default: 20
allowZeroContentAfterFiltering: bool
    Whether to allow zero annotation after filtering.
    If set to True, the output may contain zero annotation if all annotations are filtered out.
    If set to False, the output will contain at least one annotation whenever possible.
    Default: False
metadataCriteria: list[dict]
    The metadataCriteria parameter is used to filter the annotations by metadata fields.
    The metadataCriteria param is a list of dictionaries.
    A dictionary should contain the following keys:
    
    - `field`: The field of the metadata to filter.
    - `fieldType`: The type of the field to filter. Options: string, int, float, date.
    - `operator`: The operator to apply to the filter. Options: equals, not_equals, greater_than, greater_than_or_equals, less_than, less_than_or_equals, contains, not_contains, regex.
    - `value`: The value to filter.
    - `matchMode`: The match mode to apply to the filter. Options: any, all, none.
    - `matchValues`: The values to filter.
    - `dateFormats`: The date formats to parse the date metadata field.
    - `converterFallback`: The converter fallback when hitting cast exception. Options: filter, not_filter, error.
    
    Notes:
    ------
    - `field`, `fieldType`, and `operator` are required. Other keys are optional.
    - If `fieldType` is set to `string`, the supported operators are: equals, not_equals, contains, not_contains, regex.
    - If `fieldType` is set to `int`, `float`, or `date`, the supported operators are: equals, not_equals, greater_than, greater_than_or_equals, less_than, less_than_or_equals.
    - If `matchMode` and `matchValues` are not set, `value` must be set.
    - If `value` is set, `matchMode` and `matchValues` are ignored.
    - If `fieldType` is set to `date`, `dateFormats` must be set.
    - `matchMode` and `matchValues` must be set together.
    - If `converterFallback` is set to `error`, the filter will throw an error when hitting cast exception. Default 'error'.
````


In [None]:
# Metadata filter with three criteria:
#   - pubdate, parsed with the given date formats, later than 2017 May 11
#     (converterFallback is "filter")
#   - distance below 0.5470
#   - title containing any of the listed phrases
post_processor = (
    VectorDBPostProcessor()
    .setInputCols("vector_db")
    .setOutputCol("post")
    .setFilterBy("metadata")
    .setMetadataCriteria([
        {
            "field": "pubdate",
            "fieldType": "date",
            "operator": "greater_than",
            "value": "2017 May 11",
            "dateFormats": ["yyyy MMM dd", "yyyy MMM d"],
            "converterFallback": "filter",
        },
        {
            "field": "distance",
            "fieldType": "float",
            "operator": "less_than",
            "value": "0.5470",
        },
        {
            "field": "title",
            "fieldType": "string",
            "operator": "contains",
            "matchMode": "any",
            "matchValues": ["diabetes", "immune system"],
        },
    ])
)

# Flatten the post-processed annotations into plain columns.
flattener = (
    Flattener()
    .setInputCols("post")
    .setExplodeSelectedFields({
        "post": [
            "metadata.distance",
            "metadata.score",
            "metadata.pubdate",
            "metadata.title",
            "metadata.fulljournalname",
        ]
    })
)

In [None]:
# Apply the metadata filter to the stored annotations and show the flattened result.
(
    flattener
    .transform(post_processor.transform(vector_df))
    .show(truncate=False)
)

+----------------------+-------------------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|post_metadata_distance|post_metadata_score|post_metadata_pubdate|post_metadata_title                                                                                                                                  |post_metadata_fulljournalname                                                        |
+----------------------+-------------------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|0.20076137            |0.79923863         |2020 Dec 10          |Different information nee

## Check sortBy lost_in_the_middle

In [None]:
# Same filter as before, with the results ordered by the "lost_in_the_middle" strategy.
# NOTE(review): the setter is called on the shared `post_processor`, so this setting
# presumably persists into later cells until it is overridden — confirm.
(
    flattener
    .transform(
        post_processor
        .setSortBy("lost_in_the_middle")
        .transform(vector_df)
    )
    .show(truncate=False)
)

+----------------------+-------------------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|post_metadata_distance|post_metadata_score|post_metadata_pubdate|post_metadata_title                                                                                                                                  |post_metadata_fulljournalname                                                        |
+----------------------+-------------------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|0.20076137            |0.79923863         |2020 Dec 10          |Different information nee

## Check sortBy diversity

In [None]:
# Switch the sort strategy to "diversity" and display the flattened result.
(
    flattener
    .transform(
        post_processor
        .setSortBy("diversity")
        .transform(vector_df)
    )
    .show(truncate=False)
)

+----------------------+-------------------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|post_metadata_distance|post_metadata_score|post_metadata_pubdate|post_metadata_title                                                                                                                                  |post_metadata_fulljournalname                                                        |
+----------------------+-------------------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|0.20076137            |0.79923863         |2020 Dec 10          |Different information nee

## Check sortBy ascending

In [None]:
# Switch the sort strategy to "ascending" and display the flattened result.
(
    flattener
    .transform(
        post_processor
        .setSortBy("ascending")
        .transform(vector_df)
    )
    .show(truncate=False)
)

+----------------------+-------------------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|post_metadata_distance|post_metadata_score|post_metadata_pubdate|post_metadata_title                                                                                                                                  |post_metadata_fulljournalname                                                        |
+----------------------+-------------------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|0.20076137            |0.79923863         |2020 Dec 10          |Different information nee

## Check filterBy diversityByThreshold

In [None]:
# Combine the metadata filter with the diversity-by-threshold filter
# (threshold 0.05) and display the flattened result.
(
    flattener
    .transform(
        post_processor
        .setFilterBy("metadata, diversity_by_threshold")
        .setDiversityThreshold(0.05)
        .transform(vector_df)
    )
    .show(truncate=False)
)

+----------------------+-------------------+---------------------+--------------------------------------------------------------------------------------------------+-----------------------------+
|post_metadata_distance|post_metadata_score|post_metadata_pubdate|post_metadata_title                                                                               |post_metadata_fulljournalname|
+----------------------+-------------------+---------------------+--------------------------------------------------------------------------------------------------+-----------------------------+
|0.20076137            |0.79923863         |2020 Dec 10          |Different information needs in subgroups of people with diabetes mellitus: a latent class analysis|BMC Public Health            |
|0.27923813            |0.72076187         |2020 Mar 30          |Prevalence and incidence of type 1 diabetes in the world: a systematic review and meta-analysis   |Health Promotion Perspectives|
|0.34893499         

## Check metadataCriteria

In [None]:
# Reset to a metadata-only filter and keep results whose journal name equals
# any of the listed journals.
(
    flattener
    .transform(
        post_processor
        .setSortBy("ascending")
        .setFilterBy("metadata")
        .setMetadataCriteria([
            {
                "field": "fulljournalname",
                "fieldType": "string",
                "operator": "equals",
                "matchMode": "any",
                "matchValues": ["Journal of Diabetes Investigation", "BMC Public Health"],
            }
        ])
        .transform(vector_df)
    )
    .show(truncate=False)
)

+----------------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+
|post_metadata_distance|post_metadata_score|post_metadata_pubdate|post_metadata_title                                                                                                                           |post_metadata_fulljournalname    |
+----------------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+
|0.20076137            |0.79923863         |2020 Dec 10          |Different information needs in subgroups of people with diabetes mellitus: a latent class analysis                                            |BMC Public Health                |
|0.26189331            |

In [None]:
# Two criteria: a journal-name match plus a publication-date exclusion.
(
    flattener
    .transform(
        post_processor
        .setSortBy("ascending")
        .setFilterBy("metadata")
        .setMetadataCriteria([
            # journal name equals any of the listed journals
            {
                "field": "fulljournalname",
                "fieldType": "string",
                "operator": "equals",
                "matchMode": "any",
                "matchValues": ["Journal of Diabetes Investigation", "BMC Public Health"],
            },
            # pubdate differs from all of the listed dates
            {
                "field": "pubdate",
                "fieldType": "date",
                "operator": "not_equals",
                "matchMode": "all",
                "matchValues": ["2017Mar27", "2017May03"],
                "dateFormats": ["yyyy MMM dd", "yyyy MMM d", "yyyy MMM", "yyyyMMMdd"],
            },
        ])
        .transform(vector_df)
    )
    .show(truncate=False)
)

+----------------------+-------------------+---------------------+------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+
|post_metadata_distance|post_metadata_score|post_metadata_pubdate|post_metadata_title                                                                                                                 |post_metadata_fulljournalname    |
+----------------------+-------------------+---------------------+------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+
|0.20076137            |0.79923863         |2020 Dec 10          |Different information needs in subgroups of people with diabetes mellitus: a latent class analysis                                  |BMC Public Health                |
|0.26189331            |0.73810669         |2016 Oct 30         

## Check between (range filter built from two criteria)

In [None]:
# Express a "between" filter on distance as a lower-bound plus an upper-bound criterion.
(
    flattener
    .transform(
        post_processor
        .setSortBy("ascending")
        .setFilterBy("metadata")
        .setMetadataCriteria([
            {
                "field": "distance",
                "fieldType": "float",
                "operator": "greater_than",
                "value": "0.30",
            },
            {
                "field": "distance",
                "fieldType": "float",
                "operator": "less_than",
                "value": "0.39",
            },
        ])
        .transform(vector_df)
    )
    .show(truncate=False)
)

+----------------------+-------------------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|post_metadata_distance|post_metadata_score|post_metadata_pubdate|post_metadata_title                                                                                                                                  |post_metadata_fulljournalname                                                        |
+----------------------+-------------------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|0.30312586            |0.69687414         |2019 Jun 6           |Salivary cortisol is not 

## Check maxTopKAfterFiltering

In [None]:
# Same distance range filter as the "between" example, with maxTopKAfterFiltering set to 5.
(
    flattener
    .transform(
        post_processor
        .setSortBy("ascending")
        .setFilterBy("metadata")
        .setMaxTopKAfterFiltering(5)
        .setMetadataCriteria([
            {
                "field": "distance",
                "fieldType": "float",
                "operator": "greater_than",
                "value": "0.30",
            },
            {
                "field": "distance",
                "fieldType": "float",
                "operator": "less_than",
                "value": "0.39",
            },
        ])
        .transform(vector_df)
    )
    .show(truncate=False)
)

+----------------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|post_metadata_distance|post_metadata_score|post_metadata_pubdate|post_metadata_title                                                                                                                           |post_metadata_fulljournalname                                                        |
+----------------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|0.30312586            |0.69687414         |2019 Jun 6           |Salivary cortisol is not associated with incid

## Check allowZeroContentAfterFiltering

In [None]:
# Use a criterion on distance == 1.0 with allowZeroContentAfterFiltering disabled,
# to show what is returned in that configuration.
(
    flattener
    .transform(
        post_processor
        .setSortBy("ascending")
        .setFilterBy("metadata")
        .setAllowZeroContentAfterFiltering(False)  # Default: False
        .setMetadataCriteria([
            {
                "field": "distance",
                "fieldType": "float",
                "operator": "equals",
                "value": "1.0",
            },
        ])
        .transform(vector_df)
    )
    .show(truncate=False)
)

+----------------------+-------------------+---------------------+--------------------------------------------------------------------------------------------------+-----------------------------+
|post_metadata_distance|post_metadata_score|post_metadata_pubdate|post_metadata_title                                                                               |post_metadata_fulljournalname|
+----------------------+-------------------+---------------------+--------------------------------------------------------------------------------------------------+-----------------------------+
|0.20076137            |0.79923863         |2020 Dec 10          |Different information needs in subgroups of people with diabetes mellitus: a latent class analysis|BMC Public Health            |
+----------------------+-------------------+---------------------+--------------------------------------------------------------------------------------------------+-----------------------------+



# ContextSplitAssembler

`ContextSplitAssembler` is compatible with `VectorDB` and `VectorDBPostProcessor`.
```python
Parameters
    ----------
    joinString : str
        This parameter specifies the string that will be inserted between results of annotations when combining them into a single result.
        It acts as a delimiter, ensuring that the elements are properly separated and organized in the final result of annotation.
        Default: " ".
    explodeSplits : bool
        Whether to explode the splits into separate annotations or not.
        Default: False.
  ```

In [None]:
# Assembler that joins the input annotations with a newline into the `document`
# column, without exploding the splits.
# NOTE(review): the input column is `vector_db`, not the post-processor's `post`
# output, even though later cells feed it `post_processor.transform(vector_df)` —
# confirm this is intended.
context_split_assembler = (
    ContextSplitAssembler()
    .setInputCols("vector_db")
    .setOutputCol("document")
    .setJoinString("\n")
    .setExplodeSplits(False)
)


In [None]:
# Show the raw VectorDB annotations, one per row.
(
    vector_df
    .selectExpr("explode(vector_db) as vector_db")
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Run the post-processor and the assembler, then show each `document` annotation on its own row.
(
    context_split_assembler
    .transform(post_processor.transform(vector_df))
    .selectExpr("explode(document) as document")
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Take the first `document` annotation of the first collected row and
# convert it from a Row into an Annotation object.
doc = Annotation.fromRow(
    context_split_assembler
    .transform(post_processor.transform(vector_df))
    .selectExpr("document")
    .collect()[0]["document"][0]
)

In [None]:
# Print the text held by the `doc` annotation.
print(doc.result)

Current evidence suggests that the information needs of people with diabetes mellitus differ across patient groups. With a view to being able to provide individualized information, this study aims to identify (i) the diabetes-related information needs of people with diabetes mellitus; (ii) different subgroups of people with specific information needs; and (iii) associated characteristics of the identified subgroups, such as sociodemographic characteristics, diabetes-related comorbidities, and well-being.
Adiponectin and pro‐inflammatory cytokines are associated with type 2 diabetes mellitus, and might serve as a prognostic marker and a therapeutic intervention for overweight‐related type 2 diabetes mellitus.
Background: Diabetes is referred to a group of diseases characterized by high glucose levels in blood. It is caused by a deficiency in the production or function of insulin or both, which can occur because of different reasons, resulting in protein and lipid metabolic disorders. Th

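**Note:** The VectorDB metadata values (e.g. distance, title, pubdate) are lost in the assembled annotation — only the `sentence` and `document` keys remain, as shown below.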

In [None]:
# Print the annotation's end offset, annotator type and metadata, one per line.
print(doc.end, doc.annotatorType, doc.metadata, sep="\n")

7518
document
{'sentence': '0', 'document': '0'}


## Check setExplodeSplits(True)

In [None]:
# Enable explodeSplits on the assembler, then show each `document` annotation on its own row.
(
    context_split_assembler
    .setExplodeSplits(True)
    .transform(post_processor.transform(vector_df))
    .selectExpr("explode(document) as document")
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Collect the `document` annotations of the first row with explodeSplits enabled.
collected = (
    context_split_assembler
    .setExplodeSplits(True)
    .transform(post_processor.transform(vector_df))
    .selectExpr("document")
    .collect()[0]["document"]
)

# Report the begin/end offsets and metadata of every annotation.
for row in collected:
    anno = Annotation.fromRow(row)
    print(f"Begin: {anno.begin}, End: {anno.end}, metadata: {anno.metadata}")

Begin: 0, End: 508, metadata: {'sentence': '0', 'document': '0'}
Begin: 509, End: 715, metadata: {'sentence': '1', 'document': '1'}
Begin: 716, End: 1107, metadata: {'sentence': '2', 'document': '2'}
Begin: 1108, End: 1380, metadata: {'sentence': '3', 'document': '3'}
Begin: 1381, End: 1801, metadata: {'sentence': '4', 'document': '4'}
Begin: 1802, End: 2022, metadata: {'sentence': '5', 'document': '5'}
Begin: 2023, End: 2661, metadata: {'sentence': '6', 'document': '6'}
Begin: 2662, End: 2892, metadata: {'sentence': '7', 'document': '7'}
Begin: 2893, End: 3629, metadata: {'sentence': '8', 'document': '8'}
Begin: 3630, End: 3745, metadata: {'sentence': '9', 'document': '9'}
Begin: 3746, End: 4609, metadata: {'sentence': '10', 'document': '10'}
Begin: 4610, End: 4947, metadata: {'sentence': '11', 'document': '11'}
Begin: 4948, End: 5921, metadata: {'sentence': '12', 'document': '12'}
Begin: 5922, End: 6235, metadata: {'sentence': '13', 'document': '13'}
Begin: 6236, End: 6437, metadata: