In [1]:
from typing import Any

import dotenv
import pandas as pd

In [2]:
from llama_index.core import Document
from llama_index.core.node_parser import (
    HierarchicalNodeParser,
    SentenceSplitter,
)

In [3]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core import StorageContext
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import get_leaf_nodes, get_root_nodes

In [4]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex, ServiceContext, Document
from llama_index.core.storage.docstore import SimpleDocumentStore

In [5]:
from llama_index.vector_stores.qdrant import QdrantVectorStore

In [6]:
import pandas as pd

In [7]:
import qdrant_client

In [8]:
# Qdrant client opened with a filesystem path ("qdrant_adverts") instead of a server URL.
client = qdrant_client.QdrantClient(path="qdrant_adverts")

In [185]:
# file_path1 = "results/advert_comparison_cleaned.csv"

# advert_comparison_cleaned = pd.read_csv(file_path1)

# Load the red-flag model results; the `advert` and `IDn` columns are read below.
redflags = pd.read_csv('results/redflag_model_result_2024-08-15T12:14:35_04754c.csv')

# One Document per advert. The IDn stays in the metadata unchanged and is also used as
# the document id. `id_` is a string field, so the conversion is done explicitly here
# instead of assigning the raw number afterwards and relying on implicit coercion.
documents = [
    Document(
        text=row['advert'],
        metadata={'IDn': row['IDn']},
        id_=str(row['IDn']),
    )
    for _, row in redflags.iterrows()
]

In [187]:
documents[20]

Document(id_='573156', embedding=None, metadata={'IDn': 573156}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='DELIVERY DRIVERBecome A Driver PartnerBecome a Driver Partner for Mr D food |takealot.com|superbalist.com!Earn between R5,000 – R12,000 per month or R200 – R350 per deliveryWhat you need for you to qualify: A clear criminal background check.valid South African drivers license. An Android smartphone V6 or higherA South African ID or work permit for foreign nationals.Start you application here:https://tinyurl.com/MrDdriver', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [188]:
# Split the documents with the hierarchical parser's default settings.
# `nodes` holds every level of the hierarchy; leaves and roots are extracted from it
# separately further down with get_leaf_nodes / get_root_nodes.
node_parser = HierarchicalNodeParser.from_defaults()
nodes = node_parser.get_nodes_from_documents(documents)
len(nodes)

323

In [189]:
# Store all parsed nodes (not only the leaves) in a SimpleDocumentStore. This docstore
# is wrapped in the storage context that is later handed to AutoMergingRetriever.
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)

In [190]:
# Expose the "adverts" collection of the Qdrant client as a LlamaIndex vector store.
vector_store = QdrantVectorStore(collection_name="adverts", client=client)

In [191]:
# Bundle the node docstore and the Qdrant vector store into a single storage context.
storage_context = StorageContext.from_defaults(
    docstore=docstore,
    vector_store=vector_store,
)

In [192]:
# Leaf nodes of the parsed hierarchy; these are the nodes passed to VectorStoreIndex below.
leaf_nodes = get_leaf_nodes(nodes)
len(leaf_nodes)

123

In [193]:
# Root nodes of the parsed hierarchy; only counted here, not used by any visible cell.
root_nodes = get_root_nodes(nodes)
len(root_nodes)

100

In [207]:
# OpenAI LLM handle (gpt-4o); nothing in this cell consumes it.
llm = OpenAI(model="gpt-4o")

# Build the vector index from the leaf nodes only, backed by the shared storage context.
base_index = VectorStoreIndex(nodes=leaf_nodes, storage_context=storage_context)

# Plain similarity retriever over that index, returning the top 3 matches per query.
base_retriever = base_index.as_retriever(similarity_top_k=3)

In [208]:
# Wrap the base retriever so retrieved nodes can be merged into their parent node
# (the verbose output reports each merge and the parent node id).
retriever = AutoMergingRetriever(
    vector_retriever=base_retriever,
    storage_context=storage_context,
    verbose=True,
)

In [209]:
# Question sent to the query engine. Written as adjacent string literals so the prompt
# is one clean line: the previous triple-quoted form embedded stray double quotes,
# a newline and indentation into the text, and was missing the word "in".
# The trailing space separates the question from whatever schema text is appended to it.
prompt = (
    "Which documents assure applicants that qualifications or experience are not important? "
    "Return your analysis in the following list of JSON formats: "
)

In [210]:
# Output schema appended to the question: a bracketed list of single-quoted JSON-like
# objects, one per advert. Split over several literals purely for readability; the
# concatenated text (including the trailing space) is unchanged.
ANALYSIS_STR = (
    '[\'{"IDn":IDn,"result": "yes" or "no", '
    '"evidence": ["evidence 1", "evidence 2", ...] or "no evidence", '
    '"confidence": 0.0 to 1.0, '
    '"explanation": "Brief explanation of your reasoning"}\', '
    "'{...}', ...] "
)


In [211]:
# Query engine over the auto-merging retriever. The gpt-4o handle `llm` is passed
# explicitly; without it from_args falls back to the default LLM and `llm` is never used.
query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

# Baseline engine over the plain vector retriever, using the same LLM so that the two
# engines differ only in the retrieval step.
base_query_engine = RetrieverQueryEngine.from_args(base_retriever, llm=llm)

# Ask the question with the output schema appended.
response = query_engine.query(prompt + ANALYSIS_STR)

> Merging 1 nodes into parent node.
> Parent node id: b8bf2895-a7f3-47ff-8c88-6d9d0110f564.
> Parent node text: There's a certificate called RE5.The certificate is in high demand in South Africa. You don't nee...

> Merging 1 nodes into parent node.
> Parent node id: b20cd725-0993-494d-a7e3-d4995af21e8d.
> Parent node text: There's a certificate called RE5.The certificate is in high demand in South Africa. You don't nee...



In [212]:
response.response

'[\'{"IDn":573452,"result":"yes","evidence":["No certificate wanted","Minimum Requirements: Grade 08/12","Physically fit","Able to work long hours"],"confidence":0.9,"explanation":"The job posting explicitly states that no certificate is wanted and lists minimum requirements that do not include qualifications or experience."}\',\n\'{"IDn":573444,"result":"yes","evidence":["No certificate wanted","Minimum Requirements: Grade 08/12","Physically fit","Able to work long hours"],"confidence":0.8,"explanation":"Similar to the previous job posting, this one also clearly states that no certificate is required and focuses on other criteria like physical fitness and ability to work long hours."}\']'

In [40]:
from qdrant_client import QdrantClient, models

In [43]:
# Filter requiring both conditions to match. The original assignment ended with a
# trailing comma (left over from a keyword-argument snippet), which made `scroll_filter`
# a 1-tuple containing the Filter instead of the Filter itself.
# NOTE(review): "city"/"color" do not appear in the advert metadata written in this
# notebook (only IDn / doc_id) — presumably placeholder keys from an example; confirm.
scroll_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="city",
            match=models.MatchValue(value="London"),
        ),
        models.FieldCondition(
            key="color",
            match=models.MatchValue(value="red"),
        ),
    ]
)

In [89]:
len(search_result[0])

36

In [None]:
# Scroll through the "adverts" collection (at most 50 points) and dump the raw result.
search_result = client.scroll(collection_name="adverts", limit=50)
print(search_result)

In [None]:
# Reload the red-flag results and build Documents for the first two rows only.
redflags = pd.read_csv("results/redflag_model_result_2024-08-15T12:14:35_04754c.csv")

# The IDn is written into the metadata twice, under "doc_id" and under "IDn".
documents = [
    Document(
        text=row["advert"],
        metadata={"doc_id": row["IDn"], "IDn": row["IDn"]},
    )
    for _, row in redflags.head(2).iterrows()
]

In [96]:
len(documents)

2

In [178]:
documents[0].id_

'1'

In [97]:
# Same hierarchical parsing step as earlier in the notebook, re-run on whatever
# `documents` currently holds; this rebinds both `node_parser` and `nodes`.
node_parser = HierarchicalNodeParser.from_defaults()
nodes = node_parser.get_nodes_from_documents(documents)
len(nodes)

7

In [99]:
# Add the newly parsed nodes the same way the index was built initially:
# every node (parents included) goes into the docstore so parents can be looked up,
# while only the leaf nodes are embedded into the vector index.
storage_context.docstore.add_documents(nodes)
base_index.insert_nodes(get_leaf_nodes(nodes))

In [90]:
# Plain-dict filter: a single "must" condition matching the key "IDn" against 572469.
# NOTE(review): the name shadows the builtin `filter`, and no visible cell consumes it.
filter = {"must": [{"key": "IDn", "match": {"value": 572469}}]}

In [109]:
# Delete one reference document from the index by a hard-coded id.
# NOTE(review): the id is a literal UUID, presumably copied from an earlier run's
# output — it will not exist after a fresh re-run; confirm before relying on this cell.
base_index.delete_ref_doc(ref_doc_id='3cfb8a6b-86bd-4c53-bb96-0f4dff6efa20')

In [112]:
# `from_documents` is a constructor-style classmethod: it builds a separate index from
# `documents` and does not modify `base_index`. Called on the class to make that explicit.
f = VectorStoreIndex.from_documents(documents)

In [116]:
# NOTE: fails as written — the recorded output is
# "TypeError: delete() missing 1 required positional argument: 'doc_id'".
base_index.delete()

TypeError: delete() missing 1 required positional argument: 'doc_id'

In [None]:
def delete_vectors_by_source_document(
    client,
    collection_name: str,
    source_document_id: str,
    *,
    payload_key: str = "source_document_id",
    **kwargs: Any,
) -> None:
    """Delete every point in a collection whose payload references a given source document.

    Args:
        client: Qdrant client (or compatible object) exposing
            ``delete(collection_name=..., points_selector=..., **kwargs)``.
        collection_name: Name of the collection to delete from.
        source_document_id: Value the payload field must equal for a point to be deleted.
        payload_key: Payload field to match on. Defaults to ``"source_document_id"``;
            pass e.g. ``"IDn"`` to match on a different field.
        **kwargs: Forwarded unchanged to ``client.delete``.
    """
    # The client selects points to delete through `points_selector`, so the match
    # condition is wrapped in a Filter and then in a FilterSelector.
    match_condition = models.FieldCondition(
        key=payload_key,
        match=models.MatchValue(value=source_document_id),
    )
    selector = models.FilterSelector(filter=models.Filter(must=[match_condition]))
    client.delete(collection_name=collection_name, points_selector=selector, **kwargs)

In [125]:
from qdrant_client.models import Filter, FieldCondition, Range

# Filtered similarity search on the "adverts" collection: up to 5 closest points whose
# `rand_number` payload field is >= 3.
# NOTE(review): `query_vector` is not defined in any visible cell, and `rand_number`
# is not among the metadata keys written in this notebook (IDn / doc_id) — this looks
# like an unadapted example snippet; confirm intent before running. The recorded
# output for this cell is a TypeError about a missing `query_vector` argument.
hits = client.search(
    collection_name="adverts",
    query_vector=query_vector,
    query_filter=Filter(
        must=[  # These conditions are required for search results
            FieldCondition(
                key='rand_number',  # Condition based on values of `rand_number` field.
                range=Range(
                    gte=3  # Select only those results where `rand_number` >= 3
                )
            )
        ]
    ),
    limit=5  # Return 5 closest points
)

TypeError: search() missing 1 required positional argument: 'query_vector'

In [None]:
dir(doc)

In [163]:
# Remove the entry for whatever `doc` is currently bound to from the docstore.
storage_context.docstore.delete_document(doc_id=doc.id_)

In [None]:
dir(storage_context.docstore)

In [167]:
len(storage_context.docstore.docs.values())

0

In [None]:
dir(doc)

## Delete all docs:

In [166]:
# Empty the docstore. Iterate over a snapshot (`list(...)`) so the deletions cannot
# disturb the iteration, whether `docs` returns a live mapping or a copy.
for doc in list(storage_context.docstore.docs.values()):
    print(doc)
    storage_context.docstore.delete_document(doc_id=doc.id_)
    

In [172]:
storage_context.docstore.get_all_ref_doc_info()

{}

In [170]:
dir(storage_context)

['__annotations__',
 '__class__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'add_vector_store',
 'docstore',
 'from_defaults',
 'from_dict',
 'graph_store',
 'index_store',
 'persist',
 'property_graph_store',
 'to_dict',
 'vector_store',
 'vector_stores']