In [1]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams,PointStruct
import pandas as pd
import json
import torch
# Read the notebook settings. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default encoding.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

# Base location of the data files. It is concatenated directly with file
# names further down, so it has to end with a path separator.
data_path = config['data_path']


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

# Load the tokenizer of the sentence-transformer model
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small")
def get_length(sentence):
    """Return the number of token ids the tokenizer produces for ``sentence``.

    Truncation is switched off, so the count covers the whole input; the
    result is simply the length of what ``tokenizer.encode`` returns.
    """
    return len(tokenizer.encode(sentence, truncation=False))
# Splitter whose length function is derived from the Hugging Face tokenizer;
# configured with chunk_size=300 and no overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer=tokenizer,
    chunk_size=300,
    chunk_overlap=0,
)

def advance_split(docs: str, min_length: int = 100, max_length: int = 400):
    """Split a document into chunks, merging fragments that are too short.

    Documents whose length (as measured by ``get_length``) does not exceed
    ``max_length`` are returned whole. Longer ones are cut with ``splitter``
    and then post-processed so that short pieces get attached to a neighbour.
    Merged pieces are concatenated directly; no separator is inserted.

    Args:
        docs: Text of one document.
        min_length: Length, per ``get_length``, below which a chunk is
            considered too short to stand on its own.
        max_length: Documents at or below this length are not split at all.

    Returns:
        A non-empty list of chunk strings.
    """
    if get_length(docs) <= max_length:
        return [docs]

    initial_chunks = splitter.split_text(docs)
    if not initial_chunks:
        # Nothing came back from the splitter; keep the document as it is
        # rather than dropping it.
        return [docs]

    # Fold a too-short trailing chunk into its predecessor. This needs a
    # predecessor to exist, and the tail must be dropped by position:
    # list.remove() deletes the first *equal* element, which is the wrong
    # chunk whenever the same text occurs earlier in the document.
    if len(initial_chunks) >= 2 and get_length(initial_chunks[-1]) < min_length:
        tail = initial_chunks.pop()
        initial_chunks[-1] += tail

    # Forward pass: keep appending chunks to the running one until it reaches
    # min_length, then emit it and start over with the current chunk.
    combined_chunks = []
    current_chunk = ""
    for chunk in initial_chunks:
        if get_length(current_chunk) < min_length:
            current_chunk += chunk
        else:
            combined_chunks.append(current_chunk)
            current_chunk = chunk
    if current_chunk:  # Add any remaining chunk
        combined_chunks.append(current_chunk)
    return combined_chunks

corpus = pd.read_csv(data_path+'corpus.csv')

# Expand the corpus so that every chunk of the 'text' column becomes its own
# row. advance_split is mapped over the column and the resulting lists are
# exploded, which replaces the row-by-row iterrows()/Series.copy() loop. All
# other columns are repeated unchanged for each chunk of their source row,
# and the original index labels are kept (repeated per chunk).
expanded_df = corpus.assign(text=corpus['text'].map(advance_split)).explode('text')

expanded_df.to_csv(data_path+'expended_corpus_2.csv', index=False)
# Display the expanded DataFrame
expanded_df.head()

In [3]:

# Name of the collection this notebook works with.
collection_name = "legal_hackthon_2024_ver3"

# Qdrant client backed by the path data_path + 'qdrant_db'.
client = QdrantClient(path=data_path + 'qdrant_db')

# Create the collection only when it is missing; an existing one is reused.
# The vector size of 384 has to match the embeddings inserted later
# (presumably the output size of the embedding model; verify).
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=384,
            distance=Distance.COSINE,
        ),
    )

# Show the known collections and how many points the target one holds.
print(client.get_collections())
number_points = client.count(collection_name).count
print(number_points)
# The client is intentionally left open (no client.close() here) because
# later cells keep using it.

collections=[CollectionDescription(name='legal_hackthon_2024_ver1'), CollectionDescription(name='legal_hackthon_2024_ver2'), CollectionDescription(name='legal_hackthon_2024_ver3')]
0


In [6]:
# Load the chunked corpus CSV and turn it into one dict per row.
df = pd.read_csv(data_path+'expended_corpus_2.csv')
data_list_of_dicts = df.to_dict(orient='records')
print(data_list_of_dicts[0])

# Load the precomputed embeddings (presumably one vector per corpus row, in
# the same order; verify against the script that produced the file).
# Forward slashes keep the relative path valid on both Windows and POSIX,
# and the encoding is given explicitly rather than taken from the locale.
with open(data_path+'finetune results/run 5/corpus_embeddings.json', 'r', encoding='utf-8') as file:
    corpus_embeddings = json.load(file)

{'text': 'Thông tư này hướng dẫn tuần tra, canh gác bảo vệ đê Điều trong mùa lũ đối với các tuyến đê sông được phân loại, phân cấp theo quy định tại Điều 4 của Luật Đê Điều.', 'cid': 0}


In [9]:
def insert_qdrant(part, start_index=None):
    """Upsert one batch of embedding vectors, with payloads, into the collection.

    The i-th vector of ``part`` is stored as point ``start_index + i`` and gets
    the ``cid`` and ``text`` of ``data_list_of_dicts[start_index + i]`` as its
    payload.

    Args:
        part: Iterable of embedding vectors forming one contiguous batch.
        start_index: Position in ``data_list_of_dicts`` that belongs to the
            first vector of ``part``. When omitted, the current point count
            of the collection is used, which is only correct if the
            collection holds exactly the rows preceding this batch. Pass it
            explicitly to make re-runs idempotent.

    Raises:
        IndexError: If the batch would run past the end of
            ``data_list_of_dicts``. Nothing is written in that case.
    """
    part = list(part)
    if start_index is None:
        start_index = client.count(collection_name).count
    print(start_index)

    # Fail before building anything when vectors and corpus rows cannot line
    # up, instead of hitting a bare IndexError halfway through the batch.
    end_index = start_index + len(part)
    if end_index > len(data_list_of_dicts):
        raise IndexError(
            f'batch covers rows {start_index}..{end_index - 1} but only '
            f'{len(data_list_of_dicts)} corpus rows are loaded'
        )

    points = []
    for position, vec in enumerate(part, start=start_index):
        row = data_list_of_dicts[position]
        points.append(
            PointStruct(
                id=position,
                vector=vec,
                payload={"cid": row['cid'], 'text': row['text']},
            )
        )

    client.upsert(
        collection_name=collection_name,
        points=points,
        wait=True
    )
    print('insert embedding part successfully!')
# Upload the embedding list in about ten batches. The "+ 1" rounds the batch
# size up, so ten batches are enough even when the length is not a multiple
# of ten.
batch_size = len(corpus_embeddings) // 10 + 1

# Walk the list one batch-sized slice at a time and insert each slice.
for i in range(0, len(corpus_embeddings), batch_size):
    batch = corpus_embeddings[i:i + batch_size]
    insert_qdrant(batch)
    


0
insert embedding part successfully!
39823
insert embedding part successfully!
79646
insert embedding part successfully!
119469
insert embedding part successfully!
159292
insert embedding part successfully!
199115
insert embedding part successfully!
238938
insert embedding part successfully!
278761
insert embedding part successfully!
318584
insert embedding part successfully!
358407
insert embedding part successfully!


In [None]:
# Sanity check of the upload: search with the first corpus embedding and look
# at the best hit together with its stored vector. client.search needs a
# query vector; without one the call fails. If ids and embeddings line up,
# the hit should be point 0 (TODO confirm against the displayed result).
# limit=1 keeps the displayed output down to a single point.
apoint = client.search(
    collection_name,
    query_vector=corpus_embeddings[0],
    limit=1,
    with_vectors=True,
)
apoint

[ScoredPoint(id=0, version=0, score=1.0, payload={'cid': 0, 'text': {'text': 'Thông tư này hướng dẫn tuần tra, canh gác bảo vệ đê Điều trong mùa lũ đối với các tuyến đê sông được phân loại, phân cấp theo quy định tại Điều 4 của Luật Đê Điều.', 'cid': 0}}, vector=[-0.07376737305990994, 0.07594932915041865, -0.00795849733012271, -0.045863436902439804, 0.018912133784485387, 0.045990856907725246, -0.11175318463558163, -0.016400060680283258, 0.03240189134404772, 0.11964536996295369, -0.012983076538544932, 0.04962894905863525, 0.019958554827891488, 0.008813494365588443, -0.01924212679817367, 0.04695389194767245, -0.007055970692685424, 0.05921053245608443, -0.03256176535067938, -0.06358898263770489, -0.0020560436852857873, -0.0069439464880386, 0.06745852279821547, 0.06107506753342634, 0.023390898970266954, -0.06624737274797632, 0.06403059765602331, -0.050221662083221305, -0.07251709300804766, -0.045961881906523346, 0.00016942553702786156, 0.05743306038235395, -0.024703421024711066, -0.0186190