In [26]:
import pandas as pd
import os
from dotenv import load_dotenv
from pinecone import Pinecone
from pinecone import ServerlessSpec
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from langchain_pinecone import PineconeVectorStore
import getpass
from uuid import uuid4


In [2]:
# Pinecone client
if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

In [52]:
index_name = "legal-cases" # Replace the name with anything you like 
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [53]:
index = pc.Index(index_name)
embeddings = OpenAIEmbeddings()
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [54]:
df = pd.read_csv("cases.csv")
df.head()

Unnamed: 0,cases
0,
1,",}/Judgmenl Sheet\nIN THE PBSHAWAR HIGH COURT,..."
2,"1Judgment Sheet\nPESHAWAR HIGH COURT, ABBOTTAB..."
3,"JUDGMENT SHEET IN THE PESHAWAR HIGH COURT, MIN..."
4,"JUDGMENT SHEET PESHAWAR HIGH COURT, PESHAWAR (..."


In [55]:
cases = list(df["cases"])[1:]

In [56]:
len(cases)

990

In [57]:
docs = [Document(page_content=case[:40000] if len(case) > 40000 else case,metadata={id:i}) for i,case in enumerate(cases)] # Length that pinecone does not allow

In [58]:
for i in range(len(docs)):
    docs[i].metadata = {
        "source": f"case_{i}",
        "summary": docs[i].page_content[:500]  # Keep summary short
    }


In [59]:
import json

def is_valid_document(doc):
    metadata_size = len(json.dumps(doc.metadata).encode('utf-8'))
    content_size = len(doc.page_content.encode('utf-8'))
    total_size = metadata_size + content_size
    return total_size < 40960


In [60]:
docs = [doc for doc in docs if is_valid_document(doc)]
uuids = [str(uuid4()) for _ in range(len(docs))]


In [61]:
print(len(docs)) # Should be equal to the one we are giving

990


In [62]:
uuids = [str(uuid4()) for _ in range(len(docs))]
batch_size = 5
for i in range(0, len(docs), batch_size):
    print("Current Batch Index is:",i)
    batch = docs[i:i+batch_size]
    batch_ids = uuids[i:i+batch_size]
    vector_store.add_documents(batch,ids=batch_ids)

Current Batch Index is: 0
Current Batch Index is: 5
Current Batch Index is: 10
Current Batch Index is: 15
Current Batch Index is: 20
Current Batch Index is: 25
Current Batch Index is: 30
Current Batch Index is: 35
Current Batch Index is: 40
Current Batch Index is: 45
Current Batch Index is: 50
Current Batch Index is: 55
Current Batch Index is: 60
Current Batch Index is: 65
Current Batch Index is: 70
Current Batch Index is: 75
Current Batch Index is: 80
Current Batch Index is: 85
Current Batch Index is: 90
Current Batch Index is: 95
Current Batch Index is: 100
Current Batch Index is: 105
Current Batch Index is: 110
Current Batch Index is: 115
Current Batch Index is: 120
Current Batch Index is: 125
Current Batch Index is: 130
Current Batch Index is: 135
Current Batch Index is: 140
Current Batch Index is: 145
Current Batch Index is: 150
Current Batch Index is: 155
Current Batch Index is: 160
Current Batch Index is: 165
Current Batch Index is: 170
Current Batch Index is: 175
Current Batch 

In [63]:
res = vector_store.similarity_search("Case by Muhammad Ismail",k=1)

In [67]:
print(res[0].page_content)

JUDGMENT SHEET IN THE PESHAWAR HIGH COURT, D.LL.KHAN BENCH (Judicial Department)
C.M.No.42-D of 2023 with C.M.Nos.43 & 194-D of 2023 Muhammad Ismail and anotherVersus
Muhammad Aslam (deceased)
though L.Rs and others JUDGMENT. For petitioners: Muhammad Mohsin Ali, Advocate. For respondents: M/S Ahmad Ali Khan and SalimullahKhan Ranazai, Advocates. Date of hearing 21.02.2024 FAZAL SUBHAN, J.- Through the instant petition filed under section 12(2) C.P.C, the petitioners have called in question the judgment of this Court dated 19.9.2022,rendered in C.R.No.89-D of 2022.2. Relevant facts leading to the instant petition are that Abdul Razaq (respondent No.4 herein) had filed a suit for declaration and permanent injunction in respect of the property detailed in the plaint, which was decreed in his favour vide judgment and decree dated 29.4.2011. The appeal filed thereagainst was dismissed vide judgment and decree dated 08.4.2012 of learned Additional District Judge-V, D.I.Khan. The revision pe