In [81]:
from sentence_transformers import SentenceTransformer
import spacy
import pandas as pd
import boto3
from io import StringIO
import datetime
import os

In [62]:
# Load the spaCy "en_core_web_sm" pipeline once; `nlp` is applied to each
# case's text further down to split it into sentences via `doc.sents`.
nlp = spacy.load("en_core_web_sm")

In [54]:
# Credentials come from the environment (never hard-coded); a missing variable
# raises KeyError here rather than failing later on the first S3 call.
_aws_credentials = {
    "aws_access_key_id": os.environ["AWS_ACCESS_KEY"],
    "aws_secret_access_key": os.environ["AWS_SECRET_ACCESS_KEY"],
}

# Low-level client: used for listing and uploading objects.
s3 = boto3.client("s3", **_aws_credentials)

# Resource interface: used for reading object bodies.
s3_view = boto3.resource("s3", **_aws_credentials)

# Bucket name and the base key prefix that the date is appended to.
s3_bucket = "digital-adhivakta"
prefix = "Cases/date="

In [55]:
# Ask for the date to process; an empty answer falls back to today's date.
DATE_FORMAT = "%Y%m%d"
default = datetime.datetime.now().strftime(DATE_FORMAT)

date = input("Enter date (%Y%m%d):").strip()
if not date:
    date = default

# Parse and re-format so that malformed input raises ValueError here instead
# of producing a bad S3 prefix later.
parsed_date = datetime.datetime.strptime(date, DATE_FORMAT)
date_formatted = parsed_date.strftime(DATE_FORMAT)
print(date_formatted)

20240202


In [56]:
# Build the dated prefix from the literal base rather than appending to
# `prefix` itself: the previous `prefix = prefix + date_formatted` was not
# idempotent, so re-running this cell appended the date a second time.
# Keep the base string in sync with the one set alongside `s3_bucket`.
prefix = f"Cases/date={date_formatted}"
prefix

'Cases/date=20240202'

In [57]:
# List every object under the dated prefix. A single list_objects_v2 call
# returns at most 1,000 keys, so walk all result pages with a paginator
# instead of silently truncating large partitions.
# `response` keeps a "Contents" list so the next cell's
# `response.get("Contents", [])` works unchanged.
paginator = s3.get_paginator("list_objects_v2")
response = {
    "Contents": [
        s3_object
        for page in paginator.paginate(Bucket=s3_bucket, Prefix=prefix)
        for s3_object in page.get("Contents", [])
    ]
}

In [48]:
# Keep only the keys of ".txt" objects from the listing; an empty listing
# (no "Contents" entry) simply yields an empty list.
files = response.get("Contents", [])
cases = [
    entry.get("Key")
    for entry in files
    if entry.get("Key").endswith(".txt")
]

cases

['Cases/date=20240202/W15582024.txt', 'Cases/date=20240202/W98872019.txt']

In [72]:
# Build one record per case file: download the text from S3, split it into
# sentences with spaCy, and keep the sentence list next to the S3 key and the
# selected date.
case_data = []
for case in cases:
    # Read the whole object body and decode it as UTF-8 text.
    s3_object = s3_view.Object(s3_bucket, case)
    case_content = s3_object.get()["Body"].read().decode("utf-8")

    # spaCy sentence segmentation; only the sentence text is kept.
    sentences = [span.text for span in nlp(case_content).sents]

    record = {
        "case_no": case,
        "datedate_of_judgement": date_formatted,
        "tokens": sentences,
    }
    case_data.append(record)

# One row per case, with columns in the order of the record keys above.
case_df = pd.DataFrame(case_data)

                             case_no datedate_of_judgement  \
0  Cases/date=20240202/W15582024.txt              20240202   
1  Cases/date=20240202/W98872019.txt              20240202   

                                              tokens  
0  [* IN THE HIGH COURT OF DELHI AT NEW DELHI + W...  
1  [* IN THE HIGH COURT OF DELHI AT NEW DELHI % R...  


In [74]:
# Write a local copy of the case table. Create the output directory first so
# the write does not fail on a fresh checkout where "tokens/" does not exist.
os.makedirs("tokens", exist_ok=True)
case_df.to_csv(f"tokens/date={date_formatted}.csv", index=False)

In [84]:
# Serialise the case table to CSV text in memory. With no target given,
# to_csv returns the CSV as a string; the earlier form passed a buffer, so
# `case_csv` was always None, and the seek(0) before getvalue() had no effect.
case_csv = case_df.to_csv(index=False)

# Upload the CSV, encoded explicitly as UTF-8 bytes.
# NOTE(review): this key uses "date_" while the input prefix and the local
# file use "date=" - confirm which naming is intended before changing it.
s3.put_object(
    Body=case_csv.encode("utf-8"),
    Bucket=s3_bucket,
    Key=f"Tokens/date_{date_formatted}.csv",
)

{'ResponseMetadata': {'RequestId': 'XREB1X1DYPFE5NXY',
  'HostId': '/fo1J3Nu8aInzMFnGw9PJA5EGzz3dXjWaXTbIcjeBhv3cziK9cPcLFS4eAf200KZ/ugVs/gbe1g=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '/fo1J3Nu8aInzMFnGw9PJA5EGzz3dXjWaXTbIcjeBhv3cziK9cPcLFS4eAf200KZ/ugVs/gbe1g=',
   'x-amz-request-id': 'XREB1X1DYPFE5NXY',
   'date': 'Mon, 05 Feb 2024 10:58:17 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"599133805d45af48aaa121f81f6b90cc"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"599133805d45af48aaa121f81f6b90cc"',
 'ServerSideEncryption': 'AES256'}

In [77]:
# Show the sentence list stored for the first case (row label 0).
first_case_sentences = case_df.loc[0, "tokens"]
print(first_case_sentences)

['* IN THE HIGH COURT OF DELHI AT NEW DELHI + W.P.(C) 1558/2024 and CM APPL.', '6461/2024, CM APPL.', '6462/2024 MAYAPURI CETP SOCIETY .....', 'Petitioner Through: Mr. Siddhartha Iyer and Mr. Deevanshu Sharma, Advocates versus THE SPECIAL COMMISSIONER OFINDUSTRY CUM APPROPRIATE AUTHORITY CETP AND ORS .....', "Respondents Through: Ms. Sangeeta Bharti, Standing Counsel with Ms. Malvi Balyan and Ms. Aarushi Bhel, Advocates for Delhi Jal Board % Date of Decision: 2nd February, 2024 CORAM: HON'BLE THE ACTING CHIEF JUSTICE HON'BLE MS.", 'JUSTICE MANMEET PRITAM SINGH ARORA JUDGMENT MANMOHAN, ACJ: (ORAL) 1.', 'Present petition has been filed seeking quashing of the notification No. F1/CI/OSD/Transfer of CETPs/2021-22/4570-83 dated 1st January, 2024 issued by Respondent No. 1 (�Impugned Notification�).', 'The Common Effluent Treatment Plant managed by Petitioner No. 1 is enlisted at Serial No. 6 of the impugned notification.', '2. Learned counsel for the Petitioner states that the Petitioner So

Embeddings


In [79]:
# Take the sentence list stored at row label 1 of the case table.
sentences = case_df.loc[1, "tokens"]

# Load the sentence-transformers model and encode every sentence in one call.
model = SentenceTransformer(
    "sentence-transformers/all-mpnet-base-v2"
)
embeddings = model.encode(sentences)
print(embeddings)

[[ 6.0616452e-02  2.6211658e-02 -9.4094556e-03 ... -1.1399918e-02
  -7.0926949e-02 -1.1393674e-02]
 [-6.8027782e-03 -4.9947020e-02  6.0542366e-03 ... -2.2570305e-02
  -5.0657842e-02 -4.6072807e-02]
 [-9.6992906e-03 -5.0415434e-02  7.8566764e-03 ... -2.2679275e-02
  -5.2762594e-02 -4.9357846e-02]
 ...
 [ 3.0448144e-02 -4.4737093e-02  2.9214546e-03 ... -2.6977085e-02
   5.3604995e-03  7.8415517e-03]
 [ 5.3444795e-02  1.8331602e-02 -2.6015736e-02 ... -4.2434219e-02
  -8.6601794e-02 -6.8801230e-05]
 [ 2.5921328e-02 -4.3619342e-02  7.9045873e-03 ... -2.1928374e-03
  -3.0777067e-02 -2.7946236e-02]]
