In [None]:
# Step 1: Install Dependencies
!pip install transformers PyPDF2

# Step 2: Import Libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from PyPDF2 import PdfReader

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **Sequence classifier try**

In [None]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
pip install torch transformers PyPDF2 tqdm numpy

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

# **# Code to create Embeddings**

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/227.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [None]:
import torch
from sentence_transformers import SentenceTransformer
import pickle
import numpy as np
from tqdm import tqdm

# Load pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to load IPC chunks from a pickle file
def load_chunks(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Function to generate embeddings using SentenceTransformer
def generate_embeddings(chunks):
    embeddings = []
    for chunk in tqdm(chunks, desc="Generating embeddings"):
        embedding = model.encode(chunk, convert_to_tensor=True).cpu().numpy()
        embeddings.append(embedding)
    return np.vstack(embeddings)

# Main function to process the IPC document and save embeddings
def process_ipc(file_path, output_file):
    # Load IPC chunks
    ipc_chunks = load_chunks(file_path)

    # Generate embeddings for the chunks
    embeddings = generate_embeddings(ipc_chunks)

    # Save embeddings to a pickle file
    with open(output_file, 'wb') as f:
        pickle.dump(embeddings, f)

    print(f"Embeddings saved to {output_file}")

# Example usage: Process IPC document and save embeddings
process_ipc('/content/ipc_chunks_all.pkl', 'ipc_embeddings_st.pkl')


Generating embeddings: 100%|██████████| 2369/2369 [02:24<00:00, 16.39it/s]

Embeddings saved to ipc_embeddings_st.pkl





# **Code to create Chunks**

In [None]:
import os
import re
import pickle
from PyPDF2 import PdfReader

# Function to read the IPC PDF and extract text
def load_ipc_pdf(file_path):
    text = ""
    try:
        reader = PdfReader(file_path)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error reading PDF file: {e}")
    return text

# Function to split text into manageable chunks
def split_text(text, max_chunk_size=512, min_chunk_size=256):
    # Split at sentence boundaries while respecting chunk size constraints
    sentences = re.split(r'(?<=\.)\s', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + " "
            # Ensure minimum chunk size is maintained
            if len(current_chunk) >= min_chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = ""
        else:
            # If the current sentence exceeds max_chunk_size, split it forcibly
            while len(sentence) > max_chunk_size:
                chunks.append(sentence[:max_chunk_size].strip())
                sentence = sentence[max_chunk_size:]
            current_chunk = sentence + " "

    # Append the remaining chunk if it meets the minimum size
    if len(current_chunk) >= min_chunk_size:
        chunks.append(current_chunk.strip())

    return chunks

# Main function to process the IPC documents in a folder and save chunks to a single pickle file
def process_ipc_folder(folder_path, output_file):
    all_chunks = []

    # Iterate through all PDF files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)
            print(f"Processing file: {file_path}")
            # Load the IPC text and split into chunks
            ipc_text = load_ipc_pdf(file_path)
            ipc_chunks = split_text(ipc_text)
            all_chunks.extend(ipc_chunks)

    # Save all chunks to a pickle file
    with open(output_file, 'wb') as f:
        pickle.dump(all_chunks, f)

# Example usage: Process all PDFs in '/content/Dataset' and save to 'ipc_chunks_all.pkl'
process_ipc_folder('/content/Dataset', '/content/ipc_chunks_all.pkl')


Processing file: /content/Dataset/IPC_186045.pdf
Processing file: /content/Dataset/A1860-45.pdf
Processing file: /content/Dataset/1360312590693-12.Cyber-Laws-chapter-in-Legal-Aspects-Book.pdf


# **Using MiniLM-L6-V2 model**

In [59]:
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to load chunks from a pickle file
def load_chunks(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Function to calculate weighted query embedding
def calculate_weighted_query_embedding(roberta_output):
    embeddings = []
    weights = []
    for keyword, weight in roberta_output:
        embedding = model.encode(keyword, convert_to_tensor=True).cpu().numpy()
        embeddings.append(embedding)
        weights.append(weight)
    embeddings = np.vstack(embeddings)
    weights = np.array(weights).reshape(-1, 1)
    weighted_embedding = np.sum(embeddings * weights, axis=0) / np.sum(weights)
    return weighted_embedding

# Main function to process the IPC document
def process_ipc():
    try:
        # Load preprocessed chunks and embeddings
        ipc_chunks = load_chunks('/content/ipc_chunks_all.pkl')
        ipc_embeddings = load_chunks('/content/ipc_embeddings_st.pkl')
        ipc_embeddings = np.vstack(ipc_embeddings)  # Ensure embeddings are a NumPy array
    except (FileNotFoundError, EOFError):
        print("Preprocessed chunks or embeddings not found. Please ensure the files exist.")
        return

    # Initialize FAISS index
    dimension = ipc_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(ipc_embeddings.astype('float32'))  # Ensure ipc_embeddings are float32

    # Example input from the RoBERTa model
    roberta_output = [('67', 0.8539), ('ipc', 0.7163), ('section', 0.6221)]
    query_embedding = calculate_weighted_query_embedding(roberta_output)

    # Ensure the query_embedding has the same dimension as the FAISS index
    assert query_embedding.shape[0] == dimension, f"Dimension mismatch: query ({query_embedding.shape[0]}) vs index ({dimension})"

    # Search in FAISS index
    k = 5  # Number of top results to retrieve
    query_embedding = query_embedding.reshape(1, -1).astype('float32')  # Ensure query_embedding is float32
    distances, indices = index.search(query_embedding, k)

    # Prepare and return results
    results = []
    for i, idx in enumerate(indices[0]):
        result = {
            "distance": distances[0][i],
            "text": ipc_chunks[idx]
        }
        results.append(result)
        print(f"Result {i+1}:")
        print(f"Distance: {result['distance']:.4f}")
        print(f"Text: {result['text']}\n")

    return results

# Run the main processing function
process_ipc()

Result 1:
Distance: 0.9334
Text: Section 67-A deals with publishing or transmitting of material containi ng sexually explicit act in 
electronic form.  Contents of Section 67 when combined with  the material containing sexually explicit 
material attract penalty under this Section. 
Child Pornography  has been exclusively dealt with under Section 67B.

Result 2:
Distance: 0.9588
Text: Section 69A inserted in the ITAA, vests with the Centra l Government or any of its officers 
with the powers to issue directions for blocking for publi c access of any information through 
any computer resource, under the same circumstances as me ntioned above.

Result 3:
Distance: 0.9891
Text: 376C.  Sexual intercourse by a person in authority.  
376D.  Gang rape . 
376E. Punishment for repeat offenders.  
Of Unnatural offences  
377. Unnatural offences.  
 
CHAPTER XVII  
OF OFFENCES AGAINST PROPERTY  
Of Theft  
378. Theft.  
379. Punishment for theft.

Result 4:
Distance: 0.9924
Text: Ins. by Act 43 o

[{'distance': 0.93342936,
  'text': 'Section 67-A deals with publishing or transmitting of material containi ng sexually explicit act in \nelectronic form.  Contents of Section 67 when combined with  the material containing sexually explicit \nmaterial attract penalty under this Section. \nChild Pornography  has been exclusively dealt with under Section 67B.'},
 {'distance': 0.95875084,
  'text': 'Section 69A inserted in the ITAA, vests with the Centra l Government or any of its officers \nwith the powers to issue directions for blocking for publi c access of any information through \nany computer resource, under the same circumstances as me ntioned above.'},
 {'distance': 0.98911446,
  'text': '376C.  Sexual intercourse by a person in authority.  \n376D.  Gang rape . \n376E. Punishment for repeat offenders.  \nOf Unnatural offences  \n377. Unnatural offences.  \n \nCHAPTER XVII  \nOF OFFENCES AGAINST PROPERTY  \nOf Theft  \n378. Theft.  \n379. Punishment for theft.'},
 {'distance': 0.

# Time **Report**

In [60]:
import time

# Start the timer
start = time.time()

# Code to measure
# Example function or process
def example_process():
    time.sleep(2)  # Simulate a process that takes 2 seconds

example_process()

# End the timer
end = time.time()

# Print the elapsed time
print("Time taken: ", end - start, "seconds")


Time taken:  2.002437114715576 seconds


# FineTuning on IPC **CodeBook (Pending)**