In [None]:
# Step 1: Install Dependencies
!pip install transformers PyPDF2

# Step 2: Import Libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from PyPDF2 import PdfReader

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
# Mount Google Drive at /content/drive (helper from the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **Sequence classifier try**

# **Original Un-tuned (Without FAISS)**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from PyPDF2 import PdfReader
import re

# Load tokenizer and model for Legal-BERT.
# `tokenizer`, `model` and `device` are module-level names that calculate_relevance
# (defined later in this cell) reads as globals.
# NOTE(review): a log recorded further down in this notebook reports 'classifier.bias' and
# 'classifier.weight' as newly initialized for this checkpoint, so the single-logit scores
# produced here are presumably not meaningful until the head is trained -- confirm.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/legal-bert-base-uncased",
    num_labels=1  # Assuming regression for relevance scores
)

# Check for GPU availability: use CUDA when torch reports it, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to read the IPC PDF and extract text
def load_ipc_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    Each page's text is followed by a newline and the pages are concatenated
    in document order.

    Args:
        file_path: Path handed to ``PdfReader``.

    Returns:
        str: The concatenated page texts ("" for a PDF without pages).
    """
    reader = PdfReader(file_path)
    # `or ""` keeps a page that yields no text from raising TypeError on the
    # concatenation; join() avoids rebuilding the whole string for every page.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

# Function to split text into manageable chunks
def split_text(text, chunk_size=512):
    """Group sentences of ``text`` into chunks of roughly ``chunk_size`` characters.

    The text is split on a whitespace character that follows a period.
    Sentences are appended to the current chunk while the running length
    stays within ``chunk_size``; otherwise the current chunk is emitted and a
    new one is started. A single sentence longer than ``chunk_size`` is kept
    whole, so such a chunk can exceed the limit.

    Args:
        text: The text to split.
        chunk_size: Target maximum chunk length in characters.

    Returns:
        list[str]: Non-empty, stripped chunks in document order
        (``[]`` for empty or whitespace-only text).
    """
    sentences = re.split(r'(?<=\.)\s', text)  # Split at sentence boundaries
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence + " "
        else:
            # Emit only real content: when the very first sentence is already
            # too long, the pending chunk is still empty.
            pending = current_chunk.strip()
            if pending:
                chunks.append(pending)
            current_chunk = sentence + " "
    # The trailing chunk may consist of padding only (e.g. for empty input).
    tail = current_chunk.strip()
    if tail:
        chunks.append(tail)
    return chunks

# Dummy input (keywords) from a hypothetical RoBERTa model.
# Maps keyword -> score. format_input below only joins the keys, so the scores
# do not currently influence the text that is sent to the model.
query_keywords = {
    "theft": 0.9,
    "punishment": 0.8,
    "Maharashtra": 0.7
}

# Function to format input for Legal-BERT
def format_input(keywords, text_chunks):
    """Prefix every chunk with the space-joined keyword names.

    Args:
        keywords: Mapping whose keys are the query keywords (values are ignored).
        text_chunks: Iterable of chunk strings.

    Returns:
        list[str]: One "<keywords> <chunk>" string per chunk, in input order.
    """
    return [" ".join(keywords.keys()) + " " + chunk for chunk in text_chunks]

# Function to calculate relevance scores
def calculate_relevance(formatted_inputs):
    """Score each input text with the module-level ``model``.

    Every text is tokenized on its own (with truncation), moved to ``device``
    and passed through ``model``; the single logit is read back as a Python
    float.

    Args:
        formatted_inputs: Iterable of strings to score.

    Returns:
        list[float]: One score per input, in input order.
    """
    relevance_scores = []
    # Pure inference: disable autograd so no graph is built and kept alive
    # for every forward pass.
    with torch.no_grad():
        for input_text in formatted_inputs:
            inputs = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to(device)
            outputs = model(**inputs)
            relevance_scores.append(outputs.logits.squeeze().item())
    return relevance_scores

# Extract the IPC text, chunk it, and prefix every chunk with the query keywords
ipc_text = load_ipc_pdf('/content/Indian Penal Code Book.pdf')
ipc_chunks = split_text(ipc_text)
formatted_inputs = format_input(query_keywords, ipc_chunks)

# Score every prefixed chunk
relevance_scores = calculate_relevance(formatted_inputs)

# Pair each chunk with its score, highest score first
results = sorted(
    zip(ipc_chunks, relevance_scores),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print top results
top_n = 5  # Number of top results to display
for i, (chunk, score) in enumerate(results[:top_n]):
    print(f"Result {i+1}:")
    print(f"Score: {score:.4f}")
    print(f"Text: {chunk}\n")


In [None]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
pip install torch transformers PyPDF2 tqdm numpy

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

# **Code to create Embeddings**

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/227.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [None]:
import torch
from sentence_transformers import SentenceTransformer
import pickle
import numpy as np
from tqdm import tqdm

# Load pre-trained SentenceTransformer model.
# Bound to the module-level name `model`, which generate_embeddings below reads as a global.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to load IPC chunks from a pickle file
def load_chunks(file_path):
    """Unpickle and return the object stored at ``file_path``.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(file_path, mode='rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

# Function to generate embeddings using SentenceTransformer
def generate_embeddings(chunks):
    """Encode every chunk with the module-level ``model`` and stack the vectors.

    Chunks are encoded one at a time under a tqdm progress bar; each result is
    moved to the CPU and converted to NumPy before stacking.

    Args:
        chunks: Iterable of chunk strings.

    Returns:
        numpy.ndarray: The per-chunk vectors stacked row-wise, in input order.
    """
    per_chunk_vectors = [
        model.encode(chunk_text, convert_to_tensor=True).cpu().numpy()
        for chunk_text in tqdm(chunks, desc="Generating embeddings")
    ]
    return np.vstack(per_chunk_vectors)

# Main function to process the IPC document and save embeddings
def process_ipc(file_path, output_file):
    """Embed the pickled chunks at ``file_path`` and pickle the result.

    Args:
        file_path: Pickle file holding the chunks (read via ``load_chunks``).
        output_file: Destination pickle for the value returned by
            ``generate_embeddings``.

    Returns:
        None. Prints a confirmation line once the file is written.
    """
    embeddings = generate_embeddings(load_chunks(file_path))

    with open(output_file, 'wb') as embeddings_file:
        pickle.dump(embeddings, embeddings_file)

    print(f"Embeddings saved to {output_file}")

# Example usage: Process IPC document and save embeddings
# NOTE(review): '/content/ipc_chunks_all.pkl' is written by the "Code to create Chunks"
# cell, which appears further down in this notebook -- presumably that cell has to be
# run first; confirm. The output path is relative, while later cells read
# '/content/ipc_embeddings_st.pkl' -- this assumes the working directory is /content.
process_ipc('/content/ipc_chunks_all.pkl', 'ipc_embeddings_st.pkl')


Generating embeddings: 100%|██████████| 2369/2369 [02:24<00:00, 16.39it/s]

Embeddings saved to ipc_embeddings_st.pkl





# **Code to create Chunks**

In [None]:
import os
import re
import pickle
from PyPDF2 import PdfReader

# Function to read the IPC PDF and extract text
def load_ipc_pdf(file_path):
    """Return the text of every readable page of the PDF at ``file_path``.

    Best-effort: if opening the file or extracting a page raises, the error
    is printed and whatever text was collected so far is returned.

    Args:
        file_path: Path handed to ``PdfReader``.

    Returns:
        str: Page texts, each followed by a newline ("" when nothing could be read).
    """
    page_texts = []
    try:
        reader = PdfReader(file_path)
        for page in reader.pages:
            # `or ""`: a page without extractable text must not raise on the
            # concatenation -- the broad handler below would then silently
            # discard every page that follows it.
            page_texts.append((page.extract_text() or "") + "\n")
    except Exception as e:
        print(f"Error reading PDF file: {e}")
    return "".join(page_texts)

# Function to split text into manageable chunks
def split_text(text, max_chunk_size=512, min_chunk_size=256):
    """Split ``text`` into chunks bounded by ``max_chunk_size`` characters.

    The text is split on a whitespace character that follows a period.
    Sentences accumulate in a pending chunk, which is emitted as soon as it
    reaches ``min_chunk_size``. When the next sentence would push the pending
    chunk past ``max_chunk_size``, the pending chunk is emitted as it is (it
    may then be shorter than ``min_chunk_size``) and a sentence longer than
    ``max_chunk_size`` is cut into ``max_chunk_size``-sized pieces.

    Args:
        text: The text to split.
        max_chunk_size: Upper bound used when accumulating and when cutting
            oversized sentences.
        min_chunk_size: Length at which a pending chunk is emitted.

    Returns:
        list[str]: Stripped chunks in document order. A trailing remainder
        shorter than ``min_chunk_size`` is not returned (unchanged behavior).
    """
    # Split at sentence boundaries while respecting chunk size constraints
    sentences = re.split(r'(?<=\.)\s', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + " "
            # Ensure minimum chunk size is maintained
            if len(current_chunk) >= min_chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = ""
        else:
            # Emit what has been accumulated before starting over; without
            # this the pending text was overwritten and lost.
            pending = current_chunk.strip()
            if pending:
                chunks.append(pending)
            # If the current sentence exceeds max_chunk_size, split it forcibly
            while len(sentence) > max_chunk_size:
                chunks.append(sentence[:max_chunk_size].strip())
                sentence = sentence[max_chunk_size:]
            current_chunk = sentence + " "

    # Append the remaining chunk if it meets the minimum size
    if len(current_chunk) >= min_chunk_size:
        chunks.append(current_chunk.strip())

    return chunks

# Main function to process the IPC documents in a folder and save chunks to a single pickle file
def process_ipc_folder(folder_path, output_file):
    """Chunk every PDF in ``folder_path`` and pickle all chunks into ``output_file``.

    Files are visited in sorted name order so the chunk order in the pickle is
    the same on every run, and the ".pdf" extension is matched without regard
    to case. Sub-directories are not descended into.

    Args:
        folder_path: Directory to scan.
        output_file: Destination pickle receiving one flat list of chunks.

    Returns:
        None. Prints one "Processing file" line per PDF.
    """
    all_chunks = []

    # os.listdir returns names in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {file_path}")
        # Load the IPC text and split into chunks
        ipc_text = load_ipc_pdf(file_path)
        all_chunks.extend(split_text(ipc_text))

    # Save all chunks to a pickle file
    with open(output_file, 'wb') as f:
        pickle.dump(all_chunks, f)

# Example usage: Process all PDFs in '/content/Dataset' and save to 'ipc_chunks_all.pkl'.
# The resulting '/content/ipc_chunks_all.pkl' is the file other cells in this notebook
# pass to their load_chunks / process_ipc calls.
process_ipc_folder('/content/Dataset', '/content/ipc_chunks_all.pkl')


Processing file: /content/Dataset/IPC_186045.pdf
Processing file: /content/Dataset/A1860-45.pdf
Processing file: /content/Dataset/1360312590693-12.Cyber-Laws-chapter-in-Legal-Aspects-Book.pdf


# **Importing PKL files - end working model**

In [None]:
# Cell source was deleted; the results shown below are leftover output from an earlier run.

Result 1:
Distance: 266.2577
Text: 24 of 1995, Section 11.

Result 2:
Distance: 272.1407
Text: 28 of 1993, section
2.

Result 3:
Distance: 276.4108
Text: [s 193] Punishment for false evidence.

Result 4:
Distance: 291.9615
Text: [s 192] Fabricating false evidence.

Result 5:
Distance: 303.7898
Text: This was an exception of the general rule
of presumption of innocence of the accused.



[{'distance': 266.25766, 'text': '24 of 1995, Section 11.'},
 {'distance': 272.14072, 'text': '28 of 1993, section\n2.'},
 {'distance': 276.4108, 'text': '[s 193] Punishment for false evidence.'},
 {'distance': 291.9615, 'text': '[s 192] Fabricating false evidence.'},
 {'distance': 303.78976,
  'text': 'This was an exception of the general rule\nof presumption of innocence of the accused.'}]

# **This is the final model; the one below is for experimentation (it has a different output stance).**

# Further **Optimized**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pickle
import faiss
import numpy as np

# Load tokenizer and model for Legal-BERT.
# `tokenizer`, `model` and `device` are module-level names that
# calculate_weighted_query_embedding below reads as globals.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

# Check for GPU availability: use CUDA when torch reports it, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to load chunks from a pickle file
def load_chunks(file_path):
    """Unpickle and return the object stored at ``file_path``.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(file_path, mode='rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

# Function to calculate weighted query embedding
def calculate_weighted_query_embedding(roberta_output):
    """Return the weight-averaged embedding of the given keywords.

    Each keyword is tokenized, run through the module-level ``model`` without
    gradient tracking, and represented by the hidden state at sequence
    position 0. The vectors are combined as ``sum(w_i * v_i) / sum(w_i)``.

    Args:
        roberta_output: Iterable of ``(keyword, weight)`` pairs.

    Returns:
        numpy.ndarray: A 1-D vector, the weighted mean of the keyword vectors.
    """
    def embed_keyword(keyword):
        # Hidden state at position 0 of the last layer, as a NumPy row.
        encoded = tokenizer(keyword, return_tensors='pt', truncation=True, padding=True).to(device)
        with torch.no_grad():
            model_output = model(**encoded)
        return model_output.last_hidden_state[:, 0, :].cpu().numpy()

    pairs = list(roberta_output)
    keyword_vectors = np.vstack([embed_keyword(keyword) for keyword, _ in pairs])
    weight_column = np.array([weight for _, weight in pairs]).reshape(-1, 1)
    return np.sum(keyword_vectors * weight_column, axis=0) / np.sum(weight_column)

# Main function to process the IPC document
def process_ipc():
    """Search the stored chunk embeddings for the chunks nearest to a sample query.

    Loads the pickled chunks and embeddings, builds an in-memory FAISS L2
    index, embeds a hard-coded list of ``(keyword, weight)`` pairs with
    ``calculate_weighted_query_embedding`` and prints the nearest chunks.

    NOTE(review): the embeddings file name matches the output of the
    SentenceTransformer embeddings cell, while the query here is embedded
    with the Legal-BERT model loaded in this cell -- confirm that both come
    from the same model; the checks below fail loudly when they do not line up.

    Returns:
        list[dict] | None: ``{"distance", "text"}`` entries, nearest first, or
        ``None`` when a pickle file is missing or empty.

    Raises:
        AssertionError: If chunk and embedding counts differ, or the query
            vector width differs from the index width.
    """
    try:
        # Load preprocessed chunks and embeddings
        ipc_chunks = load_chunks('/content/ipc_chunks.pkl')
        ipc_embeddings = load_chunks('/content/ipc_embeddings_st.pkl')
        ipc_embeddings = np.vstack(ipc_embeddings)  # Ensure embeddings are a NumPy array
    except (FileNotFoundError, EOFError):
        print("Preprocessed chunks or embeddings not found. Please ensure the files exist.")
        return

    # Row i of the index must describe chunk i, otherwise hits map to the wrong text.
    assert len(ipc_chunks) == ipc_embeddings.shape[0], (
        f"Row mismatch: {len(ipc_chunks)} chunks vs {ipc_embeddings.shape[0]} embeddings"
    )

    # Initialize FAISS index
    dimension = ipc_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(ipc_embeddings.astype('float32'))  # Ensure ipc_embeddings are float32

    # Example input from the RoBERTa model
    roberta_output = [('7', 0.9539), ('ipc', 0.7163), ('section', 0.8221)]
    query_embedding = calculate_weighted_query_embedding(roberta_output)

    # Same check (and message) as the MiniLM variant of this function.
    assert query_embedding.shape[-1] == dimension, f"Dimension mismatch: query ({query_embedding.shape[-1]}) vs index ({dimension})"

    # Search in FAISS index
    k = 5  # Number of top results to retrieve
    query_embedding = query_embedding.reshape(1, -1).astype('float32')  # Ensure query_embedding is float32
    distances, indices = index.search(query_embedding, k)

    # Prepare and return results
    results = []
    for distance, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # FAISS pads with -1 when fewer than k vectors are indexed;
            # ipc_chunks[-1] would silently return the last chunk.
            continue
        result = {
            "distance": distance,
            "text": ipc_chunks[idx]
        }
        results.append(result)
        print(f"Result {len(results)}:")
        print(f"Distance: {result['distance']:.4f}")
        print(f"Text: {result['text']}\n")

    return results

# Run the main processing function.
# As the last expression of the cell, its returned list is also echoed after the printed results.
process_ipc()


Result 1:
Distance: 131.8139
Text: the State within 
which the offender is sentenced.]  
56. [Sentence of Europeans and Americans to penal servitude. Proviso as to sentence for term 
exceeding ten years but not for life .] Rep. by the Criminal Law (Removal of Racial Discriminations) Act, 
1949 (1 7 of 1949) ( w.

Result 2:
Distance: 133.0580
Text: e. f.  6-4-1949).  
57. Fra ctions of terms of punishment .—In calculating fractions of terms  of punishment, 
2[impri sonment] for life shall be reckoned as equivalent to2[imprisonment] for twenty years.  
58. [Offenders sentenced to transportation how dealt with until transported .] Rep.

Result 3:
Distance: 134.9268
Text: by the Trade and Merchandise Marks Act , 1958 (43 of 1958),  s. 135 and 
Sch. (w. e. f.  25-11-1959).  
479. Property mark .—A mark used for denoting that movable  property belongs to a particular 
person is called a property mark.  
480. [Using a false trade mark .] Rep.

Result 4:
Distance: 135.3202
Text: to the will, a

[{'distance': 131.81387,
  'text': 'the State within \nwhich the offender is sentenced.]  \n56. [Sentence of Europeans and Americans to penal servitude. Proviso as to sentence for term \nexceeding ten years but not for life .] Rep. by the Criminal Law (Removal of Racial Discriminations) Act, \n1949 (1 7 of 1949) ( w.'},
 {'distance': 133.05804,
  'text': 'e. f.  6-4-1949).  \n57. Fra ctions of terms of punishment .—In calculating fractions of terms  of punishment, \n2[impri sonment] for life shall be reckoned as equivalent to2[imprisonment] for twenty years.  \n58. [Offenders sentenced to transportation how dealt with until transported .] Rep.'},
 {'distance': 134.92682,
  'text': 'by the Trade and Merchandise Marks Act , 1958 (43 of 1958),  s. 135 and \nSch. (w. e. f.  25-11-1959).  \n479. Property mark .—A mark used for denoting that movable  property belongs to a particular \nperson is called a property mark.  \n480. [Using a false trade mark .] Rep.'},
 {'distance': 135.32018,
  't

In [None]:
import time

# Start the timer.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the code runs.
start = time.perf_counter()

# Code to measure
# Example function or process
def example_process():
    """Stand-in workload: block for two seconds."""
    time.sleep(2)  # Simulate a process that takes 2 seconds

example_process()

# End the timer
end = time.perf_counter()

# Print the elapsed time
print("Time taken: ", end - start, "seconds")


Time taken:  2.0033864974975586 seconds


# **Using MiniLM-L6-V2 model**

In [None]:
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load pre-trained SentenceTransformer model.
# Bound to the module-level name `model`, which calculate_weighted_query_embedding
# below reads as a global.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to load chunks from a pickle file
def load_chunks(file_path):
    """Unpickle and return the object stored at ``file_path``.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(file_path, mode='rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

# Function to calculate weighted query embedding
def calculate_weighted_query_embedding(roberta_output):
    """Return the weight-averaged embedding of the given keywords.

    Each keyword is encoded with the module-level ``model``; the vectors are
    combined as ``sum(w_i * v_i) / sum(w_i)``.

    Args:
        roberta_output: Iterable of ``(keyword, weight)`` pairs.

    Returns:
        numpy.ndarray: A 1-D vector, the weighted mean of the keyword vectors.
    """
    pairs = list(roberta_output)
    keyword_vectors = np.vstack([
        model.encode(keyword, convert_to_tensor=True).cpu().numpy()
        for keyword, _ in pairs
    ])
    weight_column = np.array([weight for _, weight in pairs]).reshape(-1, 1)
    return np.sum(keyword_vectors * weight_column, axis=0) / np.sum(weight_column)

# Main function to process the IPC document
def process_ipc():
    """Search the stored chunk embeddings for the chunks nearest to a sample query.

    Loads the pickled chunks and embeddings, builds an in-memory FAISS L2
    index, embeds a hard-coded list of ``(keyword, weight)`` pairs with
    ``calculate_weighted_query_embedding`` and prints the nearest chunks.

    Returns:
        list[dict] | None: ``{"distance", "text"}`` entries, nearest first, or
        ``None`` when a pickle file is missing or empty.

    Raises:
        AssertionError: If chunk and embedding counts differ, or the query
            vector width differs from the index width.
    """
    try:
        # Load preprocessed chunks and embeddings
        ipc_chunks = load_chunks('/content/ipc_chunks_all.pkl')
        ipc_embeddings = load_chunks('/content/ipc_embeddings_st.pkl')
        ipc_embeddings = np.vstack(ipc_embeddings)  # Ensure embeddings are a NumPy array
    except (FileNotFoundError, EOFError):
        print("Preprocessed chunks or embeddings not found. Please ensure the files exist.")
        return

    # Row i of the index must describe chunk i, otherwise hits map to the wrong text.
    assert len(ipc_chunks) == ipc_embeddings.shape[0], (
        f"Row mismatch: {len(ipc_chunks)} chunks vs {ipc_embeddings.shape[0]} embeddings"
    )

    # Initialize FAISS index
    dimension = ipc_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(ipc_embeddings.astype('float32'))  # Ensure ipc_embeddings are float32

    # Example input from the RoBERTa model
    roberta_output = [('69', 0.8539), ('ipc', 0.7163), ('section', 0.6221)]
    query_embedding = calculate_weighted_query_embedding(roberta_output)

    # Ensure the query_embedding has the same dimension as the FAISS index
    assert query_embedding.shape[0] == dimension, f"Dimension mismatch: query ({query_embedding.shape[0]}) vs index ({dimension})"

    # Search in FAISS index
    k = 5  # Number of top results to retrieve
    query_embedding = query_embedding.reshape(1, -1).astype('float32')  # Ensure query_embedding is float32
    distances, indices = index.search(query_embedding, k)

    # Prepare and return results
    results = []
    for distance, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # FAISS pads with -1 when fewer than k vectors are indexed;
            # ipc_chunks[-1] would silently return the last chunk.
            continue
        result = {
            "distance": distance,
            "text": ipc_chunks[idx]
        }
        results.append(result)
        print(f"Result {len(results)}:")
        print(f"Distance: {result['distance']:.4f}")
        print(f"Text: {result['text']}\n")

    return results

# Run the main processing function.
# As the last expression of the cell, its returned list is also echoed after the printed results.
process_ipc()

Result 1:
Distance: 0.9252
Text: Section 67-A deals with publishing or transmitting of material containi ng sexually explicit act in 
electronic form.  Contents of Section 67 when combined with  the material containing sexually explicit 
material attract penalty under this Section. 
Child Pornography  has been exclusively dealt with under Section 67B.

Result 2:
Distance: 0.9254
Text: Section 69A inserted in the ITAA, vests with the Centra l Government or any of its officers 
with the powers to issue directions for blocking for publi c access of any information through 
any computer resource, under the same circumstances as me ntioned above.

Result 3:
Distance: 0.9610
Text: 376C.  Sexual intercourse by a person in authority.  
376D.  Gang rape . 
376E. Punishment for repeat offenders.  
Of Unnatural offences  
377. Unnatural offences.  
 
CHAPTER XVII  
OF OFFENCES AGAINST PROPERTY  
Of Theft  
378. Theft.  
379. Punishment for theft.

Result 4:
Distance: 1.0146
Text: 68 
 possession 

[{'distance': 0.9251923,
  'text': 'Section 67-A deals with publishing or transmitting of material containi ng sexually explicit act in \nelectronic form.  Contents of Section 67 when combined with  the material containing sexually explicit \nmaterial attract penalty under this Section. \nChild Pornography  has been exclusively dealt with under Section 67B.'},
 {'distance': 0.92542076,
  'text': 'Section 69A inserted in the ITAA, vests with the Centra l Government or any of its officers \nwith the powers to issue directions for blocking for publi c access of any information through \nany computer resource, under the same circumstances as me ntioned above.'},
 {'distance': 0.9610262,
  'text': '376C.  Sexual intercourse by a person in authority.  \n376D.  Gang rape . \n376E. Punishment for repeat offenders.  \nOf Unnatural offences  \n377. Unnatural offences.  \n \nCHAPTER XVII  \nOF OFFENCES AGAINST PROPERTY  \nOf Theft  \n378. Theft.  \n379. Punishment for theft.'},
 {'distance': 1.01

# **Added Embedding Normalization**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pickle
import faiss
import numpy as np
from sklearn.preprocessing import normalize

# Load tokenizer and model for Legal-BERT.
# `tokenizer`, `model` and `device` are module-level names that
# calculate_weighted_query_embedding below reads as globals.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

# Check for GPU availability: use CUDA when torch reports it, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to load chunks from a pickle file
def load_chunks(file_path):
    """Unpickle and return the object stored at ``file_path``.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(file_path, mode='rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

# Function to calculate weighted query embedding
def calculate_weighted_query_embedding(roberta_output):
    """Return the weight-averaged embedding of the given keywords.

    Each keyword is tokenized, run through the module-level ``model`` without
    gradient tracking, and represented by the hidden state at sequence
    position 0. The vectors are combined as ``sum(w_i * v_i) / sum(w_i)``.

    Args:
        roberta_output: Iterable of ``(keyword, weight)`` pairs.

    Returns:
        numpy.ndarray: A 1-D vector, the weighted mean of the keyword vectors.
    """
    def embed_keyword(keyword):
        # Hidden state at position 0 of the last layer, as a NumPy row.
        encoded = tokenizer(keyword, return_tensors='pt', truncation=True, padding=True).to(device)
        with torch.no_grad():
            model_output = model(**encoded)
        return model_output.last_hidden_state[:, 0, :].cpu().numpy()

    pairs = list(roberta_output)
    keyword_vectors = np.vstack([embed_keyword(keyword) for keyword, _ in pairs])
    weight_column = np.array([weight for _, weight in pairs]).reshape(-1, 1)
    return np.sum(keyword_vectors * weight_column, axis=0) / np.sum(weight_column)

# Main function to process the IPC document
def process_ipc():
    """Search L2-normalized chunk embeddings for the chunks nearest to a sample query.

    Loads the pickled chunks and embeddings, L2-normalizes the embedding rows,
    builds an in-memory FAISS L2 index, embeds a hard-coded list of
    ``(keyword, weight)`` pairs with ``calculate_weighted_query_embedding``,
    normalizes the query the same way and prints the nearest chunks.

    NOTE(review): the recorded run of this cell ended in a bare
    AssertionError -- presumably the query vector and the stored embeddings
    have different widths; the explicit check below reports both widths.

    Returns:
        list[dict] | None: ``{"distance", "text"}`` entries, nearest first, or
        ``None`` when a pickle file is missing or empty.

    Raises:
        AssertionError: If chunk and embedding counts differ, or the query
            vector width differs from the index width.
    """
    try:
        # Load preprocessed chunks and embeddings
        ipc_chunks = load_chunks('/content/ipc_chunks.pkl')
        ipc_embeddings = load_chunks('/content/ipc_embeddings.pkl')
        ipc_embeddings = np.vstack(ipc_embeddings)  # Ensure embeddings are a NumPy array

        # Normalize IPC embeddings
        ipc_embeddings = normalize(ipc_embeddings, axis=1, norm='l2')

    except (FileNotFoundError, EOFError):
        print("Preprocessed chunks or embeddings not found. Please ensure the files exist.")
        return

    # Row i of the index must describe chunk i, otherwise hits map to the wrong text.
    assert len(ipc_chunks) == ipc_embeddings.shape[0], (
        f"Row mismatch: {len(ipc_chunks)} chunks vs {ipc_embeddings.shape[0]} embeddings"
    )

    # Initialize FAISS index
    dimension = ipc_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(ipc_embeddings.astype('float32'))  # Ensure ipc_embeddings are float32

    # Example input from the RoBERTa model
    roberta_output = [('146', 0.8539), ('ipc', 0.7163), ('section', 0.6221)]
    query_embedding = calculate_weighted_query_embedding(roberta_output)

    # Same check (and message) as the MiniLM variant of this function.
    assert query_embedding.shape[-1] == dimension, f"Dimension mismatch: query ({query_embedding.shape[-1]}) vs index ({dimension})"

    # Normalize query embedding
    query_embedding = normalize(query_embedding.reshape(1, -1), norm='l2')

    # Search in FAISS index
    k = 5  # Number of top results to retrieve
    query_embedding = query_embedding.astype('float32')  # Ensure query_embedding is float32
    distances, indices = index.search(query_embedding, k)

    # Prepare and return results
    results = []
    for distance, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # FAISS pads with -1 when fewer than k vectors are indexed;
            # ipc_chunks[-1] would silently return the last chunk.
            continue
        result = {
            "distance": distance,
            "text": ipc_chunks[idx]
        }
        results.append(result)
        print(f"Result {len(results)}:")
        print(f"Distance: {result['distance']:.4f}")
        print(f"Text: {result['text']}\n")

    return results

# Run the main processing function.
# NOTE(review): the output recorded for this cell is a bare AssertionError -- see the
# files loaded inside process_ipc above; confirm they match the model used for the query.
process_ipc()


AssertionError: 

# FineTuning on IPC **CodeBook**

In [None]:
# Upgrade accelerate and print the version that is importable afterwards.
# NOTE(review): the fine-tuning cell below still fails with "requires accelerate>=0.21.0"
# although 0.31.0 is printed here -- presumably the runtime has to be restarted after
# the upgrade so transformers picks up the new version; confirm.
!pip install accelerate -U

import accelerate
print(accelerate.__version__)

0.31.0


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from tqdm import tqdm
import pickle
import numpy as np
import os

# Load tokenizer and model for Legal-BERT.
# `tokenizer` and `model` are module-level names used by fine_tune_legal_bert below.
# The log recorded for this cell reports 'classifier.bias' and 'classifier.weight' as
# newly initialized, i.e. the classification head is not part of the checkpoint.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-base-uncased")

# Function to read the pickled IPC chunks (the name is kept for existing callers)
def load_ipc_pdf(file_path):
    """Load the pickled chunks stored at ``file_path``.

    Best-effort: when the file cannot be opened or unpickled, the error is
    printed and an empty list is returned, so the caller never hits an
    unbound local.

    NOTE: unpickling can execute arbitrary code; only load trusted files.

    Args:
        file_path: Path of the pickle file.

    Returns:
        The unpickled object, or ``[]`` when loading failed.
    """
    ipc_chunks = []
    try:
        with open(file_path, 'rb') as f:
            ipc_chunks = pickle.load(f)
    except Exception as e:
        print(f"Error reading pkl file: {e}")
    return ipc_chunks

class _ChunkDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with integer labels, one example per index."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


# Function to fine-tune Legal-BERT on IPC chunks
def fine_tune_legal_bert(ipc_chunks, labels):
    """Fine-tune the module-level ``model`` on ``ipc_chunks`` and save it.

    The chunks are split 80/20 into train/validation sets (``random_state=42``),
    tokenized with truncation and padding, wrapped in datasets that carry a
    ``labels`` field, and trained with ``Trainer`` for the configured number
    of epochs. The model and tokenizer are then written to
    './fine_tuned_legal_bert'.

    Args:
        ipc_chunks: Sequence of chunk strings.
        labels: One integer label per chunk, or ``None`` to fall back to the
            placeholder label 0 for every chunk (training on a single class
            is only a smoke test -- supply real labels for a useful model).

    Raises:
        ValueError: If fewer than two chunks are given, or ``labels`` and
            ``ipc_chunks`` differ in length.
    """
    # Local import: this cell's import block does not bring in scikit-learn.
    from sklearn.model_selection import train_test_split

    ipc_chunks = list(ipc_chunks)
    if len(ipc_chunks) < 2:
        raise ValueError("At least two chunks are required to build train and validation sets.")

    # Only fall back to placeholder labels when none were supplied.
    if labels is None:
        labels = [0] * len(ipc_chunks)
    labels = list(labels)
    if len(labels) != len(ipc_chunks):
        raise ValueError(
            f"Got {len(labels)} labels for {len(ipc_chunks)} chunks; they must match."
        )

    # Prepare training and validation sets
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        ipc_chunks, labels, random_state=42, test_size=0.2
    )

    # Tokenize each split; Trainer needs datasets yielding dicts with
    # input_ids, attention_mask and labels, not bare id tensors.
    train_dataset = _ChunkDataset(
        tokenizer(train_texts, truncation=True, padding=True), train_labels
    )
    val_dataset = _ChunkDataset(
        tokenizer(val_texts, truncation=True, padding=True), val_labels
    )

    # Define training arguments
    training_args = TrainingArguments(
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        evaluation_strategy="epoch",
        logging_dir='./logs',
        logging_steps=100,
        output_dir='./output',
    )

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=None,
    )

    # A single train() call already runs num_train_epochs epochs and shows its
    # own progress; calling it once per epoch would multiply the training.
    trainer.train()

    # Save the fine-tuned model
    output_model_dir = './fine_tuned_legal_bert'
    model.save_pretrained(output_model_dir)
    tokenizer.save_pretrained(output_model_dir)

# Main function to process the IPC document and fine-tune Legal-BERT
def process_ipc(file_path):
    """Load the pickled chunks at ``file_path`` and fine-tune on them.

    Args:
        file_path: Pickle file passed to ``load_ipc_pdf``.

    Returns:
        None.
    """
    # labels=None: no real labels are passed from here.
    fine_tune_legal_bert(load_ipc_pdf(file_path), labels=None)  # Replace with actual labels if available

# Example usage: Process IPC document and fine-tune Legal-BERT.
# The recorded run of this cell stopped with the ImportError shown below the cell.
process_ipc('/content/ipc_chunks.pkl')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`