### CODE TO COMBINE ALL CASE SUMMARIES INTO THE PRE-TRAINING DATASET

In [None]:
import os

# Set your folder path containing the case text files
folder_path = 'G:\\college stuff\\sem6\\NLP project indian\\pre-train-data\\dataset'  # Replace with your actual path
output_file = 'combined_cases.txt'


def combine_cases(folder_path, output_file):
    """Concatenate every ``.txt`` file in *folder_path* into *output_file*.

    Each case is stripped of surrounding whitespace and written between
    ``<START CASE>`` and ``<END CASE>`` marker lines, followed by a blank line.

    Args:
        folder_path: Directory holding one case per ``.txt`` file.
        output_file: Path of the combined file; it is overwritten.

    Returns:
        The number of cases written.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be
            read or written.
    """
    # List the folder BEFORE truncating the output, so a wrong folder path
    # does not wipe a previously generated dataset. os.listdir() returns
    # entries in arbitrary order; sorting makes the dataset reproducible.
    filenames = sorted(os.listdir(folder_path))
    output_key = os.path.normcase(os.path.abspath(output_file))

    written = 0
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            file_path = os.path.join(folder_path, filename)
            # Never feed the combined file back into itself when it happens
            # to live inside the source folder.
            if os.path.normcase(os.path.abspath(file_path)) == output_key:
                continue
            with open(file_path, 'r', encoding='utf-8') as infile:
                case_text = infile.read().strip()
            # An empty file would only add an empty record to the dataset.
            if not case_text:
                continue
            outfile.write("<START CASE>\n")
            outfile.write(case_text + "\n")
            outfile.write("<END CASE>\n\n")
            written += 1
    return written


# In a notebook or when run as a script __name__ is "__main__", so the cell
# still executes; the guard only keeps an import from triggering file I/O.
if __name__ == "__main__":
    combine_cases(folder_path, output_file)
    print(f"All cases have been combined into '{output_file}'.")


### CODE TO SUMMARISE CASE TEXT

In [None]:
import os
from dotenv import load_dotenv
from langchain_community.llms import Ollama
from transformers import BartTokenizer
from langchain.text_splitter import TokenTextSplitter
import ollama
import shutil
import re
import json
from tqdm import tqdm

# Load settings from a local .env file (if present) into the environment.
load_dotenv()

# Tag of the Ollama model used for summarisation.
MODEL = "llama3.1:8b"

# LangChain wrapper around the same model. The visible cells below call
# ollama.chat() directly rather than going through this object.
model = Ollama(model=MODEL)

In [None]:
# Instruction text appended after each case before it is sent to the model.
# It asks for four sections (FACTS, ARGUMENTS, OBSERVATIONS, JUDGMENT), each
# wrapped in XML-like tags, with "Information not available" as the fallback
# for any section the case does not cover.
prompt = """ 
You are a legal assistant AI trained to analyze Indian legal cases. Your task is to extract and present key information from the given case in a structured and organized format.

Analyze the case and return the following sections with appropriate headings:

- FACTS
- ARGUMENTS
- OBSERVATIONS
- JUDGMENT

If any section is missing or unclear, state: "Information not available" under that heading.

Format your response using XML-like tags for each section like this:

<FACTS>
[Extracted facts]
</FACTS>

<ARGUMENTS>
[Extracted arguments]
</ARGUMENTS>

<OBSERVATIONS>
[Extracted observations]
</OBSERVATIONS>

<JUDGMENT>
[Extracted judgment]
</JUDGMENT>

"""

In [None]:
# Raw case files waiting to be summarised.
folder_path = "G:\\college stuff\\sem6\\NLP project indian\\pre-train-data\\new"
# Where the generated summaries are written.
output_folder = "G:\\college stuff\\sem6\\NLP project indian\\pre-train-data\\dataset"
# Where a raw case file is moved once its summary has been written.
move_folder = "G:\\college stuff\\sem6\\NLP project indian\\pre-train-data\\processed"

# Create both destination folders up front so the first write/move cannot
# fail just because a folder is missing.
os.makedirs(output_folder, exist_ok=True)
os.makedirs(move_folder, exist_ok=True)

# Get total number of files for the progress bar
total_files = sum(len(files) for _, _, files in os.walk(folder_path))

file_count = 0
# Using the bar as a context manager closes it even if a file fails part-way.
with tqdm(total=total_files, desc="Processing Files", unit="file") as progress_bar:
    for root, _, files in os.walk(folder_path):
        for filename in files:
            file_path = os.path.join(root, filename)  # Create full file path
            # Show the file being worked on while the (slow) model call runs.
            progress_bar.set_postfix({"Current File": filename})

            with open(file_path, "r", encoding="utf-8") as f:
                case_text = f.read()

            # The whole case is sent in one request. Earlier per-file BART
            # tokenisation / chunking was dropped: its results were never
            # used, yet the tokenizer was reloaded for every file.
            # NOTE(review): very long cases may exceed the model's context
            # window - confirm whether chunked summarisation is needed.
            input_text = f"{case_text}\n\nQuestion: {prompt}\nAnswer:"

            response = ollama.chat(
                model=MODEL,  # single source of truth, defined with the imports
                messages=[{'role': 'user', 'content': input_text}]
            )

            # Swap only the real extension; str.replace() would also rewrite
            # ".txt" inside the name and do nothing for other extensions.
            stem, _ext = os.path.splitext(filename)
            output_file_path = os.path.join(output_folder, f"{stem}_pretrain.txt")
            with open(output_file_path, "w", encoding="utf-8") as out_file:
                out_file.write(response['message']['content'])

            # Move the source only after its summary is on disk, so an
            # interrupted run can simply be restarted on what is left.
            destination_path = os.path.join(move_folder, filename)
            shutil.move(file_path, destination_path)

            progress_bar.update(1)
            file_count += 1

print(f"Completed processing {file_count} files.")

### Creating Pre-training dataset

In [None]:
import os

# Set your folder path containing the case text files
folder_path = 'G:\\college stuff\\sem6\\NLP project indian\\pre-train-data\\dataset'  # Replace with your actual path
output_file = 'combined_cases.txt'


def combine_cases(folder_path, output_file):
    """Build the pre-training corpus from the ``.txt`` files in *folder_path*.

    Every non-empty file is stripped of surrounding whitespace and appended
    to *output_file* between ``<START CASE>`` and ``<END CASE>`` marker
    lines, followed by a blank line.

    Args:
        folder_path: Directory holding one case per ``.txt`` file.
        output_file: Path of the combined file; it is overwritten.

    Returns:
        The number of cases written.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be
            read or written.
    """
    # Read the directory first: if the path is wrong we fail before the
    # existing output is truncated. Sorting fixes the otherwise arbitrary
    # os.listdir() order so repeated runs produce the same file.
    candidates = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.txt')
    )
    output_key = os.path.normcase(os.path.abspath(output_file))

    written = 0
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for filename in candidates:
            file_path = os.path.join(folder_path, filename)
            # Skip the combined file itself if it lives in the source folder.
            if os.path.normcase(os.path.abspath(file_path)) == output_key:
                continue
            with open(file_path, 'r', encoding='utf-8') as infile:
                case_text = infile.read().strip()
            # Empty files would only contribute empty records.
            if not case_text:
                continue
            outfile.write("<START CASE>\n")
            outfile.write(case_text + "\n")
            outfile.write("<END CASE>\n\n")
            written += 1
    return written


# __name__ is "__main__" in a notebook or script, so this still runs there;
# the guard only prevents file I/O when the code is imported.
if __name__ == "__main__":
    combine_cases(folder_path, output_file)
    print(f"All cases have been combined into '{output_file}'.")
