In [12]:
import os
import torch

# Choose the compute device: the first CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU.
if not torch.cuda.is_available():
    # CPU fallback
    device = torch.device("cpu")
    print("No GPU available, using CPU")
else:
    # Make GPU 0 the current CUDA device, then refer to it as "cuda"
    torch.cuda.set_device(0)
    device = torch.device("cuda")
    print("Switched to GPU")

print("Using device:", device)


No GPU available, using CPU
Using device: cpu


In [14]:
import torch

# Report whether PyTorch can reach a CUDA runtime and how many GPUs it sees.
cuda_usable = torch.cuda.is_available()
print("CUDA Available:", cuda_usable)

cuda_device_total = torch.cuda.device_count()
print("Number of CUDA devices:", cuda_device_total)


CUDA Available: False
Number of CUDA devices: 0


In [1]:
import torch

# Report the installed PyTorch build and the CUDA version it was built with.
# torch.version.cuda prints as None on the CPU-only build recorded below.
# (torch is already imported at the top of this cell; the second import was redundant.)
print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)


PyTorch version: 2.2.0+cpu
CUDA version: None


In [2]:
import os

def load_text_pairs(judgement_dir, summary_dir):
    """Read same-named UTF-8 text files from two folders into aligned lists.

    Args:
        judgement_dir: Folder containing the source documents.
        summary_dir: Folder containing one summary per source document,
            stored under the same file name.

    Returns:
        A ``(judgement_texts, summary_texts)`` tuple of lists where the
        entries at the same index come from files with the same name.

    Entries of ``judgement_dir`` that are not regular files (for example a
    ``.ipynb_checkpoints`` directory) are ignored. A judgement without a
    matching summary file is skipped with a printed notice, so the two lists
    always stay the same length.
    """
    judgement_texts = []
    summary_texts = []

    # Sort so the order is deterministic; os.listdir returns arbitrary order.
    for filename in sorted(os.listdir(judgement_dir)):
        judgement_path = os.path.join(judgement_dir, filename)
        summary_path = os.path.join(summary_dir, filename)

        if not os.path.isfile(judgement_path):
            continue
        if not os.path.isfile(summary_path):
            print(f"Skipping {filename}: no matching file in {summary_dir}")
            continue

        # Read both files before appending so a failure cannot leave the
        # lists misaligned.
        with open(judgement_path, 'r', encoding='utf-8') as file:
            judgement_text = file.read()
        with open(summary_path, 'r', encoding='utf-8') as file:
            summary_text = file.read()

        judgement_texts.append(judgement_text)
        summary_texts.append(summary_text)

    return judgement_texts, summary_texts


judgement_folder = os.path.join(os.getcwd(), "judgement")
summary_folder = os.path.join(os.getcwd(), "summary")

judgements, summaries = load_text_pairs(judgement_folder, summary_folder)


In [3]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam

class LegalDataset(Dataset):
    """Index-aligned judgement/summary pairs tokenized for seq2seq fine-tuning.

    Args:
        judgements: Sequence of source document strings.
        summaries: Sequence of target summary strings; ``summaries[i]`` is the
            summary of ``judgements[i]``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., return_tensors='pt',
            truncation=True, padding='max_length')``. Its result must be
            indexable by ``'input_ids'`` and ``'attention_mask'``, each with a
            leading batch dimension of size 1.
        max_length: Length every source and target sequence is padded or
            truncated to.

    Raises:
        ValueError: If ``judgements`` and ``summaries`` differ in length.
    """

    # Label value that the loss skips, so padding positions in the target
    # do not contribute to training.
    IGNORE_INDEX = -100

    def __init__(self, judgements, summaries, tokenizer, max_length=512):
        if len(judgements) != len(summaries):
            raise ValueError(
                f"judgements and summaries must have the same length "
                f"(got {len(judgements)} and {len(summaries)})"
            )
        self.judgements = judgements
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of judgement/summary pairs."""
        return len(self.judgements)

    def _encode(self, text):
        """Tokenize one string, padded/truncated to ``self.max_length``."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for pair ``idx``."""
        source = self._encode(self.judgements[idx])
        target = self._encode(self.summaries[idx])

        # Mask padded target positions so the loss ignores them.
        labels = target['input_ids'].masked_fill(
            target['attention_mask'] == 0, self.IGNORE_INDEX
        )

        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also drop the sequence dimension when it has size 1.
        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': labels.squeeze(0),
        }

# Local import so this cell does not depend on another cell having imported torch.
import torch

# Load pre-trained model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# from_pretrained() returns the model in evaluation mode (dropout disabled);
# switch to training mode before fine-tuning.
model.train()

# Create an optimizer
optimizer = Adam(model.parameters(), lr=1e-4)

# Instantiate the dataset and dataloader
dataset = LegalDataset(judgements, summaries, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Fine-tune the model using your legal dataset
for epoch in range(3):
    epoch_loss = 0.0
    for batch in dataloader:
        # Batches are created on the CPU; move them next to the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # max(..., 1) avoids a division by zero when the dataset is empty.
    print(f"Epoch {epoch + 1}: mean loss {epoch_loss / max(len(dataloader), 1):.4f}")

# Save the fine-tuned model together with its tokenizer so the output
# directory can be loaded on its own.
model.save_pretrained('fine_tuned_legal_summarizer')
tokenizer.save_pretrained('fine_tuned_legal_summarizer')


KeyboardInterrupt: 

In [9]:
# Debug view of the tokenized input. `input_encoding` is created in the
# inference cell further down, so that cell must have been run first.
print(
    "Input Encoding:",
    input_encoding,
)


Input Encoding: {'input_ids': tensor([[25024,   465,     5,   301,     4,  7765,    13, 24319,     5, 25024,
            45,     8,  1592,  2243,    13,     3,  2047, 11346,  2693,     6,
         16327,     9,    63,     6,    16,     3,     9,  2848,   365,  1375,
             3,  3539,    13,     8,  2557, 20110,  1104,  1983,     6,   335,
          2884,     5,   480,     5,   329,     5,  9810,  5605,    41,   567,
             5,   276,     5,  1823,   189, 16658,     6,    28,   376,   201,
            21,     8,  8319,     3,  6761,     5,     3,    31,   283,     5,
           254,     5,  2821,   138,   900,    26,     6, 10154,  2146,    21,
          1547,    41,   566,     5,   446,     5,  2432,  3380,   291,     6,
            28,   376,   201,    21,     8,  3531,   295,     5, 10247,     5,
           932,  2208,     5,    37,  7661,    13,     8,  2243,    47,  3566,
            57,  7934, 11120,     3, 25000,   308,  4800,  5478,   683,  5033,
           446,     5,

In [9]:
# Generate summary token ids (at most 150 tokens) from the tokenized input.
# Both `fine_tuned_model` and `input_encoding` come from the inference cell
# further down; running this cell before that one raises NameError.
summary_ids = fine_tuned_model.generate(
    **input_encoding,
    max_length=150,
    use_cache=False,
)


NameError: name 'input_encoding' is not defined

In [3]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Directory that holds the fine-tuned weights; the tokenizer is stored there too.
model_directory_path = 'fine_tuned_legal_summarizer'

# Fine-tuning used the stock 't5-small' tokenizer, so the same tokenizer is used here.
# The base 't5-small' model itself is not needed for inference and is no longer loaded.
original_tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Load the fine-tuned weights. They are not re-saved: the directory they were
# just read from already contains them.
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(model_directory_path)

# Keep the tokenizer files next to the weights.
original_tokenizer.save_pretrained(model_directory_path)

# Example: Specify the path to your text file
text_file_path = '2.txt'

# Read the content of the text file
with open(text_file_path, 'r', encoding='utf-8') as file:
    input_text = file.read()

# Tokenize (truncated to 512 tokens) and generate a summary of at most 150 tokens
input_encoding = original_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True, padding=True)
summary_ids = fine_tuned_model.generate(**input_encoding, max_length=150, use_cache=False)

generated_summary = original_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# An empty result means only special tokens were generated; flag it instead
# of letting it pass silently.
if not generated_summary.strip():
    print("Warning: the model produced an empty summary; check that fine-tuning completed.")

# Print the generated summary
print("Generated Summary:", generated_summary)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated Summary: 


In [11]:
# Write the decoded summary text to a UTF-8 file and report where it went.
output_file_path = 'generated_summary.txt'

with open(output_file_path, mode='w', encoding='utf-8') as summary_out:
    summary_out.write(generated_summary)

print(f"Generated summary saved to {output_file_path}")

Generated summary saved to generated_summary.txt
