In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Data Preprocessing**

In [None]:
import os

base_path = "/content/drive/MyDrive/Colab Notebooks/GenAIproject/COLIEE2025statute_data-English"
train_folder = os.path.join(base_path, "train")
text_folder = os.path.join(base_path, "text")


In [None]:
import os

# Print all XML filenames
for filename in os.listdir(train_folder):
    if filename.endswith(".xml"):
        print(filename)


riteval_H18_jp.xml
riteval_H19_jp.xml
riteval_H20_jp.xml
riteval_H21_jp.xml
riteval_H23_jp.xml
riteval_H24_jp.xml
riteval_H27_jp.xml
riteval_H28_jp.xml
riteval_H30_jp.xml
riteval_R01_jp.xml
riteval_R04_jp.xml
riteval_R05_jp.xml
riteval_H29_jp.xml
riteval_R02_jp.xml
riteval_H25_jp.xml
riteval_H26_jp.xml
riteval_H22_jp.xml
riteval_R03_jp.xml


In [None]:
import os

xml_files = [f for f in os.listdir(train_folder) if f.endswith(".xml")]
sample_file = os.path.join(train_folder, xml_files[0])
print(f"Inspecting: {sample_file}")

import xml.etree.ElementTree as ET

# Load and parse the file
tree = ET.parse(sample_file)  # make sure sample_file has the correct path
root = tree.getroot()

# Print the full XML structure to inspect
ET.dump(root)



Inspecting: /content/drive/MyDrive/Colab Notebooks/GenAIproject/COLIEE2025statute_data-English/train/riteval_H18_jp.xml
<dataset>
<pair id="H18-1-1" label="Y">
<t1>
Article 572: Even if the seller has made a special agreement to the effect that he/she will not be liable for the warranty in the cases provided for in the main text of Article 562, paragraph 1 or Article 565, he/she cannot be exempt from liability for facts that he/she knew but did not disclose, and for rights that he/she established for a third party or transferred to a third party.
</t1>
<t2>
A special agreement exempting the seller from warranty liability can be made, but even in that case, if the seller has established rights for a third party regarding the subject matter, the seller cannot be exempt from liability.
</t2>
</pair>

<pair id="H18-1-2" label="N">
<t1>
Article 565: The provisions of the preceding three articles shall apply mutatis mutandis to cases where the rights transferred by the seller to the buyer do

**Parsing all the xml files**

In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

def parse_all_files(folder_path):
    data = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.xml'):
            filepath = os.path.join(folder_path, filename)
            tree = ET.parse(filepath)
            root = tree.getroot()

            for pair in root.findall('pair'):
                pair_id = pair.attrib.get('id')
                label = pair.attrib.get('label')  # only in training set

                # t1 is the article (law text), t2 is the query/hypothesis
                article = pair.findtext('t1', default='').strip()
                query = pair.findtext('t2', default='').strip()

                sample = {
                    'id': pair_id,
                    'article': article,
                    'query': query
                }

                if label is not None:
                    sample['label'] = label

                data.append(sample)

    return pd.DataFrame(data)

# Example usage:
df = parse_all_files(train_folder)
print(df.head())
df.to_csv("/content/drive/MyDrive/Colab Notebooks/GenAIproject/parsed_legal_qa_trainenglish.csv", index=False)


        id                                            article  \
0  H18-1-1  Article 572: Even if the seller has made a spe...   
1  H18-1-2  Article 565: The provisions of the preceding t...   
2  H18-1-3  Article 568: A purchaser at an auction based o...   
3  H18-2-1  Article 697. A person who begins to manage aff...   
4  H18-2-2  Article 698: If a manager manages affairs in o...   

                                               query label  
0  A special agreement exempting the seller from ...     Y  
1  There is a time limit for pursuing liability f...     N  
2  Since a forced auction is also a sale, the war...     N  
3  Even if you repair a neighbor's fence to prote...     Y  
4  If a person pushes away a person who is about ...     Y  


In [None]:
import os
test_file_path = "/content/drive/MyDrive/Colab Notebooks/GenAIproject/test-for-task3/test-for-task3/TestData_en.xml"

print(f"Inspecting: {test_file_path}")

import xml.etree.ElementTree as ET

# Load and parse the file
tree = ET.parse(test_file_path)  # make sure sample_file has the correct path
root = tree.getroot()

# Print the full XML structure to inspect
ET.dump(root)



Inspecting: /content/drive/MyDrive/Colab Notebooks/GenAIproject/test-for-task3/test-for-task3/TestData_en.xml
<dataset>
<pair id="R06-01-A">

<t2>
When a petition for the commencement of curatorship is filed for a person who constantly lacks the capacity to appreciate a given situation due to mental disabilities, the family court may decide to establish a curatorship issue.
</t2>
</pair>

<pair id="R06-01-E">

<t2>
If a person under adult guardianship purchases daily necessities without the consent of the adult guardian, the adult guardian may cancel the contract involving the purchase.
</t2>
</pair>

<pair id="R06-01-I">

<t2>
When a petition for the commencement of assistantship is filed by someone other than the person in question, the family court cannot issue a ruling to commence assistance without the consent of the person concerned.
</t2>
</pair>

<pair id="R06-01-O">

<t2>
If the assistant in court refuses to consent to an action that requires their consent despite there being 

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

def parse_test_file(file_path):
    data = []

    tree = ET.parse(file_path)
    root = tree.getroot()

    for pair in root.findall('pair'):
        pair_id = pair.attrib.get('id')

        article = pair.findtext('t1', default='').strip()
        query = pair.findtext('t2', default='').strip()

        data.append({
            'id': pair_id,
            'article': article,
            'query': query
        })

    return pd.DataFrame(data)

# Example usage:
test_file_path = "/content/drive/MyDrive/Colab Notebooks/GenAIproject/test-for-task3/test-for-task3/TestData_en.xml"
df_test = parse_test_file(test_file_path)
print(df_test.head())

# Save to CSV if needed
df_test.to_csv("/content/drive/MyDrive/Colab Notebooks/GenAIproject/parsed_legal_qa_testenglish.csv", index=False)


         id article                                              query
0  R06-01-A          When a petition for the commencement of curato...
1  R06-01-E          If a person under adult guardianship purchases...
2  R06-01-I          When a petition for the commencement of assist...
3  R06-01-O          If the assistant in court refuses to consent t...
4  R06-03-A          When the other party makes a manifestation of ...


**Parsing the code law Text Files**

In [None]:
civil_code_file = os.path.join(text_folder, "civil_code_jp-1to724-2.txt")

# Open and print a portion of the file
with open(civil_code_file, 'r', encoding='utf-8') as file:
    lines = file.readlines()

print(f"Total lines: {len(lines)}")
print("\n--- First 30 lines ---\n")
for line in lines[:30]:
    print(line.strip())


Total lines: 2866

--- First 30 lines ---

Civil Code (Parts 1, 2, and 3)
Part 1: General Provisions
Chapter 1: General Provisions
(Basic Principles)
Article 1: Private rights must conform to the public welfare.
2: Rights must be exercised and obligations must be performed in good faith.
3: Abuse of rights is not permitted.
(Standards for interpretation)
Article 2: This Act must be interpreted in accordance with the dignity of the individual and the essential equality of the sexes.
Chapter 2: Persons
Section 1: Capacity to have rights
Article 3: The enjoyment of private rights begins at birth.
2: Foreigners shall enjoy private rights, except in cases prohibited by law or treaty.
Section 2: Capacity to make decisions
Article 3-2: If a party to a legal act does not have capacity to make decisions at the time of expressing his or her intention, the legal act is invalid.
Section 3: Capacity to make decisions
(Adulthood)
Article 4: A person becomes an adult at the age of 20.
(Legal acts of 

In [None]:
import os
import re

civil_code_file = os.path.join(text_folder, "civil_code_jp-1to724-2.txt")

with open(civil_code_file, 'r', encoding='utf-8') as file:
    lines = file.readlines()

article_dict = {}
current_article = None
article_text = []

for line in lines:
    line = line.strip()
    if not line:
        continue  # skip empty lines

    # Start of a new article
    article_match = re.match(r'Article\s+(\d+[-\d+]*)\:', line)
    if article_match:
        # Save the previous article before starting a new one
        if current_article:
            article_dict[current_article] = ' '.join(article_text).strip()

        current_article = f"Article {article_match.group(1)}"
        article_text = [line.split(":", 1)[1].strip()]  # get the first line of the article
    elif re.match(r'^\d+:', line):
        # Sub-clause of the current article
        clause_text = line.split(":", 1)[1].strip()
        article_text.append(clause_text)
    elif current_article:
        # Continuation of the current article (often a new paragraph)
        article_text.append(line)

# Save the last article
if current_article:
    article_dict[current_article] = ' '.join(article_text).strip()

print(f"Total articles parsed: {len(article_dict)}")
print("\nSample:\n")
for k in list(article_dict.keys())[:5]:
    print(k, "→", article_dict[k])


Total articles parsed: 476

Sample:

Article 1 → Private rights must conform to the public welfare. Rights must be exercised and obligations must be performed in good faith. Abuse of rights is not permitted. (Standards for interpretation)
Article 2 → This Act must be interpreted in accordance with the dignity of the individual and the essential equality of the sexes. Chapter 2: Persons Section 1: Capacity to have rights
Article 3 → The enjoyment of private rights begins at birth. Foreigners shall enjoy private rights, except in cases prohibited by law or treaty. Section 2: Capacity to make decisions
Article 3-2 → If a party to a legal act does not have capacity to make decisions at the time of expressing his or her intention, the legal act is invalid. Section 3: Capacity to make decisions (Adulthood)
Article 4 → A person becomes an adult at the age of 20. (Legal acts of minors)


In [None]:
import csv

output_path_csv = "/content/drive/MyDrive/Colab Notebooks/GenAIproject/parsed_civil_code.csv"
with open(output_path_csv, "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Article", "Text"])
    for article_id, text in article_dict.items():
        writer.writerow([article_id, text])

print(f"Saved parsed articles to {output_path_csv}")


Saved parsed articles to /content/drive/MyDrive/Colab Notebooks/GenAIproject/parsed_civil_code.csv


# **Fine Tuning for legal Entailment task**

Fine tuning the complete model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/GenAIproject/parsed_legal_qa_trainenglish.csv")

# Split into 80% train and 20% validation
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])


In [None]:
from sentence_transformers import SentenceTransformer, models

# Load transformer
word_embedding_model = models.Transformer("nlpaueb/legal-bert-base-uncased")

# Add pooling layer
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode='mean'
)

# SentenceTransformer model
embedding_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from sentence_transformers import losses

train_loss = losses.CosineSimilarityLoss(model=embedding_model)


In [None]:
pip install datasets



In [None]:
from sentence_transformers import InputExample
from datasets import IterableDataset
from sentence_transformers import InputExample, SentencesDataset
from datasets import Dataset
# Step 1: Clean and preprocess the data
train_df = train_df.dropna(subset=["article", "query", "label"])
train_df['label'] = train_df['label'].map({'Y': 1, 'N': 0})

# Step 2: Create InputExample objects for the Sentence-Transformers Trainer
train_examples = [
    InputExample(texts=[row['query'], row['article']], label=float(row['label']))
    for _, row in train_df.iterrows()
]

# Now use SentencesDataset
#train_dataset = SentencesDataset(train_examples, model=embedding_model)
# Convert SentencesDataset to Hugging Face Dataset
#hf_dataset = Dataset.from_dict({
  #  "texts": [[ex.texts[0], ex.texts[1]] for ex in train_examples],
 #   "label": [ex.label for ex in train_examples]
#})
# Step 2: Convert data to the correct Hugging Face Dataset format with separate columns for query and article
hf_dataset = Dataset.from_dict({
    "query": train_df["query"].tolist(),
    "article": train_df["article"].tolist(),
    "label": train_df["label"].tolist()
})


# Now, train_dataset is iterable and you can pass it to your trainer


In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sklearn.metrics import accuracy_score, f1_score

# Prepare your validation dataset (assume your validation dataset has 'query', 'article', and 'label')
# Convert 'label' into 0 (No) and 1 (Yes) for binary classification

# For example, label might already be in a "Yes"/"No" format, so just encode it
val_labels = [1 if label == 'Y' else 0 for label in val_df["label"]]

# Create the ClassificationEvaluator
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_df["query"],   # Use the 'query' as sentence1
    sentences2=val_df["article"], # Use the 'article' as sentence2
    scores=val_labels,
    main_similarity="cosine"
)


In [None]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
 # Define the training arguments
args = SentenceTransformerTrainingArguments(
output_dir="/content/drive/MyDrive/Colab Notebooks/GenAIproject/output/base_embedding_model",
num_train_epochs=1,
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
warmup_steps=100,
fp16=True,
eval_steps=100,
logging_steps=100,
report_to="none"
)

In [None]:
!pip install -U "sentence-transformers[train]"

Collecting sentence-transformers[train]
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers[train])
  Downloading nvidia_cublas

In [None]:
# 10. Start training
import os
os.environ["WANDB_DISABLED"] = "true"



In [None]:
pip install wandb



In [None]:
from sentence_transformers.trainer import SentenceTransformerTrainer
 # Train embedding model
trainer = SentenceTransformerTrainer(
 model=embedding_model,
 args=args,
 train_dataset=hf_dataset,
 loss=train_loss,
 evaluator=evaluator
 )
trainer.train()


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
100,0.2954
200,0.2807


TrainOutput(global_step=240, training_loss=0.2829511602719625, metrics={'train_runtime': 3016.31, 'train_samples_per_second': 0.318, 'train_steps_per_second': 0.08, 'total_flos': 0.0, 'train_loss': 0.2829511602719625, 'epoch': 1.0})

In [None]:
# loading the saved model
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("/content/drive/MyDrive/Colab Notebooks/GenAIproject/output/base_embedding_model//checkpoint-240")






In [None]:
# Show the first 10 rows of the raw validation data
print(val_df[["query", "article", "label"]].head(10))

# Optionally, check unique label values to see what exactly is present
print("Unique label values in val_df:", val_df["label"].unique())


                                                  query  \
276   In a contract that involves the completion of ...   
800   When the due date of the claim that is the obj...   
226   When a debtor who is solely responsible for mo...   
1088  In a lawsuit in which an agent claims the paym...   
1149  Concurrent debt assumption shall be validly es...   
666   The validity of a contract for the benefit of ...   
1170  A has concluded a loan contract with B to allo...   
549   When a defect in the object of work was caused...   
674   The administrator appointed by the absentee ma...   
0     A special agreement exempting the seller from ...   

                                                article label  
276   Article 637: In the case prescribed in the mai...     N  
800   Article 366: A pledgee may directly collect th...     N  
226   Article 147: In the event of any of the follow...     N  
1088  Article 632: A contract comes into effect when...     N  
1149  Article 470: The assumer

In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Ensure the DataFrame index is reset (to avoid KeyError issues)
val_df = val_df.reset_index(drop=True)
val_df['label'] = val_df['label'].map({'Y': 1, 'N': 0})
# Convert labels to float list (if already numeric 0/1)
val_labels = val_df["label"].astype(float).tolist()

# Ensure sentences are passed as lists of strings
queries = val_df["query"].astype(str).tolist()
articles = val_df["article"].astype(str).tolist()

# Create the evaluator
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=queries,       # Hypotheses
    sentences2=articles,     # Articles
    scores=val_labels,       # Ground-truth entailment (1 or 0)
    main_similarity="cosine" # Similarity metric
)
import pandas as pd

val_display_df = pd.DataFrame({
    "query": queries,
    "article": articles,
    "label": val_labels
})

# Show the first few rows
print(val_display_df.head(10))

                                               query  \
0  In a contract that involves the completion of ...   
1  When the due date of the claim that is the obj...   
2  When a debtor who is solely responsible for mo...   
3  In a lawsuit in which an agent claims the paym...   
4  Concurrent debt assumption shall be validly es...   
5  The validity of a contract for the benefit of ...   
6  A has concluded a loan contract with B to allo...   
7  When a defect in the object of work was caused...   
8  The administrator appointed by the absentee ma...   
9  A special agreement exempting the seller from ...   

                                             article  label  
0  Article 637: In the case prescribed in the mai...    0.0  
1  Article 366: A pledgee may directly collect th...    0.0  
2  Article 147: In the event of any of the follow...    0.0  
3  Article 632: A contract comes into effect when...    0.0  
4  Article 470: The assumer of a concurrent debt ...    1.0  
5  (Contrac

In [None]:
evaluator(embedding_model)

{'pearson_cosine': np.float64(0.27215176540383806),
 'spearman_cosine': np.float64(0.26044746396146756)}

**Resuming the training for more epochs**

In [None]:
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

from datasets import IterableDataset
# Step 1: Load the model from your checkpoint
embedding_model = SentenceTransformer("/content/drive/MyDrive/Colab Notebooks/GenAIproject/output/base_embedding_model/checkpoint-240")

# Step 2: Define the training arguments (run for 2 more epochs)
args = SentenceTransformerTrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/GenAIproject/output/base_embedding_model",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,                  # Run evaluation every 10 steps
    logging_steps=100,               # Log training metrics every 10 steps
    eval_strategy="steps",    # Required to evaluate during training
    logging_dir="./logs",
    save_strategy="epoch",          # Save checkpoint after each epoch
    report_to="none"                # Disable logging to external tools
)

train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# Step 4: Initialize the trainer
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=hf_dataset,
    loss=train_loss,
    evaluator=evaluator
)

# Step 5: Resume training
trainer.train(resume_from_checkpoint=True)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
300,0.2423,No log,0.199302,0.213411
400,0.2244,No log,0.310168,0.296778


TrainOutput(global_step=480, training_loss=0.11357930699984232, metrics={'train_runtime': 3417.8668, 'train_samples_per_second': 0.562, 'train_steps_per_second': 0.14, 'total_flos': 0.0, 'train_loss': 0.11357930699984232, 'epoch': 2.0})

In [None]:
evaluator(embedding_model)

{'pearson_cosine': 0.29055880137514484, 'spearman_cosine': 0.2802968087899397}

In [None]:
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

from datasets import IterableDataset
# Step 1: Load the model from your checkpoint
embedding_model = SentenceTransformer("/content/drive/MyDrive/Colab Notebooks/GenAIproject/output/base_embedding_model/checkpoint-480")

# Step 2: Define the training arguments (run for 2 more epochs)
args = SentenceTransformerTrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/GenAIproject/output/base_embedding_model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,                  # Run evaluation every 10 steps
    logging_steps=100,               # Log training metrics every 10 steps
    eval_strategy="steps",    # Required to evaluate during training
    logging_dir="./logs",
    save_strategy="epoch",          # Save checkpoint after each epoch
    report_to="none"                # Disable logging to external tools
)

train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# Step 4: Initialize the trainer
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=hf_dataset,
    loss=train_loss,
    evaluator=evaluator
)

# Step 5: Resume training
trainer.train(resume_from_checkpoint=True)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
500,0.1786,No log,0.267869,0.256959
600,0.1587,No log,0.293583,0.288357
700,0.1576,No log,0.273085,0.255395


TrainOutput(global_step=720, training_loss=0.053464014331499735, metrics={'train_runtime': 3542.0189, 'train_samples_per_second': 0.813, 'train_steps_per_second': 0.203, 'total_flos': 0.0, 'train_loss': 0.053464014331499735, 'epoch': 3.0})

In [None]:
evaluator(embedding_model)

{'pearson_cosine': 0.2731916042696001, 'spearman_cosine': 0.25503400628097517}