In [1]:
# Report the PyTorch build / GPU, then seed every RNG for reproducibility
import os, random, time
import numpy as np
import torch

print("PyTorch version:", torch.__version__)
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("No GPU available")

# Reproducibility: seed Python, NumPy and PyTorch (CPU and every CUDA device).
SEED = 42
random.seed(SEED)
# NOTE: hash randomization is decided when the interpreter starts, so setting
# this here only affects child processes launched later, not this kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)  # fixed: a stray trailing "s" on this line was a SyntaxError
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


PyTorch version: 2.8.0+cu126
GPU: Tesla T4


In [2]:
# Upgrade pip, then install the deep-learning and analysis packages
# (--quiet suppresses pip's progress output).
# NOTE(review): the first cell already imports torch before this install runs;
# confirm the intended cell order.
!pip install --upgrade pip --quiet
!pip install torch torchvision torchaudio --quiet
!pip install pandas scikit-learn matplotlib seaborn tqdm --quiet


In [3]:
# Unzip the uploaded archive into a fresh 'dataset' folder

import os

# Path to the uploaded zip, relative to the working directory
zip_path = "archive.zip"

# Remove any previous extraction so the folder starts out empty
if os.path.exists("dataset"):
    !rm -rf dataset

# Extract into 'dataset'; -o overwrites existing files without prompting
!unzip -o "{zip_path}" -d dataset

# Recursively collect the extracted CSV files to confirm the extraction worked
import glob
csv_files = glob.glob("dataset/**/*.csv", recursive=True)
print("Number of CSV files found:", len(csv_files))
print(csv_files[:10])  # show first 10 files


Archive:  archive.zip
  inflating: dataset/absence-of-certain-changes-or-events.csv  
  inflating: dataset/absence-of-certain-changes.csv  
  inflating: dataset/acceleration.csv  
  inflating: dataset/access-to-information.csv  
  inflating: dataset/access.csv      
  inflating: dataset/accounting-terms.csv  
  inflating: dataset/additional-agreements.csv  
  inflating: dataset/additional-documents.csv  
  inflating: dataset/adjustments.csv  
  inflating: dataset/affirmative-covenants.csv  
  inflating: dataset/agreement.csv   
  inflating: dataset/agreements.csv  
  inflating: dataset/amendment-and-waiver.csv  
  inflating: dataset/amendment-waiver.csv  
  inflating: dataset/amendment.csv   
  inflating: dataset/amendments-and-waivers.csv  
  inflating: dataset/amendments-etc.csv  
  inflating: dataset/amendments-waivers.csv  
  inflating: dataset/amendments.csv  
  inflating: dataset/applicable-law.csv  
  inflating: dataset/application-of-proceeds.csv  
  inflating: dataset/appointm

In [4]:
# Imports
import os
import glob
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt

# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Re-seed every random number generator used by this notebook.
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)


Using device: cuda


In [5]:
# Build clause pairs
def build_pairs_from_folder(folder_path, max_per_category=None, neg_ratio=1):
    """Build labelled clause pairs from a folder of per-category CSV files.

    Every CSV found (recursively) under ``folder_path`` that has a
    ``clause_text`` column becomes one category, keyed by the file name
    without its extension.

    * Positive pairs (label 1): every unordered pair of clauses inside one
      category.
    * Negative pairs (label 0): for every ordered pair of distinct categories
      ``(A, B)`` and every clause in ``A``, ``neg_ratio`` clauses drawn at
      random (with replacement) from ``B``.

    NOTE: the number of negatives grows with the square of the number of
    categories, so with many categories negatives far outnumber positives.

    Args:
        folder_path: Directory searched recursively for ``*.csv`` files.
        max_per_category: If truthy, keep only the first N clauses of each file.
        neg_ratio: Negatives drawn per clause per other category.

    Returns:
        A shuffled list of ``(text_a, text_b, label)`` tuples.

    Raises:
        ValueError: If no CSV file exists under ``folder_path``.
    """
    # glob returns files in filesystem order; sorting makes the category order
    # (and therefore the seeded random draws below) reproducible across machines.
    files = sorted(glob.glob(os.path.join(folder_path, "**/*.csv"), recursive=True))
    if not files:
        raise ValueError(f"No CSV files found in {folder_path}.")

    categories = {}
    for f in files:
        df = pd.read_csv(f)
        if 'clause_text' not in df.columns:
            continue
        texts = df['clause_text'].dropna().astype(str).tolist()
        if max_per_category:
            texts = texts[:max_per_category]
        if not texts:
            # Nothing to pair, and an empty list would make random.choice
            # raise IndexError when this category is sampled for negatives.
            continue
        cat = os.path.splitext(os.path.basename(f))[0]
        categories[cat] = texts

    pairs = []
    # Positive pairs: all i < j combinations within a category.
    for texts in categories.values():
        for i, first in enumerate(texts):
            pairs.extend((first, second, 1) for second in texts[i + 1:])
    # Negative pairs: each clause against random clauses of every other category.
    for cat_a, list_a in categories.items():
        for cat_b, list_b in categories.items():
            if cat_a == cat_b:
                continue
            for a in list_a:
                for _ in range(neg_ratio):
                    pairs.append((a, random.choice(list_b), 0))
    random.shuffle(pairs)
    return pairs

# Build the labelled pairs from the extracted CSVs. max_per_category=50 caps the
# clauses read per file (to speed things up in Colab); neg_ratio=1 draws one
# random negative per clause for every other category.
pairs = build_pairs_from_folder("dataset", max_per_category=50, neg_ratio=1)
print("Total pairs:", len(pairs))


Total pairs: 8249580


In [6]:
# Split dataset 80/10/10: hold out 20% of the pairs, then halve that hold-out
# into validation and test sets. Both splits reuse SEED so they are repeatable.
# NOTE(review): the split is over pairs, not clauses, so the same clause text can
# appear in pairs on both sides of a split; confirm this is acceptable for the
# reported test metrics.
train_pairs, temp_pairs = train_test_split(pairs, test_size=0.2, random_state=SEED)
val_pairs, test_pairs = train_test_split(temp_pairs, test_size=0.5, random_state=SEED)

print(f"Train: {len(train_pairs)}, Val: {len(val_pairs)}, Test: {len(test_pairs)}")


Train: 6599664, Val: 824958, Test: 824958


In [7]:
# Simple tokenization (word-level, limited vocab)
from collections import Counter

MAX_SEQ_LEN = 50  # limit to reduce memory usage

def build_vocab(pairs, min_freq=1):
    """Build a word -> id vocabulary from the texts of labelled pairs.

    Texts are lower-cased and split on whitespace. Ids 0 and 1 are reserved
    for ``<PAD>`` and ``<UNK>``; every word seen at least ``min_freq`` times
    gets the next consecutive id, in order of first appearance.

    Args:
        pairs: Iterable of ``(text_a, text_b, label)`` tuples; the label is ignored.
        min_freq: Minimum number of occurrences for a word to be kept.

    Returns:
        dict mapping each kept word (plus the two special tokens) to an int id.
        The ids are exactly ``0 .. len(vocab) - 1``, so ``len(vocab)`` is a
        valid embedding-table size.
    """
    counter = Counter()
    for t1, t2, _ in pairs:
        counter.update(t1.lower().split())
        counter.update(t2.lower().split())
    # Filter BEFORE numbering: numbering first left gaps whenever min_freq > 1,
    # producing ids larger than len(vocab) - 1.
    kept = [w for w, freq in counter.items() if freq >= min_freq]
    vocab = {w: idx for idx, w in enumerate(kept, start=2)}  # 0=PAD, 1=UNK
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

# Build the vocabulary from the training pairs only; words that occur only in
# validation/test texts are encoded as <UNK> by encode_text.
vocab = build_vocab(train_pairs)
vocab_size = len(vocab)  # passed to the models as the embedding-table size
print("Vocab size:", vocab_size)

def encode_text(text, vocab, max_len=MAX_SEQ_LEN):
    """Turn ``text`` into a list of exactly ``max_len`` vocabulary ids.

    The text is lower-cased and split on whitespace; words missing from
    ``vocab`` become the ``<UNK>`` id. Longer inputs are truncated to
    ``max_len`` ids and shorter ones are right-padded with the ``<PAD>`` id.
    """
    words = text.lower().split()
    ids = [vocab.get(word, vocab['<UNK>']) for word in words][:max_len]
    shortfall = max_len - len(ids)
    return ids + [vocab['<PAD>']] * shortfall

class ClausePairDataset(Dataset):
    """Dataset over ``(text_a, text_b, label)`` triples.

    Texts are encoded with ``encode_text`` on every item access rather than
    up front; each item is ``(ids_a, ids_b, label)`` as tensors, with the
    label stored as a float.
    """

    def __init__(self, pairs, vocab):
        self.vocab = vocab
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        first_text, second_text, target = self.pairs[idx]
        encoded = [
            torch.tensor(encode_text(text, self.vocab))
            for text in (first_text, second_text)
        ]
        return encoded[0], encoded[1], torch.tensor(target, dtype=torch.float)

# Dataloaders: wrap each split in a ClausePairDataset sharing the training
# vocabulary, then batch it. Only the training loader shuffles.
BATCH_SIZE = 32
train_dataset = ClausePairDataset(train_pairs, vocab)
val_dataset = ClausePairDataset(val_pairs, vocab)
test_dataset = ClausePairDataset(test_pairs, vocab)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


Vocab size: 38814


In [8]:
# NOTE: this cell used to be a verbatim copy of the previous cell. Re-running it
# rebuilt the vocabulary over all training pairs a second time and silently
# redefined build_vocab, encode_text, ClausePairDataset and the three loaders,
# so any change made to the first copy was shadowed here. The definitions now
# live only in the previous cell; this cell just sanity-checks what it produced.
assert vocab['<PAD>'] == 0 and vocab['<UNK>'] == 1, "special tokens must keep ids 0 and 1"
assert vocab_size == len(vocab), "vocab_size is out of sync with vocab"
assert len(train_dataset) == len(train_pairs), "train_dataset does not wrap train_pairs"
print("Vocab size:", vocab_size)


Vocab size: 38814


In [9]:
class BiLSTMClassifier(nn.Module):
    """Clause-pair classifier built on a shared bidirectional LSTM encoder.

    Both clauses go through the same embedding and BiLSTM. For each clause the
    final forward and backward hidden states are concatenated
    (``2 * hidden_dim`` features); the two clause vectors are concatenated
    (``4 * hidden_dim``) and mapped to one probability by a linear layer
    followed by a sigmoid.

    NOTE: padded positions are fed through the LSTM like real tokens (no
    ``padding_idx``, masking or sequence packing is applied).

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width.
        hidden_dim: LSTM hidden size per direction.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*4, 1)  # concat two clause representations

    def _encode(self, token_ids):
        """Return the clause vector of shape ``(batch, 2 * hidden_dim)``."""
        _, (hidden, _) = self.lstm(self.embedding(token_ids))
        # hidden[-2] / hidden[-1]: last layer's forward / backward final states.
        return torch.cat((hidden[-2], hidden[-1]), dim=1)

    def forward(self, x1, x2):
        """Score a batch of clause pairs.

        Args:
            x1, x2: Integer id tensors of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch,)`` with probabilities in ``[0, 1]``.
        """
        features = torch.cat([self._encode(x1), self._encode(x2)], dim=1)
        probs = torch.sigmoid(self.fc(features))
        # Drop only the trailing feature axis. A bare squeeze() also removed the
        # batch axis for a batch of one, yielding a 0-d tensor that no longer
        # matches the (1,)-shaped labels.
        return probs.squeeze(-1)


In [10]:
class CNNClassifier(nn.Module):
    """Clause-pair classifier built on a shared multi-kernel 1-D CNN encoder.

    Each clause is embedded, convolved with one ``Conv1d`` per kernel size,
    passed through ReLU and max-pooled over the sequence axis; the pooled
    features of all kernels are concatenated. The two clause vectors are then
    concatenated and mapped to one probability by a linear layer + sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width (input channels of the convolutions).
        num_filters: Output channels of every convolution.
        kernel_sizes: Iterable of convolution widths. Input sequences must be
            at least as long as the largest kernel.
    """

    # Tuple default instead of a list: avoids a shared mutable default argument.
    def __init__(self, vocab_size, embed_dim=64, num_filters=64, kernel_sizes=(3, 4, 5)):
        super().__init__()
        kernel_sizes = tuple(kernel_sizes)
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([nn.Conv1d(embed_dim, num_filters, k) for k in kernel_sizes])
        self.fc = nn.Linear(num_filters*len(kernel_sizes)*2, 1)

    def _encode(self, token_ids):
        """Return the clause vector of shape ``(batch, num_filters * n_kernels)``."""
        embedded = self.embedding(token_ids).permute(0, 2, 1)  # batch x embed x seq
        pooled = [torch.relu(conv(embedded)).max(dim=2).values for conv in self.convs]
        return torch.cat(pooled, dim=1)

    def forward(self, x1, x2):
        """Score a batch of clause pairs.

        Args:
            x1, x2: Integer id tensors of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch,)`` with probabilities in ``[0, 1]``.
        """
        features = torch.cat([self._encode(x1), self._encode(x2)], dim=1)
        probs = torch.sigmoid(self.fc(features))
        # Drop only the trailing feature axis. A bare squeeze() also removed the
        # batch axis for a batch of one, yielding a 0-d tensor that no longer
        # matches the (1,)-shaped labels.
        return probs.squeeze(-1)


In [11]:
def train_model(model, train_loader, val_loader, epochs=3, lr=1e-3):
    """Train ``model`` with Adam and binary cross-entropy, validating each epoch.

    The model is moved to the notebook-level ``device``. After every epoch the
    mean per-batch training and validation losses are recorded and printed.

    Args:
        model: Module called as ``model(t1, t2)`` that returns probabilities.
        train_loader: Sized iterable of ``(t1, t2, labels)`` training batches.
        val_loader: Sized iterable of ``(t1, t2, labels)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.

    Returns:
        ``(train_losses, val_losses)``: two lists with one float per epoch.
    """
    model = model.to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    def _to_device(batch):
        # Move every tensor of a (t1, t2, labels) batch onto the target device.
        return tuple(tensor.to(device) for tensor in batch)

    train_losses, val_losses = [], []
    for epoch_number in range(1, epochs + 1):
        # ---- optimisation pass ----
        model.train()
        running = 0
        for batch in train_loader:
            left, right, targets = _to_device(batch)
            optimizer.zero_grad()
            batch_loss = criterion(model(left, right), targets)
            batch_loss.backward()
            optimizer.step()
            running += batch_loss.item()
        train_losses.append(running / len(train_loader))

        # ---- validation pass (no gradients) ----
        model.eval()
        running = 0
        with torch.no_grad():
            for batch in val_loader:
                left, right, targets = _to_device(batch)
                running += criterion(model(left, right), targets).item()
        val_losses.append(running / len(val_loader))

        print(f"Epoch {epoch_number}/{epochs} | Train Loss: {train_losses[-1]:.4f} | Val Loss: {val_losses[-1]:.4f}")
    return train_losses, val_losses


In [12]:
def evaluate_model(model, loader):
    """Compute binary-classification metrics for ``model`` over ``loader``.

    Inputs are moved to the notebook-level ``device``; the model itself is not
    moved here, so it must already live on that device. Probabilities of at
    least 0.5 are counted as the positive class.

    Args:
        model: Module called as ``model(t1, t2)`` that returns probabilities.
        loader: Iterable of ``(t1, t2, labels)`` batches.

    Returns:
        ``(accuracy, precision, recall, f1, roc_auc)`` as floats.
    """
    model.eval()
    y_true, y_pred, y_prob = [], [], []
    with torch.no_grad():
        for t1, t2, labels in loader:
            t1, t2 = t1.to(device), t2.to(device)
            # Flatten to 1-D: a model that squeezes a single-sample batch
            # returns a 0-d array, which list.extend cannot iterate over.
            probs = model(t1, t2).cpu().numpy().reshape(-1)
            y_prob.extend(probs)
            y_pred.extend((probs >= 0.5).astype(int))
            y_true.extend(labels.numpy().reshape(-1))
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 keeps the score at 0 (without a warning) when the model
    # predicts no positives or the data contains none.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_true, y_prob)
    return accuracy, precision, recall, f1, roc_auc


In [None]:
# Train the BiLSTM classifier for 3 epochs, then report its metrics on the
# held-out test loader. train_model's returned loss histories are discarded here.
bilstm_model = BiLSTMClassifier(vocab_size)
train_model(bilstm_model, train_loader, val_loader, epochs=3)
metrics = evaluate_model(bilstm_model, test_loader)
print("BiLSTM Test Metrics: Accuracy {:.4f}, Precision {:.4f}, Recall {:.4f}, F1 {:.4f}, ROC-AUC {:.4f}".format(*metrics))


In [None]:
# Train the CNN classifier for 3 epochs, then report its metrics on the
# held-out test loader. train_model's returned loss histories are discarded here.
# NOTE: `metrics` is reassigned here, replacing any value left by an earlier cell.
cnn_model = CNNClassifier(vocab_size)
train_model(cnn_model, train_loader, val_loader, epochs=3)
metrics = evaluate_model(cnn_model, test_loader)
print("CNN Test Metrics: Accuracy {:.4f}, Precision {:.4f}, Recall {:.4f}, F1 {:.4f}, ROC-AUC {:.4f}".format(*metrics))
