In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel, TFBertForSequenceClassification


Define the device

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Load the dataset

In [3]:
data = pd.read_csv("contracts-clauses-datasets.csv")
sentences = data["Text"].values
labels = data["Label"].values
data_top = data.head() 
data_top 

def convert_labels(labels):
    new_labels = []
    for label in labels:
        try:
            new_label = int(label)
            new_labels.append(new_label)
        except ValueError:
            continue
    return new_labels
all_labels = []
for label in labels:
    all_labels.append(label)

# Find the unique label values
unique_labels = set(all_labels)

# Print the unique label values
print("Unique label values:", unique_labels)
label_map = {
    'investment_company': 0,
    'WHEREAS': 1,
    'seed': 2,
    'Insurance': 3,
    'esop': 4,
    'compensation': 5,
    'Definitions': 6,
    'financing': 7,
    'payment_terms': 8,
    'Headings': 9,
    'loans': 10,
    'shares': 11,
    'Notices': 12,
    'Severability': 13,
    'taxes': 14,
    'Assignment': 15,
    'stock_option': 16,
    'payment': 17,
    'vesting': 18,
    'Miscellaneous': 19,
    'private_equity': 20,
    'investments': 21,
    'Governing': 22,
    'interest': 23,
    'grant_of_option': 24,
    'conversion_of_shares': 25,
    'foreign_investors': 26,
    'Entire': 27,
    'Termination': 28,
    'Indemnification': 29,
    'ownership_of_shares': 30,
    'investment-company-act': 31,
    'dividends': 32,
    'grant': 33,
    'base-salary': 34,
    'NOW': 35,
    'Representations': 36,
    'Confidentiality': 37,
    'board': 38,
    'employee_benefits': 39,
    'Counterparts': 40,
    'capitalization': 41
}



Unique label values: {'Severability', 'compensation', 'Entire', 'grant', 'investment_company', 'Assignment', 'employee_benefits', 'payment', 'base-salary', 'Confidentiality', 'WHEREAS', 'stock_option', 'seed', 'Insurance', 'conversion_of_shares', 'payment_terms', 'financing', 'grant_of_option', 'investment-company-act', 'Headings', 'investments', 'Miscellaneous', 'ownership_of_shares', 'Governing', 'private_equity', 'Definitions', 'taxes', 'shares', 'Representations', 'capitalization', 'Termination', 'NOW', 'esop', 'foreign_investors', 'dividends', 'loans', 'Indemnification', 'Counterparts', 'Notices', 'board', 'interest', 'vesting'}


Tokenize the sentences

In [4]:
tokenizer = BertTokenizer.from_pretrained('google/bert_uncased_L-2_H-128_A-2')
sentences = [str(sentence) for sentence in sentences]

encoded_sentences = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors="pt")


Define the dataset class

In [6]:
class ContractsDataset(Dataset):
    def __init__(self, encoded_sentences, labels):
        self.input_ids = encoded_sentences["input_ids"]
        self.attention_mask = encoded_sentences["attention_mask"]
        self.labels = labels
        self.label_map = {
            'investment_company': 0,
            'WHEREAS': 1,
            'seed': 2,
            'Insurance': 3,
            'esop': 4,
            'compensation': 5,
            'Definitions': 6,
            'financing': 7,
            'payment_terms': 8,
            'Headings': 9,
            'loans': 10,
            'shares': 11,
            'Notices': 12,
            'Severability': 13,
            'taxes': 14,
            'Assignment': 15,
            'stock_option': 16,
            'payment': 17,
            'vesting': 18,
            'Miscellaneous': 19,
            'private_equity': 20,
            'investments': 21,
            'Governing': 22,
            'interest': 23,
            'grant_of_option': 24,
            'conversion_of_shares': 25,
            'foreign_investors': 26,
            'Entire': 27,
            'Termination': 28,
            'Indemnification': 29,
            'ownership_of_shares': 30,
            'investment-company-act': 31,
            'dividends': 32,
            'grant': 33,
            'base-salary': 34,
            'NOW': 35,
            'Representations': 36,
            'Confidentiality': 37,
            'board': 38,
            'employee_benefits': 39,
            'Counterparts': 40,
            'capitalization': 41
        }
        
    def __getitem__(self, idx):
        label_id = self.label_map[self.labels[idx]]
        return {
                "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
                "attention_mask": torch.tensor(self.attention_mask[idx], dtype=torch.long),
                "labels": torch.tensor(label_id, dtype=torch.long)
                }
        
    def __len__(self):
        return len(self.labels)


Define the LinearBERT model

In [7]:
class LinearBERT(nn.Module):
    def __init__(self):
        super(LinearBERT, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.linear(pooled_output)
        return logits

Create the dataset and data loader

In [8]:
dataset = ContractsDataset(encoded_sentences, labels)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


Define the model and optimizer

In [9]:
model = LinearBERT().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Define the loss function

In [10]:
criterion = nn.BCEWithLogitsLoss()

print("Number of sentences:", len(sentences))
print("Number of encoded sentences:", encoded_sentences["input_ids"].shape[0])
print("Number of labels:", len(labels))


Number of sentences: 21187
Number of encoded sentences: 21187
Number of labels: 21187


Train the model

In [None]:
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    running_corrects = 0
    for data in dataloader:
        input_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        labels = data["labels"].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * input_ids.size(0)
        preds = torch.sigmoid(outputs.squeeze()) >= 0.5
        running_corrects += torch.sum(preds == labels.byte())
    epoch_loss = running_loss / len(dataset)
    epoch_acc = running_corrects.double() / len(dataset)
    print('Epoch: {} Loss: {:.3f} Accuracy: {:.3f}'.format(epoch + 1, epoch_loss, epoch_acc))

  "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
  "attention_mask": torch.tensor(self.attention_mask[idx], dtype=torch.long),


Evaluate the model