<a href="https://colab.research.google.com/github/MeyerTalon/URAP-ML-Interview-Task/blob/main/legal_identifier_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Overview

**Disclaimer: The following Jupyter notebook was executed in Google Colab, not locally, after uploading the company_names_parsed.csv file.**

This notebook creates a dataset using the parsed data from task 1, fine-tunes a pre-trained BERT model to recognize the existence of legal identifiers within a company name, and pushes both the dataset and model to the Hugging Face Hub.

In [6]:
!pip install datasets evaluate transformers huggingface_hub pandas numpy



In [27]:
from huggingface_hub import notebook_login

# We need to login as we'll upload our model and dataset to the hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
from datasets import load_dataset
import pandas as pd
import numpy as np

# Generate train, test, and validation splits.
df = pd.read_csv('company_names_parsed.csv')
df_train = df.sample(frac=0.8, random_state=42)
df_test = df.drop(df_train.index)
df_train.to_csv('train.csv', index=False)
df_test.to_csv('test.csv', index=False)

# Load the dataset and push it to Huggingface Hub for access anywhere.
dataset = load_dataset('csv', data_files={'train': 'train.csv', 'test': 'test.csv'})
dataset.push_to_hub('TalonMeyer/URAP_interview_task_dataset')

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/80 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/TalonMeyer/URAP_interview_task_dataset/commit/83005d4c8f04937229ddf3f32cd57d949c8ec2da', commit_message='Upload dataset', commit_description='', oid='83005d4c8f04937229ddf3f32cd57d949c8ec2da', pr_url=None, pr_revision=None, pr_num=None)

In [9]:
# Refactor dataset to match model parameters.
dataset = dataset.remove_columns(['location', 'base_name'])
dataset = dataset.rename_columns({'legal': 'labels', 'raw': 'text'})

def transform_null_to_one(row: dict) -> int:
    """
    This function is used to transform the dataset from containing the legal identifiers such as 'inc', 'corp', etc
    to simple 1s and 0s to allow model training.

    Args:
        row (dict): The row to change the label in.

    Returns:
        1 if there exists a legal identifier, 0 otherwise.
    """
    # Replace non-null values with 1, and null values with 0
    return 1 if row['labels'] is not None else 0

# Apply the transformation to the dataset
dataset = dataset.map(lambda x: {'labels': transform_null_to_one(x)})

Map:   0%|          | 0/79990 [00:00<?, ? examples/s]

Map:   0%|          | 0/19997 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = 'google-bert/bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(components: dict) -> dict:
  """
  Tokenizes the components of the dataset.

  Args:
    components (dict): A dictionary containing the components of the dataset.

  Returns:
    A dictionary containing the tokenized components of the dataset.
  """

  return tokenizer(components['text'], padding='max_length', truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



Map:   0%|          | 0/79990 [00:00<?, ? examples/s]

Map:   0%|          | 0/19997 [00:00<?, ? examples/s]

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import numpy as np
import evaluate

metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
  """
  Computes the accuracy of the model.

  Args:
    eval_pred: A tuple containing the logits and labels of the model.

  Returns:
    A dictionary containing the accuracy of the model.
  """
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [20]:
# Use only 1/10 of the dataset, Google Colab free tier will not allow us to train the whole dataset.
train_sample = tokenized_dataset['train'].shuffle(seed=42).select(range(int(0.1 * len(dataset['train']))))
test_sample = tokenized_dataset['test'].shuffle(seed=42).select(range(int(0.1 * len(dataset['test']))))

# Define training arguments and trainer.
training_args = TrainingArguments(output_dir='trainer', eval_strategy='epoch')
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sample,
    eval_dataset=test_sample,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
    )

In [21]:
# Train the model.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0675,0.059392,0.989495
2,0.0351,0.040795,0.992496
3,0.0227,0.049129,0.990995


TrainOutput(global_step=3000, training_loss=0.0420365317662557, metrics={'train_runtime': 2433.6098, 'train_samples_per_second': 9.861, 'train_steps_per_second': 1.233, 'total_flos': 6313875995473920.0, 'train_loss': 0.0420365317662557, 'epoch': 3.0})

In [32]:
# Save the model and post it to Hugging Face Hub.
kwargs = {
    'finetuned_from': model.config._name_or_path,
    'dataset': 'TalonMeyer/URAP_interview_task_dataset',
    'tasks': 'text-classification',
    'tags': ['text-classification'],
}
trainer.save_model('bert-base-cased-legal-keyword-identifier')
trainer.push_to_hub('bert-base-cased-legal-keyword-identifier', **kwargs)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/TalonMeyer/trainer/commit/12b5b9f6ef767c914098067ba0a9dcc60361e18c', commit_message='bert-base-cased-legal-keyword-identifier', commit_description='', oid='12b5b9f6ef767c914098067ba0a9dcc60361e18c', pr_url=None, pr_revision=None, pr_num=None)

In [51]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('TalonMeyer/bert-base-cased-legal-keyword-identifier')

# Tokenize the company name.
tokens = tokenizer('Chase Bank Inc', return_tensors='pt')

# Get the model output.
outputs = model(**tokens)

# Get the predicted labels.
predicted_labels = outputs.logits.argmax(dim=1)

# Check if the company name contains a legal identifier.
if 1 in predicted_labels[0]:
    print(True)
else:
    print(False)

True
