In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.8 MB/s[0m eta [36m0:00:0

In [2]:
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW

In [1]:
# Let's load the CSV file and display the first few rows to understand its structure and contents

import pandas as pd

# Load the CSV file
file_path = "/content/business_terms.csv"
business_terms_df = pd.read_csv(file_path)

# Show the first few rows
business_terms_df.head()


Unnamed: 0,Business Term,Business Term Description,Preferred Business Term,Category,Preferred Business Term Description
0,Primary Key,A unique identifier for an object,Object Identifier,Categorization,A unique identifier for an object
1,Counter Party Identifier,An identifier uniquely identifying a country,Country Identifier,Categorization,An identifier uniquely identifying a country
2,Foreign Key,A reference that differentiates an object,Differentiating Reference,Categorization,A reference that differentiates an object
3,Facility,A financial instrument for credit and risk con...,Credit and Risk Control,Business,A financial instrument for credit and risk con...
4,Counterparty,Individual or organization in trade or transac...,Trade Partner,Role,Individual or organization in trade or transac...


In [2]:
# Preprocessing the data into a suitable text-to-text transformation format

# Define a function to concatenate the input and output in a specific format
def create_example(row):
    input_text = f"Business Term: {row['Business Term']} | Business Term Description: {row['Business Term Description']}"
    target_text = f"Category: {row['Category']} | Preferred Business Term: {row['Preferred Business Term']} | Preferred Business Term Description: {row['Preferred Business Term Description']}"
    return input_text, target_text

# Apply the function to each row of the DataFrame
examples = [create_example(row) for _, row in business_terms_df.iterrows()]

# Separate the inputs and targets
input_texts, target_texts = zip(*examples)

# Create a DataFrame with the processed data
processed_data = pd.DataFrame({
    'input_text': input_texts,
    'target_text': target_texts
})

# Show a preview of the processed data
processed_data.head()


Unnamed: 0,input_text,target_text
0,Business Term: Primary Key | Business Term Des...,Category: Categorization | Preferred Business ...
1,Business Term: Counter Party Identifier | Busi...,Category: Categorization | Preferred Business ...
2,Business Term: Foreign Key | Business Term Des...,Category: Categorization | Preferred Business ...
3,Business Term: Facility | Business Term Descri...,Category: Business | Preferred Business Term: ...
4,Business Term: Counterparty | Business Term De...,Category: Role | Preferred Business Term: Trad...


In [3]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets (80% training, 20% validation)
train_data, val_data = train_test_split(processed_data, test_size=0.2, random_state=42)

# Show the shape of the training and validation sets to confirm the split
train_data.shape, val_data.shape


((37, 2), (10, 2))

In [4]:
train_data.to_csv("train_data.csv", index=False)
val_data.to_csv("val_data.csv", index=False)


In [5]:
from torch.utils.data import Dataset
import torch

class CustomDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer):
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_text = self.input_texts[idx]
        target_text = self.target_texts[idx]
        input_tokens = self.tokenizer(input_text, truncation=True, padding='max_length', return_tensors='pt')
        target_tokens = self.tokenizer(target_text, truncation=True, padding='max_length', return_tensors='pt')
        return input_tokens, target_tokens

def load_dataset(file_path, tokenizer):
    data = pd.read_csv(file_path)
    input_texts = data['input_text'].tolist()
    target_texts = data['target_text'].tolist()
    dataset = CustomDataset(input_texts, target_texts, tokenizer)
    return dataset

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

train_dataset = load_dataset("train_data.csv", tokenizer)
val_dataset = load_dataset("val_data.csv", tokenizer)


NameError: ignored

In [8]:
from transformers import GPT2LMHeadModel, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = GPT2LMHeadModel.from_pretrained("gpt2")

# Custom collate function to handle our custom dataset
def collate_fn(batch):
    input_tokens = [item[0] for item in batch]
    target_tokens = [item[1] for item in batch]
    input_tokens = tokenizer.pad(input_tokens, return_tensors='pt')
    target_tokens = tokenizer.pad(target_tokens, return_tensors='pt')
    return {'input_ids': input_tokens['input_ids'], 'attention_mask': input_tokens['attention_mask'], 'labels': target_tokens['input_ids']}

training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=1,  # Reduced batch size
    per_device_eval_batch_size=1,   # Reduced batch size
    num_train_epochs=3,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    output_dir="./model",
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
)

trainer.train()




Step,Training Loss,Validation Loss
100,No log,0.091572


TrainOutput(global_step=111, training_loss=0.285426715472797, metrics={'train_runtime': 3345.1657, 'train_samples_per_second': 0.033, 'train_steps_per_second': 0.033, 'total_flos': 58006831104000.0, 'train_loss': 0.285426715472797, 'epoch': 3.0})

In [17]:
from transformers import pipeline

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

input_text = "Business Term: Trade Facility | Business Term Description: A financial instrument for credit risk and control"
prediction = generator(input_text, max_length=30)[0]['generated_text']
print(prediction)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Business Term: Trade Facility | Business Term Description: A financial instrument for credit risk and control Business Term: Ass- A securities Term Term Businessization:


## T5

In [9]:
data = business_terms_df

In [10]:
# Define the task prefix
task_prefix = "Translate: "

# Create input and target texts
data['input_text'] = task_prefix + "Business Term: " + data['Business Term'] + " | Business Term Description: " + data['Business Term Description']
data['target_text'] = "Category: " + data['Category'] + " | Preferred Business Term: " + data['Preferred Business Term'] + " | Preferred Business Term Description: " + data['Preferred Business Term Description']

# Split the data into training and validation sets
train_data, val_data = train_test_split(data[['input_text', 'target_text']], test_size=0.2, random_state=42)

In [11]:
from transformers import T5Tokenizer
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.input_texts = data['input_text'].tolist()
        self.target_texts = data['target_text'].tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_text = self.input_texts[idx]
        target_text = self.target_texts[idx]
        inputs = self.tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        targets = self.tokenizer(target_text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        return inputs, targets

tokenizer = T5Tokenizer.from_pretrained("t5-small")
train_dataset = CustomDataset(train_data, tokenizer)
val_dataset = CustomDataset(val_data, tokenizer)


In [15]:
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

def collate_fn(batch):
    inputs = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    input_ids = torch.stack([item['input_ids'].squeeze() for item in inputs])
    attention_mask = torch.stack([item['attention_mask'].squeeze() for item in inputs])
    target_ids = torch.stack([item['input_ids'].squeeze() for item in targets])
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': target_ids}

model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=100,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    output_dir="./model",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
)

trainer.train()


Step,Training Loss,Validation Loss
100,No log,0.121187
200,No log,0.058135
300,No log,0.036482
400,No log,0.032496
500,0.371800,0.032385
600,0.371800,0.031473
700,0.371800,0.032405
800,0.371800,0.031082
900,0.371800,0.03292
1000,0.011600,0.031099


TrainOutput(global_step=1900, training_loss=0.10377485588977212, metrics={'train_runtime': 25983.8988, 'train_samples_per_second': 0.142, 'train_steps_per_second': 0.073, 'total_flos': 500764665446400.0, 'train_loss': 0.10377485588977212, 'epoch': 100.0})

In [16]:
input_text = "Translate: Business Term: Trade Facility | Business Term Description: A financial instrument for credit risk and control"
input_tokens = tokenizer(input_text, return_tensors="pt")
output_tokens = model.generate(input_tokens["input_ids"])
prediction = tokenizer.decode(output_tokens[0])
print(prediction)




<pad>Category: Business | Preferred Business Term: Credit Risk and Control | Preferred Business


In [17]:
input_text = "Translate: Business Term: Trade Facility | Business Term Description: A financial instrument for credit risk and control"
input_tokens = tokenizer(input_text, return_tensors="pt")
output_tokens = model.generate(input_tokens["input_ids"], num_beams=5, max_length=100, temperature=1.0) # Adjust generation parameters as needed
prediction = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
print(prediction)


Category: Business | Preferred Business Term: Credit Risk and Control | Preferred Business Term Description: A financial instrument for credit risk and control
