In [90]:
import json
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch
import random

# Load the JSONL files
file_path = "./nlp.jsonl"
additional_file_path = "./output.jsonl"
# Fixed seed: the train/test split further down is purely positional, so the
# shuffle order must be reproducible for the split to be reproducible.
SHUFFLE_SEED = 42


def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of a file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at EOF) are skipped instead of
        # crashing json.loads.
        return [json.loads(line) for line in f if line.strip()]


data = load_jsonl(file_path) + load_jsonl(additional_file_path)

print(len(data))
# A private Random instance keeps the global random state untouched.
random.Random(SHUFFLE_SEED).shuffle(data)
# Prepare the dataset
df = pd.DataFrame(data)

30000


In [105]:
import re
# Entity tags predicted by the classifier; a tag's position in the list is its integer id
label_list = ['O', 'B-TOOL', 'B-HEADING', 'B-TARGET']
label_map = dict(zip(label_list, range(len(label_list))))

# Spoken form of every digit character, indexed by the digit's value ("9" is read "niner")
_DIGIT_WORDS = (
    "zero", "one", "two", "three", "four",
    "five", "six", "seven", "eight", "niner",
)
num_to_text = {str(value): word for value, word in enumerate(_DIGIT_WORDS)}

# Helper function to find the index of a sublist in a list
def find_sublist(lst, sublist):
    """Return the index where ``sublist`` first occurs contiguously in ``lst``.

    Args:
        lst: The list to search.
        sublist: The run of items to look for.

    Returns:
        The start index of the first match, or -1 when there is none.
        An empty ``sublist`` matches at index 0.
    """
    width = len(sublist)
    last_start = len(lst) - width
    candidates = (
        pos for pos in range(last_start + 1)
        if lst[pos:pos + width] == sublist
    )
    return next(candidates, -1)


def convert_num_to_text(digits):
    """Spell a number out digit by digit using the ``num_to_text`` table.

    Args:
        digits: The value to spell out; it is passed through ``str`` first,
            so both ints and digit strings are accepted.

    Returns:
        A list with one word per character of ``str(digits)``.

    Raises:
        KeyError: If a character has no entry in ``num_to_text``.
    """
    return list(map(num_to_text.__getitem__, str(digits)))
    
# Helper function to create labels
def create_labels(transcript, tool, heading, target):
    """Tokenize a transcript and tag the tokens that spell out each entity.

    The transcript, tool and target are split with the same regex so that an
    entity phrase can be located inside the transcript as a run of tokens.
    The heading is first spelled out digit by digit with
    ``convert_num_to_text``. Only the first occurrence of each phrase is
    tagged; a phrase that is not found leaves the labels untouched.

    Args:
        transcript: Full utterance text.
        tool: Tool phrase expected inside the transcript.
        heading: Heading value, spelled out before matching.
        target: Target phrase expected inside the transcript.

    Returns:
        A ``(tokens, label_ids)`` pair where ``label_ids`` holds the
        ``label_map`` id of each token's tag.
    """
    def split_words(text):
        # The capturing group keeps separators (punctuation) as tokens;
        # pieces made only of whitespace are dropped.
        return [piece for piece in re.split(r'(\W+)', text) if piece.strip()]

    tokens = split_words(transcript)
    labels = ['O'] * len(tokens)

    # Order matters: a later phrase overwrites an earlier one where they overlap.
    tagged_phrases = (
        (convert_num_to_text(heading), 'B-HEADING'),
        (split_words(tool), 'B-TOOL'),
        (split_words(target), 'B-TARGET'),
    )
    for phrase_tokens, tag in tagged_phrases:
        start = find_sublist(tokens, phrase_tokens)
        if start == -1:
            continue
        span = len(phrase_tokens)
        labels[start:start + span] = [tag] * span

    return tokens, [label_map[tag] for tag in labels]


def _label_row(row):
    """Build the (tokens, label ids) pair for a single DataFrame row."""
    return create_labels(row['transcript'], row['tool'], row['heading'], row['target'])


# Label every row; each result is a (tokens, label ids) pair
results = df.apply(_label_row, axis=1)

# Unpack the pairs into their own columns
df['tokens'] = results.map(lambda pair: pair[0])
df['labels'] = results.map(lambda pair: pair[1])

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)

# The first 80% of the rows are used for training, the remainder for evaluation
n_rows = len(dataset)
split_at = int(0.8 * n_rows)
train_dataset = dataset.select(range(split_at))
test_dataset = dataset.select(range(split_at, n_rows))
test_dataset

Dataset({
    features: ['key', 'transcript', 'tool', 'heading', 'target', 'tokens', 'labels'],
    num_rows: 6000
})

In [106]:
# Display the training split summary (column names and row count)
train_dataset

Dataset({
    features: ['key', 'transcript', 'tool', 'heading', 'target', 'tokens', 'labels'],
    num_rows: 24000
})

In [107]:
# Set up the device: report CUDA availability, then prefer the GPU when present
cuda_available = torch.cuda.is_available()
print(cuda_available)
device = torch.device("cuda" if cuda_available else "cpu")

True


In [108]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Store the label names in the model config so the saved checkpoint is
# self-describing (config.id2label / config.label2id) instead of carrying
# generic LABEL_n names that every consumer has to re-map by hand.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id=label_map,
)

# Tokenize the dataset
def tokenize_and_align_labels(examples, max_length=None):
    """Tokenize pre-split words and align word-level labels to subword tokens.

    Intended for ``Dataset.map(..., batched=True)``. Sequences are no longer
    padded or truncated to a fixed 25 tokens: that cap silently cut off the
    end of longer transcripts (and the labels with it). Padding is left to the
    data collator, which pads each batch to its own longest sequence.

    Args:
        examples: Batch with a ``'tokens'`` column (list of word lists) and a
            ``'labels'`` column (list of per-word label id lists).
        max_length: Optional cap on the tokenized length. ``None`` lets the
            tokenizer truncate at its own model maximum.

    Returns:
        The tokenizer output with a ``"labels"`` entry holding one label id
        per subword token. Special tokens and non-initial subwords of a word
        get -100 (the value conventionally ignored by the loss).
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        max_length=max_length,
    )
    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the previous word
                label_ids.append(-100)
            else:
                # First subword of a word carries the word's label
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize both splits in batches; the mapped function also aligns the
# word-level labels to the subword tokens.
train_tokenized_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

# print(tokenized_dataset)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [109]:
# Inspect one tokenized training example to sanity-check the label alignment
print(train_tokenized_dataset[0])

{'key': 14785, 'transcript': 'target is tiny, yellow and white glider. tool to deploy is patriot missile system. heading is two niner five', 'tool': 'patriot missile system', 'heading': '295', 'target': 'tiny, yellow and white glider', 'tokens': ['target', 'is', 'tiny', ', ', 'yellow', 'and', 'white', 'glider', '. ', 'tool', 'to', 'deploy', 'is', 'patriot', 'missile', 'system', '. ', 'heading', 'is', 'two', 'niner', 'five'], 'labels': [-100, 0, 0, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 2, 2, -100, 2, -100], 'input_ids': [101, 4539, 2003, 4714, 1010, 3756, 1998, 2317, 18788, 1012, 6994, 2000, 21296, 2003, 16419, 7421, 2291, 1012, 5825, 2003, 2048, 3157, 2099, 2274, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [110]:
# Data collator that assembles token-classification batches for the Trainer
data_collator = DataCollatorForTokenClassification(tokenizer)

In [111]:
# Training arguments: evaluate and save a checkpoint once per epoch.
# No logging options are set here, so logging uses the library defaults.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    # NOTE(review): combined with load_best_model_at_end, the best checkpoint is
    # presumably retained in addition to the most recent one, so up to two
    # checkpoints may remain on disk — confirm against the TrainingArguments docs.
    save_total_limit=1,  # Cap on retained checkpoints
    # No metric_for_best_model is given; presumably "best" means lowest
    # validation loss — verify.
    load_best_model_at_end=True,
    save_strategy="epoch"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./fine-tuned-model30000")



Epoch,Training Loss,Validation Loss
1,0.0022,0.003759
2,0.0017,0.001697
3,0.0014,0.002584
4,0.0013,0.00133
5,0.0011,0.001784
6,0.0011,0.001961
7,0.001,0.002127
8,0.0011,0.004722
9,0.0009,0.002832
10,0.0005,0.003252


In [144]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the fine-tuned checkpoint and its tokenizer, placing the model on the
# GPU when one is available
model_name = "./fine-tuned-model30000"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Label id -> tag name, in the same order used when the labels were encoded
id2label = dict(enumerate(['O', 'B-TOOL', 'B-HEADING', 'B-TARGET']))

# Mapping spoken digit words to their digit characters
text_to_num = {
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "niner": "9",
    "nine": "9",  # plain spelling accepted alongside the radio form "niner"
}

def convert_heading_to_number(heading_tokens):
    """Turn spoken digit words into a zero-padded heading string.

    Each token has surrounding '.' and ',' removed and is looked up
    case-insensitively in ``text_to_num``. A token with no entry (for example
    one that is already a digit) is kept as its stripped text, so stray
    punctuation never leaks into the result.

    Args:
        heading_tokens: Iterable of word strings, e.g. ``['two', 'niner', 'five.']``.

    Returns:
        The concatenated digits, left-padded with '0' to at least three
        characters (``'000'`` for an empty input).
    """
    pieces = []
    for token in heading_tokens:
        bare = token.strip('.,')
        pieces.append(text_to_num.get(bare.lower(), bare))
    return "".join(pieces).rjust(3, "0")

def predict(transcript, verbose=True):
    """Extract the tool, heading and target phrases from a transcript.

    Args:
        transcript: Raw utterance text.
        verbose: When True (the default, matching the previous behaviour),
            print the tokenizer output and the per-word predictions.

    Returns:
        A dict with keys ``"tool"``, ``"heading"`` and ``"target"``. Tool and
        target are the predicted words joined by spaces with leading/trailing
        '.' and ',' removed; heading is produced by
        ``convert_heading_to_number``.
    """
    # NOTE(review): periods are removed and words are split on whitespace here,
    # whereas create_labels keeps punctuation as separate tokens when building
    # the training data — confirm this mismatch is intended.
    tokens = transcript.replace('.', '').split()
    if not tokens:
        # Nothing to classify: return an empty result without running the model.
        return {"tool": "", "heading": convert_heading_to_number([]), "target": ""}

    tokenized_inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)
    word_ids = tokenized_inputs.word_ids()
    if verbose:
        print(tokenized_inputs)

    # Follow the model's own device instead of re-deriving it from CUDA availability.
    inputs = {k: v.to(model.device) for k, v in tokenized_inputs.items()}

    # Get the model predictions
    with torch.no_grad():
        logits = model(**inputs).logits

    # One predicted label id per subword token of the single input sequence
    predictions = torch.argmax(logits, dim=2)[0].tolist()

    # Keep only the prediction made on the first subword of every word
    aligned_labels = []
    previous_word_idx = None
    for word_idx, prediction in zip(word_ids, predictions):
        if word_idx is None or word_idx == previous_word_idx:
            continue
        aligned_labels.append((tokens[word_idx], id2label[prediction]))
        previous_word_idx = word_idx
    if verbose:
        print(aligned_labels)

    # Collect the words of each entity type; words tagged 'O' are ignored
    spans = {'B-TOOL': [], 'B-HEADING': [], 'B-TARGET': []}
    for token, label in aligned_labels:
        if label in spans:
            spans[label].append(token)

    return {
        "tool": " ".join(spans['B-TOOL']).strip(',.'),
        "heading": convert_heading_to_number(spans['B-HEADING']),
        "target": " ".join(spans['B-TARGET']).strip(',.'),
    }

# Example usage: run the extractor on one sample transcript and print the resulting dict
example_transcript = "target is black red and brown helicopter tool to deploy is geostationary satellites. heading is seven one zero"
print(predict(example_transcript))


{'input_ids': tensor([[  101,  4539,  2003,  2304,  2417,  1998,  2829,  7739,  6994,  2000,
         21296,  2003, 20248, 20100,  5649, 14549,  5825,  2003,  2698,  2028,
          5717,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
[('target', 'O'), ('is', 'O'), ('black', 'B-TARGET'), ('red', 'B-TARGET'), ('and', 'B-TARGET'), ('brown', 'B-TARGET'), ('helicopter', 'B-TARGET'), ('tool', 'O'), ('to', 'O'), ('deploy', 'O'), ('is', 'O'), ('geostationary', 'B-TOOL'), ('satellites', 'B-TOOL'), ('heading', 'O'), ('is', 'O'), ('seven', 'B-HEADING'), ('one', 'B-HEADING'), ('zero', 'B-HEADING')]
{'tool': 'geostationary satellites', 'heading': '710', 'target': 'black red and brown helicopter'}
