In [1]:
import os
import pandas as pd
from transformers import BertTokenizer
import spacy
from spacy.training import Example
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_tsv_dir = "dataset/dataset/train/boxes_transcripts_labels"
val_tsv_dir = "dataset/dataset/val/boxes_transcripts"
val_ann_tsv_dir = "dataset/dataset/val_w_ann/boxes_transcripts_labels"

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [4]:
def load_and_parse_tsv_files(directory):
    all_data = []
    for file in os.listdir(directory):
        if file.endswith('.tsv'):
            file_path = os.path.join(directory, file)
            df = pd.read_csv(file_path, sep=',', header=None)
            # print(df)
            df.columns = ['start_index', 'end_index', 'x_tl', 'y_tl', 'x_br', 'y_br', 'transcript', 'label']
            all_data.append(df)
    return pd.concat(all_data, ignore_index=True)

In [89]:
train_df = load_and_parse_tsv_files(train_tsv_dir)

In [90]:
print(train_df[:5])

   start_index  end_index  x_tl  y_tl  x_br  y_br  transcript  label
0           33         33   215     4   227    21           a  OTHER
1           35         44   235     3   308    21  Employee's  OTHER
2           46         51   311     3   349    20      social  OTHER
3           53         60   352     3   401    20    security  OTHER
4           62         67   404     3   457    21      number  OTHER


In [91]:
val_df = load_and_parse_tsv_files(val_ann_tsv_dir)

In [79]:
def convert_to_hf_format(df):
    df['transcript'] = df['transcript'].astype(str)
    texts = df['transcript'].tolist()
    labels = df['label'].tolist()
    
    encodings = tokenizer(texts, is_split_into_words=False, padding=True, truncation=True, return_tensors='pt')
    
    # Map string labels to integer indices
    label_map = {label: idx for idx, label in enumerate(set(labels))}
    label_indices = [label_map[label] for label in labels]
    
    # Ensure label_indices is a list of lists (one per sequence)
    # Here we assume each entry in labels corresponds to a single sequence of labels
    # Adjust according to how labels are structured
    max_length = encodings['input_ids'].shape[1]
    padded_labels = [label_indices[i] + [-100] * (max_length - 1) for i in range(len(label_indices))]
    
    encodings['labels'] = torch.tensor(padded_labels)
    return encodings


# from sklearn.preprocessing import LabelEncoder

# def convert_to_hf_format(df):
#     # Convert transcript to strings
#     df['transcript'] = df['transcript'].astype(str)
#     texts = df['transcript'].tolist()
    
#     # Convert string labels to integer IDs
#     label_encoder = LabelEncoder()
#     df['label'] = label_encoder.fit_transform(df['label'])
#     labels = df['label'].tolist()
    
#     # Tokenize texts
#     encodings = tokenizer(texts, is_split_into_words=True, padding=True, truncation=True, return_tensors='pt')
    
#     # Convert labels to tensor
#     encodings['labels'] = torch.tensor(labels, dtype=torch.long)
    
#     # Print shapes for debugging
#     print("Input IDs shape:", encodings['input_ids'].shape)
#     print("Attention Mask shape:", encodings['attention_mask'].shape)
#     print("Labels shape:", encodings['labels'].shape)
    
#     return encodings

In [55]:
# def convert_to_hf_format(df):
#     df['transcript'] = df['transcript'].astype(str)
#     texts = df['transcript'].tolist()
#     labels = df['label'].tolist()
#     encodings = tokenizer(texts, is_split_into_words=True, padding=True, truncation=True)
#     encodings['labels'] = labels
#     return encodings

In [80]:
train_encodings = convert_to_hf_format(train_df)
val_encodings = convert_to_hf_format(val_df)

TypeError: unsupported operand type(s) for +: 'int' and 'list'

In [58]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __getitem__(self, idx):
        # Return a dictionary of tensors for each batch
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Ensure 'labels' key is included and correctly formatted
        if 'labels' in item:
            item['labels'] = torch.tensor(item['labels'])
        return item
    
    def __len__(self):
        return len(self.encodings['input_ids'])


In [59]:
train_dataset = CustomDataset(train_encodings)
val_dataset = CustomDataset(val_encodings)

In [60]:
print(train_dataset[0])

{'input_ids': tensor([  101,   170, 18653,  1643, 26179,  3051,   112,   188,  1934,  2699,
         1295, 19770,   117,   138, 19515,  4084,  1566,   117,   159, 26868,
         1204,  1103,   146,  8900,  9059, 13068, 25434,   118,  5692,   118,
        22173,  1477,   152, 20660,  1302,   119, 17733,  1571,   118,  1288,
         1604,  6820,  9272,   106, 11696,   107, 14516,   199,  4956,  1120,
         7001,   119,   178,  1733,   119,  1301,  1964,   120,   174,  8702,
         1513,   119,   171, 18653,  1643, 26179,  1200,  9117,  1295,   113,
          142, 11607,   114,   160, 12062,   117, 10538,   117,  1168,  9806,
         3467,  2467,  3641,  1114, 17674,  3413,   118,  3236,  1580, 23124,
         1580,  1475,  3746,  1545,  1545,  1580,   119,  5004, 11084,  1604,
         1477,   119,  5429,   172, 18653,  1643, 26179,  1200,   112,   188,
         1271,   117,  4134,   117,  1105, 26248,  3463,  3563,  2699, 13588,
         3563,  2699,  3641,  1114, 17674,  6266, 

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(item['labels'])


In [61]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments

In [62]:
num_labels = len(set(train_df['label']))

In [71]:
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [75]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)



In [76]:
trainer.train()

  0%|          | 0/3 [01:47<?, ?it/s]
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(item['labels'])


ValueError: Expected input batch_size (512) to match target batch_size (1).

In [107]:
def convert_to_spacy_format(df):
    texts = df['transcript'].tolist()
    entities = [(row['start_index'], row['end_index'], row['label']) for _, row in df.iterrows()]
    # print(texts)
    return list(zip(texts, [{'entities': entities}]))

In [108]:
spacy_train_data = convert_to_spacy_format(train_df)
spacy_val_data = convert_to_spacy_format(val_df)

In [122]:
print(train_df[:2])

   start_index  end_index  x_tl  y_tl  x_br  y_br  transcript  label
0           33         33   215     4   227    21           a  OTHER
1           35         44   235     3   308    21  Employee's  OTHER
