In [1]:
# import os
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import pandas as pd
import json

# Load your JSON data
# json_data = '''
# [
#     {
#         "transcript_id": "CNN-116133",
#         "statement_id": "05bb4c67-de24-4100-bc7a-bdae533e5d14",
#         "matched_terms": {
#             "kind of": {
#                 "correct": "hedge"
#             }
#         },
#         "previous_statement": "We have changed. ...",
#         "statement": "We all share your frustration. Thank you, Pat. Thank you, Lieutenant Rogers. Up next, in their own words, I`m going to talk to a guy who set up a web site that tracks the hate speech that`s flowing out of the Middle East. And later, we`ll update this terrible story coming out of Virginia. It`s the worst mass school shooting in U.S. history. I`ll talk to a criminal justice expert about whether we can ever stop these <KIND OF> killings."
#     }
# ]
# '''
with open('human_annotated_dataset.json', 'r') as f:
    json_data = f.read()

# Convert JSON to DataFrame
data = json.loads(json_data)
df = pd.DataFrame(data)

# Function to create separate rows for each marker
def create_marker_rows(row):
    statement = row['statement']
    matched_terms = row['matched_terms']
    rows = []
    for term, details in matched_terms.items():
        label = details['correct']
        context = statement.replace(f'<{term.upper()}>', f'[START] {term.upper()} [END]')
        context = context.replace('<', '').replace('>', '')
        rows.append({
            'transcript_id': row['transcript_id'],
            'statement_id': row['statement_id'],
            'context': context,
            'label': label_map[label]
        })
    return rows

# Map labels to numerical values (e.g., 'hedge' -> 0, 'authority' -> 1, 'none' -> 2)
label_map = {'hedge': 2, 'authority': 1, 'none': 0}


# Create a new DataFrame with separate rows for each marker
new_rows = []
for _, row in df.iterrows():
    new_rows.extend(create_marker_rows(row))

new_df = pd.DataFrame(new_rows)

# Check the resulting DataFrame
print(new_df[['context', 'label']])

                                                context  label
0     I'm [START] THINKING [END] now of issues that ...      0
1     We all share your frustration. Thank you, Pat....      2
2     Good morning, everyone. We'll be right back. I...      0
3     Yes, yes. I THINK she's ridiculous and be more...      0
4     Thanks for the question. That was on OBVIOUSLY...      0
...                                                 ...    ...
1156  It's very worrisome. Thanks very much. Dan Lot...      0
1157  "Nixon," a fantastic new flick and it's [START...      0
1158  Listen, I THINK that ultimately all the nation...      0
1159  All right. Jim Sciutto, thank you. In OUTFRONT...      0
1160  Spanish Media say that back in 2004, police co...      0

[1161 rows x 2 columns]


In [2]:
# Tokenize and Train the Model
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset


# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Adjust num_labels to 3

# device='cuda' if torch.cuda.is_available() else 'cpu'
device='cpu'
model.to(device)
print("device:", device)


# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['context'], padding="max_length", truncation=True, max_length=512)

# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(new_df)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Convert labels to int64
tokenized_datasets = tokenized_datasets.map(lambda examples: {'label': examples['label']}, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=1000,
    save_total_limit=2,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

  from .autonotebook import tqdm as notebook_tqdm
2024-06-04 23:41:47.469076: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-04 23:41:47.717733: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device: cpu


Map: 100%|██████████| 1161/1161 [00:01<00:00, 706.40 examples/s]
Map: 100%|██████████| 1161/1161 [00:00<00:00, 232538.41 examples/s]


Step,Training Loss
10,1.1688
20,1.1187
30,1.0992
40,1.0109
50,0.9428
60,0.9156
70,0.8605
80,0.8842
90,0.911
100,0.9187


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')