In [5]:
import pandas as pd
import json

# Read the annotated dataset. JSON text is UTF-8 by specification, so the
# encoding is pinned here instead of inheriting the platform default, which is
# not UTF-8 on every system and can mis-decode (or fail on) non-ASCII text.
with open('human_annotated_dataset.json', 'r', encoding='utf-8') as f:
    json_data = f.read()

# Parse the JSON text and build a DataFrame from the parsed object
data = json.loads(json_data)
df = pd.DataFrame(data)

# Integer class id for each annotation label:
# 'none' -> 0, 'authority' -> 1, 'hedge' -> 2
label_map = {'hedge': 2, 'authority': 1, 'none': 0}


# Expand one annotated statement into one record per matched term
def create_marker_rows(row):
    """Build one record for every entry in ``row['matched_terms']``.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            'statement', 'matched_terms', 'transcript_id' and 'statement_id'.
            'matched_terms' maps each term to a details mapping whose
            'correct' entry is a key of ``label_map``.

    Returns:
        A list of dicts with the keys 'transcript_id', 'statement_id',
        'context' and 'label'. In 'context' the ``<TERM>`` tag of the current
        term (upper-cased) is rewritten to ``[START] TERM [END]`` and every
        other angle bracket is removed. 'label' is the integer looked up in
        ``label_map``.

    Raises:
        KeyError: If a required field is missing or the annotated label is
            not a key of ``label_map``.
    """
    statement = row['statement']
    matched_terms = row['matched_terms']

    def highlight(term):
        tag = term.upper()
        marked = statement.replace(f'<{tag}>', f'[START] {tag} [END]')
        # Strip all remaining angle brackets so the other tagged terms in the
        # statement read as plain text
        return marked.replace('<', '').replace('>', '')

    return [
        {
            'transcript_id': row['transcript_id'],
            'statement_id': row['statement_id'],
            'context': highlight(term),
            'label': label_map[details['correct']],
        }
        for term, details in matched_terms.items()
    ]


# Flatten the per-statement record lists into a single table, one row per marker
new_rows = [
    marker_row
    for _, row in df.iterrows()
    for marker_row in create_marker_rows(row)
]
new_df = pd.DataFrame(new_rows)

# Preview the text fed to the model next to its numeric label
print(new_df[['context', 'label']])

                                                context  label
0     I'm [START] THINKING [END] now of issues that ...      0
1     We all share your frustration. Thank you, Pat....      2
2     Good morning, everyone. We'll be right back. I...      0
3     Yes, yes. I THINK she's ridiculous and be more...      0
4     Thanks for the question. That was on OBVIOUSLY...      0
...                                                 ...    ...
1156  It's very worrisome. Thanks very much. Dan Lot...      0
1157  "Nixon," a fantastic new flick and it's [START...      0
1158  Listen, I THINK that ultimately all the nation...      0
1159  All right. Jim Sciutto, thank you. In OUTFRONT...      0
1160  Spanish Media say that back in 2004, police co...      0

[1161 rows x 2 columns]


In [7]:
# Tokenize and Train the Model
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Use the GPU when torch can see one, otherwise fall back to the CPU
# (to force CPU execution, set device = 'cpu' here instead)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Pretrained BERT tokenizer plus a sequence-classification model with a
# three-way output head
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)
model.to(device)

print("device:", device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device: cuda


In [11]:
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split

# The evaluation metrics are loaded further down in this cell, right before
# compute_metrics. Loading them here as well was dead work: each of the four
# metric names was reassigned there before it was ever used.

# Tokenizer callback applied to the datasets via Dataset.map
def tokenize_function(examples):
    """Tokenize the 'context' field of ``examples`` with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 512 tokens. The return
    value is whatever the tokenizer call returns.
    """
    fixed_length = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(examples['context'], **fixed_length)

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
# NOTE(review): create_marker_rows emits one row per matched term, so a
# statement with several terms yields several near-identical contexts that
# this row-level split can place on both sides — consider splitting by
# statement_id if that matters for the reported scores.
train_df, eval_df = train_test_split(new_df, test_size=0.2, random_state=42)

# Wrap both frames as Dataset objects, then tokenize them in batches
train_dataset, eval_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, eval_df)
)
train_tokenized, eval_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

import numpy as np
from datasets import load_metric

# Metric objects consumed by compute_metrics
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    load_metric(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

# Metric callback handed to the Trainer
def compute_metrics(p):
    """Compute accuracy, macro precision/recall/F1 and per-class F1.

    Args:
        p: A ``(scores, labels)`` pair. ``scores`` is arg-maxed over axis 1 to
            obtain the predicted class ids; ``labels`` holds the reference
            class ids.

    Returns:
        dict with 'accuracy', 'precision', 'recall', 'f1' (macro-averaged)
        and 'f1_label_0', 'f1_label_1', 'f1_label_2'.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    # Class ids the model is trained on; must match the f1_label_* keys below
    class_ids = [0, 1, 2]

    # Overall accuracy and macro-averaged scores
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="macro")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="macro")
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")

    # Per-class F1. The label set is pinned explicitly: without it the result
    # only covers classes that occur in this batch of predictions/references,
    # so a missing class would either raise IndexError below or silently shift
    # the remaining scores under the wrong f1_label_* key.
    f1_per_class = f1_metric.compute(
        predictions=predictions,
        references=labels,
        average=None,
        labels=class_ids,
    )
    label_f1_scores = f1_per_class['f1']

    return {
        'accuracy': accuracy['accuracy'],
        'precision': precision['precision'],
        'recall': recall['recall'],
        'f1': f1['f1'],
        'f1_label_0': label_f1_scores[0],
        'f1_label_1': label_f1_scores[1],
        'f1_label_2': label_f1_scores[2]
    }

# Hyperparameters and bookkeeping settings for the fine-tuning run
training_args = TrainingArguments(
    # optimisation
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    # evaluation cadence
    evaluation_strategy="epoch",
    # logging
    logging_dir='./logs',
    logging_steps=10,
    # checkpoints
    output_dir='./results',
    save_steps=1000,
    save_total_limit=2,
)

# Wire the model, both tokenized splits and the metric callback together
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=eval_tokenized,
    train_dataset=train_tokenized,
    args=training_args,
    model=model,
)

# Fine-tune, then score the final weights on the evaluation split
trainer.train()
eval_results = trainer.evaluate()

import datetime

# Hyperparameters that become part of the output directory names
num_epochs = training_args.num_train_epochs
batch_size = training_args.per_device_train_batch_size

# Timestamp component, so that separate runs get distinct directory names
current_time = f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}"

# Separate target directories for the model weights and the tokenizer files
model_dir = f'./models/model_epochs-{num_epochs}_batch-{batch_size}_{current_time}'
tokenizer_dir = f'./models/tokenizer_epochs-{num_epochs}_batch-{batch_size}_{current_time}'

# Persist both artefacts
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

# Report where everything went, followed by the final evaluation metrics
print(f"Model saved in directory: {model_dir}")
print(f"Tokenizer saved in directory: {tokenizer_dir}")
print("Evaluation results:", eval_results)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Map: 100%|██████████| 928/928 [00:00<00:00, 945.23 examples/s]
Map: 100%|██████████| 233/233 [00:00<00:00, 999.89 examples/s] 
You can avoid this message in future by passing the argument `trust_r

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Label 0,F1 Label 1,F1 Label 2
1,0.1829,0.612327,0.866953,0.755996,0.820656,0.778503,0.904215,0.555556,0.87574
2,0.002,0.64547,0.88412,0.826121,0.810748,0.818102,0.910448,0.666667,0.877193
3,0.2223,0.699901,0.862661,0.752305,0.794107,0.767005,0.898876,0.529412,0.872727
4,0.4263,0.854361,0.854077,0.729109,0.723815,0.725157,0.890511,0.413793,0.871166
5,0.0595,0.840208,0.866953,0.790317,0.819287,0.802491,0.88806,0.645161,0.874251
6,0.1599,0.692714,0.905579,0.815327,0.869995,0.837357,0.927757,0.666667,0.917647


Model saved in directory: ./model_epochs-6_batch-8_2024-06-05_00-15-10
Tokenizer saved in directory: ./tokenizer_epochs-6_batch-8_2024-06-05_00-15-10
Evaluation results: {'eval_loss': 0.6927139163017273, 'eval_accuracy': 0.9055793991416309, 'eval_precision': 0.8153267784846733, 'eval_recall': 0.8699947543276796, 'eval_f1': 0.8373567931608639, 'eval_f1_label_0': 0.9277566539923955, 'eval_f1_label_1': 0.6666666666666666, 'eval_f1_label_2': 0.9176470588235294, 'eval_runtime': 2.9543, 'eval_samples_per_second': 78.867, 'eval_steps_per_second': 10.155, 'epoch': 6.0}


In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Derive every input of the plot inside this cell so it works on a fresh
# kernel run: true_labels, predictions, unique_labels and reversed_label_map
# were previously not defined anywhere in the visible notebook.
eval_output = trainer.predict(eval_tokenized)
predictions = np.argmax(eval_output.predictions, axis=1)
true_labels = eval_output.label_ids

# Invert label_map (name -> id) to id -> name for the axis tick labels
reversed_label_map = {class_id: name for name, class_id in label_map.items()}
unique_labels = sorted(reversed_label_map)

# Compute the confusion matrix over the full label set, in class-id order
cm = confusion_matrix(true_labels, predictions, labels=unique_labels)
display_labels = [reversed_label_map[label] for label in unique_labels]

# Display the confusion matrix
fig, ax = plt.subplots(figsize=(10, 10))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
disp.plot(values_format='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
plt.show()