In [1]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

# Replace these with the actual directory paths where you saved your model and tokenizer
model_directory = 'data/models/weighted_model_nonaug_epochs-6_batch-8_2024-06-05_23-27-25'
tokenizer_directory = 'data/models/weighted_tokenizer_nonaug_epochs-6_batch-8_2024-06-05_23-27-25'

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained(tokenizer_directory)
model = BertForTokenClassification.from_pretrained(model_directory)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import torch.nn.functional as F

def classify_tokens_with_average(statement, previous_statement, model=model, tokenizer=tokenizer, label_map={0: "none", 1: "authority", 2: "hedge"}):
    statement_start_text = "Speaker 1: " + previous_statement + " Speaker 2: "
    statement_tokens = len(tokenizer.tokenize(statement))
    combined_context = statement_start_text + statement
    combined_context = combined_context.replace("<", "").replace(">", "")
    statement_start = len(tokenizer.tokenize(statement_start_text))  # Start index of the statement tokens

    # Tokenize the input text
    inputs = tokenizer(combined_context, return_tensors="pt", truncation=True, max_length=512, padding="max_length")
    pad_token_id = tokenizer.pad_token_id

    # Move tensors to the same device as model
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)

    # Predict
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    # Get the predictions and compute the softmax to obtain the probabilities
    logits = outputs.logits
    probabilities = F.softmax(logits, dim=-1)

    # Extract relevant tokens and probabilities for the 'statement' only
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    statement_probs = probabilities[0][statement_start:statement_start + statement_tokens]

    label_sums = [0.0] * len(label_map)
    label_counts = [0] * len(label_map)
    token_softmax = []

    for idx, prob in enumerate(statement_probs):
        token_id = input_ids[0][statement_start + idx]
        if token_id != pad_token_id:
            token = tokens[statement_start + idx]
            token_prob = {label_map[label_id]: round(prob[label_id].item(), 2) for label_id in range(len(label_map))}
            max_label = label_map[prob.argmax().item()]
            token_softmax.append((f" '{token}' : '{max_label}'"))

            for label_id in range(len(label_map)):
                label_sums[label_id] += prob[label_id].item()
                label_counts[label_id] += 1

    average_scores = {label_map[label_id]: (label_sums[label_id] / label_counts[label_id] if label_counts[label_id] > 0 else 0) 
                      for label_id in range(len(label_map))}

    return statement_tokens, average_scores, token_softmax



# statement= "I'm <THINKING> now of issues that states have dealt with, like eminent domain, that the federal government hasn't weighed in on."
# previous_statement = "And so you'll see states look to Massachusetts, which this year approved a bill that will require health insurance for all of its residents. That will come online next year, and other states will be watching to see if that's something they want to imitate. You know, the Democrats in Congress are talking about moving a minimum wage bill. We already have half the states with minimum wage levels higher than the federal law because they've gone ahead and acted. Well, the federal minimum wage hasn't changed since 1997."
# statement_tokens, average_scores, token_softmax = classify_tokens_with_average(statement, previous_statement, model, tokenizer, label_map)

# print("Average scores for the statement:")
# for label, score in average_scores.items():
#     print(f"{label}: {score:.4f}")
    
# for item in token_softmax:
#     token, probs, label = item
#     print(f"{token} {label}: {probs}")

In [3]:
from util import get_json_response_gemini
from prompts import speaker_prompt

import json

with open('data/news_dialogue_sample.json') as f:
    json_transcripts = json.load(f)

In [37]:
speaker_prompt_alt1 = """You are a transcript annotator, and your job is to clarify who """
speaker_prompt_alt2 = """is in the following dialogue:

-----BEGIN TRANSCRIPT-----
"""

speaker_prompt_alt3 = """
-----END TRANSCRIPT-----

For the given name, your job is to return a JSON with consistent details so that multiple statements can be matched to the same speaker. The JSON should be a dictionary in the following key and values:

* key: """

speaker_prompt_alt4 = """
* values: name, occupation
   * name: The full name of the speaker in the format "LASTNAME, FIRSTNAME." If only first or last name is available, use that. If the name is undecipherable or can't be derived, use "UNKNOWN"
   * occupation: The speaker's occupation, if known

Your options for occupation are:
* Unknown
* News Media
* American Politician - Republican
* American Politician - Democrat
* Government Official
* Military
* Other
Do not make wild guesses about the speaker's occupation.
"""
name = "TEST"
transcript = "TRANSCRIPT"
text = speaker_prompt_alt1 + name + speaker_prompt_alt2 + transcript + speaker_prompt_alt3 + name + speaker_prompt_alt4
print(text)

You are a transcript annotator, and your job is to clarify who TESTis in the following dialogue:

-----BEGIN TRANSCRIPT-----
TRANSCRIPT
-----END TRANSCRIPT-----

For the given name, your job is to return a JSON with consistent details so that multiple statements can be matched to the same speaker. The JSON should be a dictionary in the following key and values:

* key: TEST
* values: name, occupation
   * name: The full name of the speaker in the format "LASTNAME, FIRSTNAME." If only first or last name is available, use that. If the name is undecipherable or can't be derived, use "UNKNOWN"
   * occupation: The speaker's occupation, if known

Your options for occupation are:
* Unknown
* News Media
* American Politician - Republican
* American Politician - Democrat
* Government Official
* Military
* Other
Do not make wild guesses about the speaker's occupation.



In [45]:
from prompts import speaker_prompt_alt
# Process the transcript with the new function

def process_transcript(transcript, model_name='gemini-1.5-pro', model_inference_name = "unknown", system_prompt=speaker_prompt, verbose=False):
    utt = transcript['utt']
    speakers = transcript['speaker']

    gemini_input = ""
    for i in range(len(utt)):
        speaker = speakers[i]
        gemini_input += f"<SPEAKER: {speaker}>: {utt[i]}\n"

    gemini_output = get_json_response_gemini(
        gemini_input,
        model_name=model_name,
        system_prompt=system_prompt,
        verbose=verbose)
    
    transcript['gemini_output'] = gemini_output

    speaker_list = []
    
    for i in range(len(utt)):
        speaker = speakers[i]
        try:
            new_speaker = gemini_output[speaker]
        except:
            print(f"Speaker {speaker} not found in Gemini output, trying alternate prompt.")
            # print("Alt Input:", alt_string[:150])
            print("speaker:", speaker)
            # print("gemini_input:", gemini_input[:150])
            print("gemini_input:", gemini_input)
            
            new_input = speaker_prompt_alt1 + speaker + speaker_prompt_alt2 + gemini_input + speaker_prompt_alt3 + speaker + speaker_prompt_alt4
            try:
                gemini_alt = get_json_response_gemini(
                    new_input,
                    model_name=model_name,
                    system_prompt=new_input,
                    verbose=verbose, custom=True)
                print("Gemini alt:", json.dumps(gemini_alt, indent=2))
                transcript['gemini_output'].update(gemini_alt)
                print("transcript['gemini_output']: ", json.dumps(transcript['gemini_output'], indent=2))
                new_speaker = gemini_alt[speaker]
                print("Successful Gemini Alternate Found")
            except Exception as e:
                print(f"Speaker {speaker} not found in alternate Gemini output, assigning key, value: ")
                gemini_alt = {speaker: {"name": "UNKNOWN", "occupation": "Unknown"}}
                transcript['gemini_output'].update(gemini_alt)
                new_speaker = gemini_alt[speaker]
        speaker_list.append(new_speaker)
    transcript['speakers_formatted'] = speaker_list
    
    utterance_grouping = {}
    occupation_summary = {} # This is to store details about each indiviidual occupation
    speaker_summary = {} # This is to store details about each indiviidual speaker
    
    for speaker, details in gemini_output.items():
        name = details['name']
        occupation = details['occupation']

        if name not in speaker_summary:
            speaker_summary[name] = {
                "statements": 0,
                "total_tokens": 0,
                "weights": {"none": 0, "hedge": 0, "authority": 0},
                "num_statements": {"none": 0, "hedge": 0, "authority": 0}
            }

        if occupation not in occupation_summary:
            occupation_summary[occupation] = {
                "statements": 0,
                "total_tokens": 0,
                "weights": {"none": 0, "hedge": 0, "authority": 0},
                "num_statements": {"none": 0, "hedge": 0, "authority": 0}
            }
    
    # print("Initializing Speaker Summary:")
    # print(json.dumps(speaker_summary, indent=4))
    # print("Initializing Occupation Summary:")
    # print(json.dumps(occupation_summary, indent=4))

    for i in range(len(utt)):
        current_statement = utt[i]
        if i == 0:
            previous_statement = "None"
        else:
            previous_statement = utt[i-1]
        
        statement_tokens, average_scores, token_softmax = classify_tokens_with_average(current_statement, previous_statement)
        statement_tokens = statement_tokens
        speaker_name = speaker_list[i]["name"]
        speaker_occupation = speaker_list[i]["occupation"]
        utt_category = max(average_scores, key=average_scores.get)
        utterance_details = {
            "summary": {
                "labeling_model": model_inference_name, 
                "statement": current_statement,
                "tokens": statement_tokens,
                "category": utt_category,
                "speaker_name": speaker_name,
                "speaker_occupation": speaker_occupation,
                "statement_tokens": statement_tokens,
                "label_scores": average_scores
            },
            "token_softmax_str": token_softmax
        }
        utterance_grouping[f"utterance_{i}"] = utterance_details

        occupation_summary[speaker_occupation]["statements"] += 1
        occupation_summary[speaker_occupation]["total_tokens"] += statement_tokens
        for label in occupation_summary[speaker_occupation]["weights"]:
            occupation_summary[speaker_occupation]["weights"][label] += average_scores[label] * statement_tokens
        occupation_summary[speaker_occupation]["num_statements"][utt_category] += 1
        
        speaker_summary[name]["statements"] += 1
        speaker_summary[name]["total_tokens"] += statement_tokens
        for label in speaker_summary[speaker_name]["weights"]:
            speaker_summary[speaker_name]["weights"][label] += average_scores[label] * statement_tokens
        speaker_summary[speaker_name]["num_statements"][utt_category] += 1

    for occupation, summary in occupation_summary.items():
        summary["avg_scores"] = {}
        for label in summary["weights"]:
            if summary["total_tokens"] == 0:
                summary["avg_scores"][label] = 0
            else:
                summary["avg_scores"][label] = summary["weights"][label] / summary["total_tokens"]

    for name, summary in speaker_summary.items():
        summary["avg_scores"] = {}
        for label in summary["weights"]:
            if summary["total_tokens"] == 0:
                summary["avg_scores"][label] = 0
            else:
                summary["avg_scores"][label] = summary["weights"][label] / summary["total_tokens"]
    
    transcript['detailed_output'] = utterance_grouping
    transcript['occupation_summary'] = occupation_summary
    transcript['speaker_summary'] = speaker_summary

    return transcript

def compact_token_softmax(token_softmax):
    return [f"{token}: {scores}, max_label: {max_label}" for token, scores, max_label in token_softmax]

In [None]:
import random

# Load the transcripts
with open('data/news_dialogue_sample.json') as f:
    json_transcripts = json.load(f)
    
    # Specify the number of samples you want to draw
num_samples = 10  # Change this to the desired number of samples

# Randomly sample from the transcripts
sampled_transcripts = random.sample(json_transcripts, num_samples)

In [46]:
# Process each transcript and compact the token_softmax
scored_transcripts = []

for transcript in sampled_transcripts:  # Assuming you only want to process the first transcript for now
    transcript_out = process_transcript(transcript, model_inference_name="Standard Loss")
    print("Transcript processed")
    
    # for key, utterance_details in transcript_out['detailed_output'].items():
    #     utterance_details['token_softmax_str'] = compact_token_softmax(utterance_details['token_softmax'])
    
    scored_transcripts.append(transcript_out)

# Save the processed transcripts to a new JSON file
with open('data/news_dialogue_sample_scored.json', 'w') as f:
    json.dump(scored_transcripts, f, indent=2)

# print(json.dumps(scored_transcripts, indent=2))

Speaker COOPER not found in Gemini output, trying alternate prompt.
speaker: COOPER
gemini_input: <SPEAKER: WHITFIELD>: Here's what's happening right "Now In The News." Two missing boys enjoy their first full day back with their families. One boy had been missing since Monday, the other since October of 2002. They were rescued from a suburban St. Louis apartment yesterday. A suspect is in custody. A winter storm crippling the nation's mid-section -- snow and freezing rain, icy roads, power outages, all of that. The Associated Press reports at least six deaths on ice-covered roads in Oklahoma and Missouri, all the result of traffic accidents. Air travel is also seriously affected and more icy weather is expected. U.S. Secretary of State Condoleezza Rice is in Jerusalem. It's the first stop on a Middle East tour to push the Israeli-Palestinian peace process, and to persuade governments in the region to support President Bush's plan to send more U.S. troops to Iraq. President Bush has bee

ValueError: The `response.text` quick accessor only works when the response contains a valid `Part`, but none was returned. Check the `candidate.safety_ratings` to see if the response was blocked.

In [None]:
import json

def summary_values(transcripts, summary = "occupation"):
    
    summary = f"{summary}_summary"
    # Initialize a master summary dictionary
    master_summary = {}

    for transcript in transcripts:
        summary_details = transcript[summary]
        
        for summary_title, summary in summary_details.items():
            if summary_title not in master_summary:
                master_summary[summary_title] = {
                    "statements": 0,
                    "total_tokens": 0,
                    "weights": {"none": 0, "hedge": 0, "authority": 0},
                    "num_statements": {"none": 0, "hedge": 0, "authority": 0}
                }

            master_summary[summary_title]["statements"] += summary["statements"]
            master_summary[summary_title]["total_tokens"] += summary["total_tokens"]

            for label in master_summary[summary_title]["weights"]:
                master_summary[summary_title]["weights"][label] += summary["weights"][label]
                master_summary[summary_title]["num_statements"][label] += summary["num_statements"][label]

    # Calculate the average scores
    for summary_title, summary in master_summary.items():
        summary["avg_scores"] = {}
        for label in summary["weights"]:
            if summary["total_tokens"] > 0:
                summary["avg_scores"][label] = summary["weights"][label] / summary["total_tokens"]
            else:
                summary["avg_scores"][label] = 0

    return master_summary

# Load the transcripts
with open('data/news_dialogue_sample_scored.json') as f:
    scored_transcripts = json.load(f)

# Sum values across all transcripts
occupation_summary = summary_values(scored_transcripts, summary = "occupation")
speaker_summary = summary_values(scored_transcripts, summary = "speaker")

# Print the final occupation summary
print(json.dumps(occupation_summary, indent=2))

{
  "News Media": {
    "statements": 141,
    "total_tokens": 9173,
    "weights": {
      "none": 2937.9860741496086,
      "hedge": 2723.966061577201,
      "authority": 3511.047871917486
    },
    "num_statements": {
      "none": 17,
      "hedge": 6,
      "authority": 118
    },
    "avg_scores": {
      "none": 0.3202862830207793,
      "hedge": 0.2969547652433447,
      "authority": 0.3827589525692234
    }
  },
  "Unknown": {
    "statements": 15,
    "total_tokens": 453,
    "weights": {
      "none": 145.53834465146065,
      "hedge": 131.20535695552826,
      "authority": 176.25629745423794
    },
    "num_statements": {
      "none": 5,
      "hedge": 1,
      "authority": 9
    },
    "avg_scores": {
      "none": 0.3212766990098469,
      "hedge": 0.28963654957070256,
      "authority": 0.38908674934710363
    }
  },
  "Other": {
    "statements": 7,
    "total_tokens": 268,
    "weights": {
      "none": 79.54381296038628,
      "hedge": 80.22241213917732,
      "auth

In [None]:
test_transcript = json_transcripts[1]
print("Original Transcript:")
print(json.dumps(test_transcript['utt'], indent=2))

Original Transcript:
[
  "Happening now, answering ISIS threats. The White House warns the terror group, \"If you come after Americans, we will come after you.\" But is the U.S. ready to take that fight beyond Iraq and into Syria? And ISIS warns America, we thirst for your blood. New details on the chilling e-mail sent to the family of a beheaded American. And I'll also speak with the lawmaker who represents the family of another ISIS hostage. And a new controversy in Ferguson. A St. Louis County Police officer who was involved in crowd control is relieved of duty after inflammatory statements. Wolf Blitzer is off tonight. I'm Brianna Keilar. You're in",
  "Is the U.S. moving toward an all-out war with ISIS? The White House is making it clear that the brutal beheading of an American hostage will not go unanswered.",
  "If you come after Americans, we're going to come after you wherever you are. And that's going to guide our planning in the days to come.",
  "That comes a day after a st

In [None]:
# transcript_out = process_transcript(test_transcript)

In [None]:
# print("\n\nProcessed Transcript:")
# print(json.dumps(transcript_out['detailed_output'], indent=2))

In [None]:
scored_transcripts = []

transcripts = json_transcripts[:1]

for transcript in transcripts:
    transcript_out = process_transcript(transcript)
    # print(json.dumps(transcript_out, indent=2))
    for u in transcript_out['detailed_output']:
        u['token_softmax'] = compact_token_softmax(u['token_softmax'])        
    for key, utterance_details in transcript_out['detailed_output'].items():
        utterance_details['token_softmax'] = compact_token_softmax(utterance_details['token_softmax'])
    scored_transcripts.append(transcript_out)

with open('data/news_dialogue_sample_scored.json', 'w') as f:
    json.dump(scored_transcripts, f, indent=2)

Received response from model.

Categorizing Utterances:


TypeError: string indices must be integers

In [None]:
for transcript in scored_transcripts:
    for k, v in transcript["detailed_output"].items():
        print(json.dumps(v["summary"], indent=2))

{
  "statement": "You're watching CNN. This is CONNECT THE WORLD. I'm Becky Anderson. Welcome back, it's just half past seven here in the UAE. They call it the forgotten war but not for those living the bloodshed every single day. And it is important that you know about it. I'm talking about the conflict in Yemen, where today the Saudi-led coalition carried out several air strikes in what is the Houthi controlled capital of Sanaa. Now Saudi state-run news says forces hit a number of what it calls legitimate military targets. That's after Houthi militia claimed responsibility for the drone attack that targeted two oil pumping stations in Saudi Arabia. It's messy times. Early I spoke to Martin Griffiths who's the U.N. Special Counsel Envoy for Yemen to the Security Council and I got his take on how he sees the situation on the ground right now.",
  "category": "authority",
  "speaker_name": "ANDERSON, BECKY",
  "speaker_occupation": "News Media",
  "statement_tokens": 185,
  "label_score

In [None]:
for item in scored_transcripts:
    print(json.dumps(item["detailed_output"]["summary"], indent=2))
#     print(json.dumps(item['detailed_output']['summary'], indent=2))

TypeError: list indices must be integers or slices, not str

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained model
original_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Initialize the model and ensure it and its weights are on the correct device
model_with_loss = CustomModel(original_model.to(device), class_weights_tensor.to(device))

from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
)

# Initialize Trainer with the custom model
trainer = Trainer(
    model=model_with_loss,  # Ensure this is your custom model accepting weights
    args=training_args,
    train_dataset=train_tokenized,  # Ensure datasets are correctly tokenized
    eval_dataset=eval_tokenized,
    compute_metrics=compute_metrics,  # Custom metrics function if needed
    callbacks=[CustomSaveCallback('./weighted_checkpoints', training_args.per_device_train_batch_size, tokenizer)]
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

import datetime
# Get current datetime to use as a unique identifier
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Define model parameters to include in the filename
num_epochs = training_args.num_train_epochs
batch_size = training_args.per_device_train_batch_size

# Define the directory using the timestamp and model parameters
model_dir = f'./models/weighted_model_epochs-{num_epochs}_batch-{batch_size}_{current_time}'
tokenizer_dir = f'./models/weighted_tokenizer_epochs-{num_epochs}_batch-{batch_size}_{current_time}'

# Save the model and tokenizer with detailed names
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

print(f"Model saved in directory: {model_dir}")
print(f"Tokenizer saved in directory: {tokenizer_dir}")

# Print evaluation results
print("Evaluation results:", eval_results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Label 0,F1 Label 1,F1 Label 2
1,No log,1.006237,0.578544,0.438279,0.470243,0.389573,0.642857,0.0,0.525862
2,No log,0.79858,0.777778,0.491021,0.547693,0.508683,0.847262,0.0,0.678788
3,No log,0.774772,0.900383,0.944083,0.65217,0.705556,0.933333,0.333333,0.85
4,0.883900,0.678693,0.91954,0.810561,0.813295,0.811909,0.948509,0.6,0.887218
5,0.883900,0.853811,0.91954,0.790169,0.772017,0.780339,0.952128,0.5,0.888889
6,0.883900,0.660894,0.934866,0.867939,0.827,0.844624,0.956757,0.666667,0.910448


Saved model and tokenizer to ./weighted_checkpoints/model_checkpoint_epoch-1.0_batch-8


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved model and tokenizer to ./weighted_checkpoints/model_checkpoint_epoch-2.0_batch-8


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved model and tokenizer to ./weighted_checkpoints/model_checkpoint_epoch-3.0_batch-8
Saved model and tokenizer to ./weighted_checkpoints/model_checkpoint_epoch-4.0_batch-8
Saved model and tokenizer to ./weighted_checkpoints/model_checkpoint_epoch-5.0_batch-8
Saved model and tokenizer to ./weighted_checkpoints/model_checkpoint_epoch-6.0_batch-8


Evaluation results: {'eval_loss': 0.6608937978744507, 'eval_accuracy': 0.9348659003831418, 'eval_precision': 0.8679385267620562, 'eval_recall': 0.826999726999727, 'eval_f1': 0.8446237282058178, 'eval_f1_label_0': 0.9567567567567568, 'eval_f1_label_1': 0.6666666666666666, 'eval_f1_label_2': 0.9104477611940298, 'eval_runtime': 2.1763, 'eval_samples_per_second': 119.93, 'eval_steps_per_second': 15.164, 'epoch': 6.0}
Model saved in directory: ./models/weighted_model_epochs-6_batch-8_2024-06-05_13-57-48
Tokenizer saved in directory: ./models/weighted_tokenizer_epochs-6_batch-8_2024-06-05_13-57-48
Evaluation results: {'eval_loss': 0.6608937978744507, 'eval_accuracy': 0.9348659003831418, 'eval_precision': 0.8679385267620562, 'eval_recall': 0.826999726999727, 'eval_f1': 0.8446237282058178, 'eval_f1_label_0': 0.9567567567567568, 'eval_f1_label_1': 0.6666666666666666, 'eval_f1_label_2': 0.9104477611940298, 'eval_runtime': 2.1763, 'eval_samples_per_second': 119.93, 'eval_steps_per_second': 15.164

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained model
original_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Initialize the model and ensure it and its weights are on the correct device
model_with_loss = CustomModel(original_model.to(device), class_weights_tensor.to(device))

from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
)

# Initialize Trainer with the custom model
trainer = Trainer(
    model=model_with_loss,  # Ensure this is your custom model accepting weights
    args=training_args,
    train_dataset=train_tokenized,  # Ensure datasets are correctly tokenized
    eval_dataset=eval_tokenized,
    compute_metrics=compute_metrics,  # Custom metrics function if needed
    callbacks=[CustomSaveCallback('./weighted_checkpoints', training_args.per_device_train_batch_size, tokenizer)]
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

import datetime
# Get current datetime to use as a unique identifier
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Define model parameters to include in the filename
num_epochs = training_args.num_train_epochs
batch_size = training_args.per_device_train_batch_size

# Define the directory using the timestamp and model parameters
model_dir = f'./models/weighted_model_epochs-{num_epochs}_batch-{batch_size}_{current_time}'
tokenizer_dir = f'./models/weighted_tokenizer_epochs-{num_epochs}_batch-{batch_size}_{current_time}'

# Save the model and tokenizer with detailed names
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

print(f"Model saved in directory: {model_dir}")
print(f"Tokenizer saved in directory: {tokenizer_dir}")

# Print evaluation results
print("Evaluation results:", eval_results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Label 0,F1 Label 1,F1 Label 2
1,No log,1.006237,0.578544,0.438279,0.470243,0.389573,0.642857,0.0,0.525862
2,No log,0.79858,0.777778,0.491021,0.547693,0.508683,0.847262,0.0,0.678788
3,No log,0.774772,0.900383,0.944083,0.65217,0.705556,0.933333,0.333333,0.85
4,0.883900,0.678693,0.91954,0.810561,0.813295,0.811909,0.948509,0.6,0.887218
5,0.883900,0.853811,0.91954,0.790169,0.772017,0.780339,0.952128,0.5,0.888889
6,0.883900,0.660894,0.934866,0.867939,0.827,0.844624,0.956757,0.666667,0.910448


Saved model and tokenizer to ./weighted_checkpoints/model_checkpoint_epoch-1.0_batch-8


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved model and tokenizer to ./weighted_checkpoints/model_checkpoint_epoch-2.0_batch-8


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved model and tokenizer to ./weighted_checkpoints/model_checkpoint_epoch-3.0_batch-8
Saved model and tokenizer to ./weighted_checkpoints/model_checkpoint_epoch-4.0_batch-8
Saved model and tokenizer to ./weighted_checkpoints/model_checkpoint_epoch-5.0_batch-8
Saved model and tokenizer to ./weighted_checkpoints/model_checkpoint_epoch-6.0_batch-8


Evaluation results: {'eval_loss': 0.6608937978744507, 'eval_accuracy': 0.9348659003831418, 'eval_precision': 0.8679385267620562, 'eval_recall': 0.826999726999727, 'eval_f1': 0.8446237282058178, 'eval_f1_label_0': 0.9567567567567568, 'eval_f1_label_1': 0.6666666666666666, 'eval_f1_label_2': 0.9104477611940298, 'eval_runtime': 2.1763, 'eval_samples_per_second': 119.93, 'eval_steps_per_second': 15.164, 'epoch': 6.0}
Model saved in directory: ./models/weighted_model_epochs-6_batch-8_2024-06-05_13-57-48
Tokenizer saved in directory: ./models/weighted_tokenizer_epochs-6_batch-8_2024-06-05_13-57-48
Evaluation results: {'eval_loss': 0.6608937978744507, 'eval_accuracy': 0.9348659003831418, 'eval_precision': 0.8679385267620562, 'eval_recall': 0.826999726999727, 'eval_f1': 0.8446237282058178, 'eval_f1_label_0': 0.9567567567567568, 'eval_f1_label_1': 0.6666666666666666, 'eval_f1_label_2': 0.9104477611940298, 'eval_runtime': 2.1763, 'eval_samples_per_second': 119.93, 'eval_steps_per_second': 15.164

In [None]:
speaker_list = []
for i in range(len(utt)):
    speaker = speakers[i]
    print("Original:", speaker)
    new_speaker = json_response[speaker]
    speaker_list.append(new_speaker)

transcript['formatted_speaker_list'] = speaker_list

# Save the updated transcripts to a new JSON file
with open('data/news_dialogue_sample_with_speaker_id.json', 'w') as f:
    json.dump(json_transcripts, f, indent=4)

print(json.dumps(transcript, indent=4))

Original: BRIANNA KEILAR, CNN ANCHOR
Original: THE SITUATION ROOM. KEILAR


KeyError: 'THE SITUATION ROOM. KEILAR'

In [None]:
import json

print(json.dumps(json_response, indent=4))

{
    "BRIANNA KEILAR, CNN ANCHOR": {
        "name": "KEILAR, BRIANNA",
        "occupation": "News Media"
    },
    "KEILAR": {
        "name": "KEILAR, BRIANNA",
        "occupation": "News Media"
    },
    "UNIDENTIFIED MALE": {
        "name": "UNIDENTIFIED MALE",
        "occupation": "Unknown"
    },
    "MICHELLE KOSINSKI, CNN WHITE HOUSE CORRESPONDENT": {
        "name": "KOSINSKI, MICHELLE",
        "occupation": "News Media"
    },
    "KOSINSKI (voice-over)": {
        "name": "KOSINSKI, MICHELLE",
        "occupation": "News Media"
    },
    "KOSINSKI": {
        "name": "KOSINSKI, MICHELLE",
        "occupation": "News Media"
    },
    "CHUCK HAGEL, U.S. SECRETARY OF DEFENSE": {
        "name": "HAGEL, CHUCK",
        "occupation": "Government Official"
    },
    "BRIAN TODD, CNN CORRESPONDENT": {
        "name": "TODD, BRIAN",
        "occupation": "News Media"
    },
    "TODD": {
        "name": "TODD, BRIAN",
        "occupation": "News Media"
    },
    "CHRIS V

In [None]:
For each transcript, get the details of the speakers that are provided
Ask GPT-4 to identify whether the speaker is a politician (and party), a journalist, or a member of the public