In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the data
data = pd.read_csv('movie_actor_sentences_with_labels.csv')
device  = 'cuda:4'
# Load the tokenizer and model
# Replace 'path_to_llama_8b_tokenizer' and 'path_to_llama_8b_model' with the actual paths or identifiers
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3.1-8B')
model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3.1-8B',cache_dir="/mnt/nfs1/tianyizhou/cache").to(device)

# Set the model to evaluation mode
model.eval()


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.78s/it]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [3]:
# Add a pad token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<PAD>'})
    model.resize_token_embeddings(len(tokenizer))
    print("Added pad token to tokenizer and resized model embeddings.")

# Set pad_token_id
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids('<PAD>')
model.config.pad_token_id = tokenizer.pad_token_id
print(f"Tokenizer pad_token_id: {tokenizer.pad_token_id}")
print(f"Model pad_token_id: {model.config.pad_token_id}")
def predict_character(row):
    sentence = row['sentence']
    label = row['label']

    # Prepare the input for the model
    print(f"Input sentence: {sentence}")
    print(f"Actual label: {label}")

    # Tokenize the input and get input_ids and attention_mask
    encoding = tokenizer(sentence, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Generate the model's prediction
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=50,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Generated text: {generated_text}")

    # Extract the character name from the generated text
    if 'played as' in generated_text:
        predicted_character = generated_text.split('played as')[-1].strip().rstrip('.')
    else:
        predicted_character = ''
    print(f"Predicted label: {predicted_character}\n")

    return predicted_character

# Apply the prediction function to each row
data['predicted_label'] = data.apply(predict_character, axis=1)

# Compare the predicted labels with the actual labels
data['is_match'] = data.apply(
    lambda row: row['label'].strip().lower() == row['predicted_label'].strip().lower(), axis=1)

# Filter out the rows where the prediction does not match the label
filtered_data = data[data['is_match']]

# Save the filtered data
filtered_data.to_csv('filtered_movie_actor_sentences.csv', index=False)

# Print some examples
print(filtered_data[['sentence', 'label']].head())

Added pad token to tokenizer and resized model embeddings.
Tokenizer pad_token_id: 128256
Model pad_token_id: 128256
Input sentence: In Miss Jerry, Blanche Bayliss played as 
Actual label: Miss Geraldine Holbrook (Miss Jerry)




Generated text: In Miss Jerry, Blanche Bayliss played as 25-year-old Jerry who was a governess to the children of the wealthy and eccentric Mrs. Van Wyck (played by Helen Ware). The show was about Jerry’s life and the
Predicted label: 25-year-old Jerry who was a governess to the children of the wealthy and eccentric Mrs. Van Wyck (played by Helen Ware). The show was about Jerry’s life and the

Input sentence: In Miss Jerry, William Courtenay played as 
Actual label: Mr. Hamilton
Generated text: In Miss Jerry, William Courtenay played as 11-year-old Jerry. He was a bit of a loner, and his best friend was his dog, Spot. Jerry’s father was an alcoholic, but he didn’t know it,
Predicted label: 11-year-old Jerry. He was a bit of a loner, and his best friend was his dog, Spot. Jerry’s father was an alcoholic, but he didn’t know it,

Input sentence: In Miss Jerry, Chauncey Depew played as 
Actual label: Chauncey Depew - the Director of the New York Central Railroad
Generated text: In Miss Jer

In [6]:
# Reattempting to display the dataframe without ace_tools, and also preparing to export it as CSV for the user
import pandas as pd

# Creating the dataframe again with the input data
data = {
    "Input Sentence": [
        "In Miss Jerry, William Courtenay played as", 
        "In Miss Jerry, Chauncey Depew played as", 
        "In The Corbett-Fitzsimmons Fight, James J. Corbett played as", 
        "In The Corbett-Fitzsimmons Fight, Bob Fitzsimmons played as", 
        "In The Corbett-Fitzsimmons Fight, Billy Madden played as", 
        "In The Corbett-Fitzsimmons Fight, George Siler played as", 
        "In The Corbett-Fitzsimmons Fight, John L. Sullivan played as", 
        "In The Story of the Kelly Gang, Elizabeth Tait played as", 
        "In The Story of the Kelly Gang, John Tait played as", 
        "In The Story of the Kelly Gang, Nicholas Brierley played as", 
        "In The Story of the Kelly Gang, Norman Campbell played as", 
        "In The Story of the Kelly Gang, Godfrey Cass played as", 
        "In The Story of the Kelly Gang, Will Coyne played as", 
        "In The Story of the Kelly Gang, Sam Crewes played as", 
        "In The Story of the Kelly Gang, Jack Ennis played as", 
        "In The Story of the Kelly Gang, John Forde played as", 
        "In Robbery Under Arms, Jim Gerald played as", 
        "In Robbery Under Arms, George Merriman played as", 
        "In Robbery Under Arms, Lance Vane played as", 
        "In Robbery Under Arms, William Duff played as", 
        "In Robbery Under Arms, Arthur Guest played as", 
        "In Hamlet, Fernanda Negri Pouget played as", 
        "In The Fairylogue and Radio-Plays, L. Frank Baum played as", 
        "In The Fairylogue and Radio-Plays, Frank Burns played as", 
        "In The Fairylogue and Radio-Plays, George E. Wilson played as"
    ],
    "Actual Label": [
        "Mr. Hamilton", 
        "Chauncey Depew - the Director of the New York Central Railroad", 
        "Self", 
        "Self", 
        "Self - Sullivan's Manager", 
        "Self - Referee", 
        "Self - Master of Ceremonies", 
        "Kate Kelly", 
        "School Master", 
        "Joe Byrne", 
        "Steve Hart", 
        "Ned Kelly", 
        "Joe Byrne", 
        "Dan Kelly", 
        "Steve Hart", 
        "Dan Kelly", 
        "Warrigal", 
        "Warder", 
        "Inspector of Police", 
        "Trooper", 
        "Curate", 
        "Ofelia", 
        "The Wizard of Oz Man", 
        "Para Bruin - the Rubber Bear", 
        "Nick Chopper - Tin Woodman"
    ],
    "Generated Text": [
        "11-year-old Jerry. He was a bit of a loner, and his best friend was his dog, Spot. Jerry’s father was an alcoholic, but he didn’t know it", 
        "1 of the 3 leading roles. Miss Geraldine was a Broadway play, written by John Kendrick Bangs, which premiered at the Lyceum Theatre in New York City on February", 
        "1-5. The fight was held in Carson City, Nevada, and was the first time a fight had ever been broadcast live on the radio. In", 
        "8:1 underdog against Jim Corbet. Fitz was the British champion, Corbert was undefeated in 17 fights. The fight was on September 23", 
        "1 of the most effective, and most controversial, boxing managers in history. Madden, a former boxer himself, was a well known figure in the boxing scene, recognized for", 
        "1st Referee. In The Championship Prize Fight Between James J. Corbet and Peter F. Fitzsimons, he played 2nd Ref. He was", 
        "1st, which is the heavier weight. He weighed 198 lbs. and was 6'1\" tall. His opponent, James J. Corbet", 
        "1st Wife of Ned Kelly. In Ned and the Constable, she played the role of Maggie. She was born on 12 March 1868 in Melbourne, Victoria", 
        "2nd Constable Fitzpatrick. This was the first full-length feature film made in Australia.", 
        "2nd Constable. In The Mystery of a Hansom Cab, he played the role of Dr. Martin.", 
        "1st Trooper. He was born on 27 May 1879 in South Melbourne, Victoria, Australia.", 
        "1st Constable.", 
        "2nd Constable Arthur Jones. In The Nightingale, he played the role of Mr. Hare.", 
        "the leader of a group of police who attempted to capture the bushranger Ned Kelly.", 
        "17-year-old Ned Kelly. He was 24 years old at the time of filming.", 
        "1st Constable John Kelly.", 
        "2nd lead.", 
        "17-year-old Captain Starlight, who is a wanted man.", 
        "14-year-old Jim the Boy.", 
        "1st Lt. Jack O'Day.", 
        "1st Lieutenant.", 
        "1st Witch.", 
        "1) the Wizard of Oz and 2) a narrator.", 
        "2 characters: \"The Spirit of the Woods\" and \"A Fairy.\"", 
        "1st Fairy and 2nd Fairy."
    ],
    "Predicted Label": [
        "11-year-old Jerry. He was a bit of a loner, and his best friend was his dog, Spot. Jerry’s father was an alcoholic, but he didn’t know it", 
        "1 of the 3 leading roles. Miss Geraldine was a Broadway play, written by John Kendrick Bangs, which premiered at the Lyceum Theatre in New York City on February", 
        "1-5. The fight was held in Carson City, Nevada, and was the first time a fight had ever been broadcast live on the radio. In", 
        "8:1 underdog against Jim Corbet. Fitz was the British champion, Corbert was undefeated in 17 fights. The fight was on September 23", 
        "1 of the most effective, and most controversial, boxing managers in history. Madden, a former boxer himself, was a well known figure in the boxing scene, recognized for", 
        "1st Referee. In The Championship Prize Fight Between James J. Corbet and Peter F. Fitzsimons, he played 2nd Ref. He was", 
        "1st, which is the heavier weight. He weighed 198 lbs. and was 6'1\" tall. His opponent, James J. Corbet", 
        "1st Wife of Ned Kelly. In Ned and the Constable, she played the role of Maggie. She was born on 12 March 1868 in Melbourne, Victoria", 
        "2nd Constable Fitzpatrick. This was the first full-length feature film made in Australia.", 
        "2nd Constable. In The Mystery of a Hansom Cab, he played the role of Dr. Martin.", 
        "1st Trooper. He was born on 27 May 1879 in South Melbourne, Victoria, Australia.", 
        "1st Constable.", 
        "2nd Constable Arthur Jones. In The Nightingale, he played the role of Mr. Hare.", 
        "the leader of a group of police who attempted to capture the bushranger Ned Kelly.", 
        "17-year-old Ned Kelly. He was 24 years old at the time of filming.", 
        "1st Constable John Kelly.", 
        "2nd lead.", 
        "17-year-old Captain Starlight, who is a wanted man.", 
        "14-year-old Jim the Boy.", 
        "1st Lt. Jack O'Day.", 
        "1st Lieutenant.", 
        "1st Witch.", 
        "1) the Wizard of Oz and 2) a narrator.", 
        "2 characters: \"The Spirit of the Woods\" and \"A Fairy.\"", 
        "1st Fairy and 2nd Fairy."
    ]
}

df = pd.DataFrame(data)

# Saving this table as a CSV file for the user
file_path = "./generated_labels_comparison_table.csv"
df.to_csv(file_path, index=False)

file_path


'./generated_labels_comparison_table.csv'