In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import os
from peft import PeftModel
from typing import List
from tqdm import tqdm
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [None]:
!pip install torch
!pip install peft
!pip install transformers
!pip install einops
!pip install sentencepiece
!pip install -U pandas

In [3]:
# Directory that is passed as `cache_dir=` to the `from_pretrained` calls
# further down and that is also exported as HF_HOME.
# NOTE(review): transformers is imported in an earlier cell, so HF_HOME may be
# set too late to have an effect - confirm. The explicit `cache_dir=` arguments
# do not depend on it.
cache_dir = "/home/ec2-user/SageMaker"
os.environ.update({"HF_HOME": cache_dir})

In [None]:
def chunk_text(text: str, size: int) -> List[str]:
    """
    Splits a given text into consecutive, non-overlapping chunks.

    Args:
        text (str): The input text to be chunked.
        size (int): The maximum number of characters per chunk. Must be a
        positive integer.

    Returns:
        List[str]: The chunks in their original order. Every chunk holds
        exactly ``size`` characters except possibly the last one. An empty
        text yields an empty list.

    Raises:
        ValueError: If ``size`` is not a positive integer.
    """
    if size <= 0:
        # A negative step makes range() empty, which would silently drop the
        # whole text; a zero step raises an unrelated-looking range() error.
        raise ValueError(f"size must be a positive integer, got {size}")
    return [text[i:i+size] for i in range(0, len(text), size)]


def extract_names(text: str,
                  tokenizer: AutoTokenizer,
                  model: torch.nn.Module,
                  chunk_size: int = 1000) -> str:
    """
    Extracts person names from the input text using a language model.

    The text is cut into chunks of at most ``chunk_size`` characters with
    ``chunk_text``; each chunk is embedded in a prompt, sent through
    ``model.generate`` and the part of the decoded output that follows the
    last "Answer:" marker is kept as the response for that chunk.

    Args:
        text (str): The input text containing potential person names.
        tokenizer (AutoTokenizer): Tokenizer associated with the language
        model.
        model (torch.nn.Module): The fine-tuned language model used for name
        extraction.
        chunk_size (int, optional): Maximum characters per chunk for processing.
        Defaults to 1000.

    Returns:
        str: The non-empty per-chunk responses joined with "; ". Empty when
        the text is empty or the model returned nothing for every chunk.
    """
    all_names = []

    for chunk in chunk_text(text, chunk_size):
        # Adjacent string literals are concatenated verbatim, so every part
        # must carry its own trailing space/newline; otherwise the sentences
        # and the "Text:" label are fused together in the prompt.
        prompt = (
            "Recognize all politically involved names in the text. "
            "Return the names of the entities as a list, without duplicates.\n"
            "The answer format should be \"person: entity's name; person: entity's name\"\n"
            f"Text: {chunk} \nAnswer:"
        )

        inputs = tokenizer(
            [prompt],
            return_tensors="pt",
            padding=True
        ).to(model.device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            output = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                use_cache=False
            )

        # The decoded sequence contains the prompt as well; the answer is
        # whatever follows the final "Answer:" marker.
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        response = generated_text.split("Answer:")[-1].strip()
        if response:
            # Skip empty answers so the joined result has no "; ;" runs.
            all_names.append(response)

    return "; ".join(all_names).strip()


def recognise_names(df: pd.DataFrame,
                    new_column_name: str = "recognized_names") -> pd.DataFrame:
    """
    Recognizes person names from interview questions and answers using a
    fine-tuned LLM.

    The base model and its LoRA adapter are loaded on every call. Each
    distinct (question, answer) pair is sent to ``extract_names`` once and the
    result is written back to every row holding that pair.

    Args:
        df (pd.DataFrame): DataFrame containing 'interview_question'
        and 'interview_answer'. Missing values in these columns are treated
        as empty strings.
        new_column_name (str, optional): Name of the new column to store
        recognized names. Defaults to 'recognized_names'.

    Returns:
        pd.DataFrame: The same DataFrame object that was passed in; the new
        column is added to it in place.
    """
    base_model_path = "internlm/internlm2_5-7b"
    lora_path = "Umean/B2NER-Internlm2.5-7B-LoRA"

    tokenizer = AutoTokenizer.from_pretrained(base_model_path,
                                              trust_remote_code=True,
                                              cache_dir=cache_dir)
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
        cache_dir=cache_dir
    )

    model = PeftModel.from_pretrained(
        base_model, lora_path, torch_dtype=torch.float16, device_map="auto"
    )
    model.eval()

    # Work on a normalised copy of the two text columns: NaN cells become ""
    # so that building the prompt text and the dictionary lookups below are
    # well defined for every row.
    pairs = df[["interview_question", "interview_answer"]].fillna("").astype(str)
    unique_pairs = pairs.drop_duplicates()

    names_map = {}
    for question, answer in tqdm(unique_pairs.itertuples(index=False, name=None),
                                 total=len(unique_pairs),
                                 desc="Extracting Names"):
        # Key on the full pair: keying on the question alone lets a repeated
        # question with a different answer overwrite the earlier result.
        # The newline keeps the last word of the question from being fused
        # with the first word of the answer.
        names_map[(question, answer)] = extract_names(
            f"{question}\n{answer}", tokenizer, model
        )

    # One entry per row, in row order, so the assignment is positional.
    df[new_column_name] = [
        names_map[pair]
        for pair in zip(pairs["interview_question"], pairs["interview_answer"])
    ]
    return df

In [None]:
file_path = "./preprocessed_data/names/named_test_set2.csv"
# Create the output folder before the long model run so that the final
# to_csv call cannot fail on a missing directory.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

test_df = pd.read_csv('preprocessed_data/test_set.csv')
# recognise_names adds the column to test_df in place, so the returned frame
# does not need to be captured.
recognise_names(test_df, "recognised_names")

test_df.to_csv(file_path)

In [10]:
# Clean-up of the "recognised_names" column of the first recognition run.
# Every entry below is handed to Series.str.replace(..., regex=True), one at a
# time and in exactly this order; the order matters because later patterns see
# the text left behind by earlier ones.
#
# NOTE(review): the patterns are regular expressions, so '.' (e.g. in r'Mr.')
# matches any character and the trailing '|' in r'entity;?|' / r'person;?|'
# adds an empty alternative. Several entries are repeated, and the
# r', Israelis, ...' entry runs after every ',' has been turned into ';', so
# it looks unreachable. Short fragments such as r'R;', r'ial;' or r'lks' are
# removed wherever they occur, including inside longer words.
label_patterns = [
    r'None;?|None',
    r'None',
    r'person:',
    r's name:',
    r'entity:',
    r'entity;?|',
]

non_name_patterns = [
    r'Mr. President;?',
    r'President;?',
    r'Mr. Prime Minister;',
    r'Prime Minister',
    r'person;?|',
    r'person',
    r'skeptics;',
    r'leaders;',
    r'person',
    r'Kevin Corke',
    r'Jessica',
    r'Prime;',
    r'R;',
    r'so-and-so;',
    r'ial;',
    r'dad;',
    r'father',
    r' ted;',
    r'so-and-so;',
    r'Lebanese',
    r'Government',
    r'Democrats',
    r'Republican leader',
    r'American people',
    r'American s;',
    r'ian people;',
    r'Israel;',
    r'of the United States;',
    r'of Lebanon',
    r'Lebanon',
    r'Congress',
    r'Presid',
    r' C;',
    r' U.S.;',
    r'Secretaries of Defence',
    r'National Security Advisers;',
    r'Republican chairman;',
    r'administration',
    r'bipartisan',
    r'-elect',
    r'Senators',
    r'Prime Minister;',
    r'Cuban people;',
    r'Cuban exiles;',
    r'Cuban;',
    r'U.N.;',
    r'Trade Minister;',
    r'U.S. Trade Representative',
    r'Americans',
    r'local',
    r'American citizens;',
    r'Border Patrol agent;',
    r'Member of',
    r'Defence Secretary;',
    r'FDA Commissioner',
    r'Ambassador;',
    r'Saddamists;',
    r'Palestinians;',
    r'Israelis;',
    r'nian citizens;',
    r'seniors;',
    r'marks;',
    r'Secretary of State',
    r'NATO;',
    r'lks',
    r'Social Security',
    r'House Republicans',
    r'terrorists',
    r'extremists',
    r'foreign',
    r'mother',
    r'Home Secretary',
    r'children',
    r'Iraq',
    r'Afghanistan',
    r'enemy',
    r'allies',
    r'Commander in Chief',
    r'American;',
    r'European political leadership',
    r'tribal chiefs',
    r'i people',
    r'Al Qaida',
    r'Jerusalem',
    r', Israelis, America, Britain, Palestinians, Palestinian',
    r'Prime;',
    r'Jackson;',
    r'Helen Thomas',
    r'Mike Emanuel',
    r'Terry',
    r'Mr.',
    r'Illinois National Guardsmen',
    r'Taliban',
    r'Israeli soldiers',
    r'American Urban Radio Networks',
    r'Iran',
    r'Russia',
    r'Members of',
    r'South',
]

late_patterns = [
    r'medical experts',
    r'Defense Secretary',
    r'Soviet',
    r'America',
    r'Republicans',
    r'Arab',
    r'Pakistan',
    r'Syria',
]

# (pattern, replacement) pairs in application order: most patterns are simply
# deleted, commas become ';' separators and apostrophes are normalised.
substitutions = (
    [(pattern, '') for pattern in label_patterns]
    + [(r',', ';')]
    + [(pattern, '') for pattern in non_name_patterns]
    + [(r"; 's;", ";"), (r"';", ";"), (r"'", " ")]
    + [(pattern, '') for pattern in late_patterns]
)

df1 = pd.read_csv('preprocessed_data/names/named_test_set1.csv')["recognised_names"]
for pattern, replacement in substitutions:
    df1 = df1.str.replace(pattern, replacement, regex=True)

# Cells that are left with nothing but whitespace count as "no names".
df1 = df1.replace(r'^\s+$', np.nan, regex=True)
print(df1.count())
df1.to_csv('preprocessed_data/names/named_test_set_proc1.csv')

233


In [11]:
# Clean-up of the "recognised_names" column of the second recognition run.
# Every entry below is handed to Series.str.replace(..., regex=True), one at a
# time and in exactly this order; the order matters because later patterns see
# the text left behind by earlier ones.
#
# NOTE(review): the patterns are regular expressions, so '.' (e.g. in r'Mr.')
# matches any character and the trailing '|' in r'entity;?|' / r'person;?|'
# adds an empty alternative. Several entries are repeated, and the
# r', Israelis, ...' entry runs after every ',' has been turned into ';', so
# it looks unreachable. Short fragments such as r'R;', r'ial;' or r'lks' are
# removed wherever they occur, including inside longer words.
label_patterns = [
    r'None;?|None',
    r'None',
    r'person:',
    r's name:',
    r'entity:',
    r'entity;?|',
]

non_name_patterns = [
    r'Mr. President;?',
    r'President;?',
    r'Mr. Prime Minister;',
    r'Prime Minister;',
    r'Prime Minister',
    r'person;?|',
    r'person',
    r'skeptics;',
    r'leaders;',
    r'person',
    r'Kevin Corke',
    r'Jessica',
    r'Prime;',
    r'R;',
    r'so-and-so;',
    r'ial;',
    r'dad;',
    r'father',
    r' ted;',
    r'so-and-so;',
    r'Lebanese',
    r'Government',
    r'Democrats',
    r'Republican leader',
    r'American people',
    r'American s;',
    r'ian people;',
    r'Israel;',
    r'of the United States;',
    r'of Lebanon',
    r'Lebanon',
    r'Congress',
    r'Presid',
    r' C;',
    r' U.S.;',
    r'Secretaries of Defence',
    r'National Security Advisers;',
    r'Republican chairman;',
    r'administration',
    r'bipartisan',
    r'-elect',
    r'Senators',
    r'Prime Minister;',
    r'Cuban people;',
    r'Cuban exiles;',
    r'Cuban;',
    r'U.N.;',
    r'Trade Minister;',
    r'U.S. Trade Representative',
    r'Americans',
    r'local',
    r'American citizens;',
    r'Border Patrol agent;',
    r'Member of',
    r'Defence Secretary;',
    r'FDA Commissioner',
    r'Ambassador;',
    r'Saddamists;',
    r'Palestinians;',
    r'Israelis;',
    r'nian citizens;',
    r'seniors;',
    r'marks;',
    r'Secretary of State',
    r'NATO;',
    r'lks',
    r'Social Security',
    r'House Republicans',
    r'terrorists',
    r'extremists',
    r'foreign',
    r'mother',
    r'Home Secretary',
    r'children',
    r'Iraq',
    r'Afghanistan',
    r'enemy',
    r'allies',
    r'Commander in Chief',
    r'American;',
    r'European political leadership',
    r'tribal chiefs',
    r'i people',
    r'Al Qaida',
    r'Jerusalem',
    r', Israelis, America, Britain, Palestinians, Palestinian',
    r'Prime;',
    r'Jackson;',
    r'Helen Thomas',
    r'Mike Emanuel',
    r'Terry',
    r'Mr.',
    r'Illinois National Guardsmen',
    r'Taliban',
    r'Israeli soldiers',
    r'American Urban Radio Networks',
    r'Iran',
    r'Russia',
    r'Members of',
    r'South',
]

late_patterns = [
    r'medical experts',
    r'Defense Secretary',
    r'Soviet',
    r'America',
    r'Republicans',
    r'Arab',
    r'Pakistan',
    r'Syria',
]

# (pattern, replacement) pairs in application order: most patterns are simply
# deleted, commas become ';' separators and apostrophes are normalised.
substitutions = (
    [(pattern, '') for pattern in label_patterns]
    + [(r',', ';')]
    + [(pattern, '') for pattern in non_name_patterns]
    + [(r"; 's;", ";"), (r"';", ";"), (r"'", " ")]
    + [(pattern, '') for pattern in late_patterns]
)

df1 = pd.read_csv('preprocessed_data/names/named_test_set2.csv')["recognised_names"]
for pattern, replacement in substitutions:
    df1 = df1.str.replace(pattern, replacement, regex=True)

# Cells that are left with nothing but whitespace count as "no names".
df1 = df1.replace(r'^\s+$', np.nan, regex=True)
print(df1.count())
df1.to_csv('preprocessed_data/names/named_test_set_proc2.csv')

241


In [12]:
# Reload the two cleaned name files written by the clean-up cells above.
# The code below reads the names from the second column of each frame
# (`iloc[:, 1]`).
df1 = pd.read_csv('preprocessed_data/names/named_test_set_proc1.csv')
df2 = pd.read_csv('preprocessed_data/names/named_test_set_proc2.csv')

def clean_and_split(s):
    """
    Turn a ';'-separated string into a list of trimmed, non-empty items.

    Args:
        s: The string to split, or a missing value (None / NaN).

    Returns:
        list: The items in their original order with surrounding whitespace
        removed. Missing, empty or whitespace-only input gives an empty list.
    """
    if pd.isna(s):
        return []
    trimmed_parts = (part.strip() for part in s.split(';'))
    return [part for part in trimmed_parts if part]


def merge_row(row):
    """
    Merge the name lists of both recognition runs for one row.

    Args:
        row: A mapping-like row with the entries 'df1' and 'df2', each a
        ';'-separated string of names or a missing value.

    Returns:
        list: The names from 'df1' followed by those from 'df2', without
        duplicates and in order of first appearance.
    """
    combined = clean_and_split(row['df1']) + clean_and_split(row['df2'])
    # dict.fromkeys removes duplicates but keeps insertion order; going
    # through set() would make the order, and therefore the saved CSV, vary
    # from run to run.
    return list(dict.fromkeys(combined))

# The names are taken from the second column of each processed file;
# presumably the first column is the index that to_csv saved - confirm.
name_columns = {
    'df1': df1.iloc[:, 1],
    'df2': df2.iloc[:, 1],
}
df_combined = pd.DataFrame(name_columns)

# One merged, de-duplicated list of names per row.
merged_entities = df_combined.apply(merge_row, axis=1)
df_combined['combined_entities'] = merged_entities

# Only the merged column is kept and written out.
df_combined = df_combined['combined_entities']
df_combined.to_csv('preprocessed_data/names/named_test_set_proc_final.csv')

In [13]:
import ast

df1 = pd.read_csv('preprocessed_data/names/named_test_set_proc_final.csv')
df2 = pd.read_csv('preprocessed_data/test_set.csv')

# The CSV stores every list as text; parse it back into a Python list and use
# an empty list for missing cells.
entities = df1['combined_entities'].apply(
    lambda x: ast.literal_eval(x) if pd.notnull(x) else []
)
df1['combined_entities'] = entities

# Row masks built from the entity lists and applied to the test set; this
# relies on both frames having the same row labels.
has_names = entities.apply(lambda x: isinstance(x, list) and len(x) > 0)
has_no_names = entities.apply(lambda x: isinstance(x, list) and len(x) == 0)

named_df = df2[has_names]
named_df.to_csv('preprocessed_data/test_set_named.csv')
unnamed_df = df2[has_no_names]
unnamed_df.to_csv('preprocessed_data/test_set_unnamed.csv')

**Check names**

In [None]:
!pip install anthropic

In [40]:
import boto3
import json

# Build the session from boto3's default credential lookup instead of
# embedding access keys in the notebook; only the region is fixed here.
session = boto3.Session(region_name='us-west-2')

# Create a Bedrock client
bedrock = session.client('bedrock-runtime')

# Claude 3 Opus model ID
model_id = 'anthropic.claude-3-opus-20240229-v1:0'

# Messages API format with required version
body = {
    "messages": [
        {"role": "user", "content": "This is a list. Identify only the people names that are politically invlolved and return a list with only those names without dublicates: [Putin, Vladimir, John, pedophile, adios, Condi, Siniora, Senitor Warner, Taliban]"}
    ],
    "max_tokens": 300,
    "anthropic_version": "bedrock-2023-05-31"
}

# Call Claude 3 using Messages API
response = bedrock.invoke_model(
    modelId=model_id,
    contentType='application/json',
    accept='application/json',
    body=json.dumps(body)
)

# Read and decode the response body, then take the text of the first content
# block.
result = json.loads(response['body'].read().decode())['content'][0]['text']
print(result)

Here is the list of names that are politically involved, without duplicates:

[Putin, Condi, Siniora, Senitor Warner, Taliban]


In [None]:
# Install the anthropic library
!pip install anthropic

import anthropic

# Initialize the Anthropics client
client = anthropic.Anthropic()

def is_person_name(entity):
    """
    Ask the model whether ``entity`` is a person's name.

    A single user message with the question is sent through the module-level
    ``client`` together with one tool definition named "is_person_name".

    Args:
        entity: The text to ask about; it is interpolated into the question.

    Returns:
        The object returned by ``client.messages.create``, unchanged. The
        reply is not parsed into a yes/no value here.
    """
    entity_schema = {
        "type": "object",
        "properties": {
            "entity": {
                "type": "string",
                "description": "The entity to check, e.g. John Doe",
            }
        },
        "required": ["entity"],
    }
    person_name_tool = {
        "name": "is_person_name",
        "description": "Identify if the given entity is a person's name.",
        "input_schema": entity_schema,
    }
    question = {"role": "user", "content": f"Is '{entity}' a person's name?"}

    return client.messages.create(
        model="claude-opus-4-1-20250805",
        max_tokens=1024,
        tools=[person_name_tool],
        messages=[question],
    )

# Example entity to test
entity = "John Doe"
result = is_person_name(entity)

# Prints the object returned by is_person_name as-is, not a yes/no verdict.
print(result)