In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import os
from peft import PeftModel
from typing import List

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [None]:
!pip install torch
!pip install peft
!pip install transformers
!pip install einops
!pip install sentencepiece
!pip install -U pandas

In [2]:
# Directory handed to the `from_pretrained` loaders as their download cache;
# it is also exported as HF_HOME for the current process.
# NOTE(review): this assignment runs after the Hugging Face imports in the
# first cell; confirm HF_HOME is still picked up when set this late.
cache_dir = "/home/ec2-user/SageMaker"
os.environ.update({'HF_HOME': cache_dir})

In [None]:
def recognise_names(df: pd.DataFrame,
                    new_column_name: str = "recognized_names",
                    tokenizer=None,
                    model=None):
    """
    Add a column of person names recognised in each question/answer pair.

    Every distinct (interview_question, interview_answer) pair is sent to the
    language model once; the answer is then written to every row that holds
    that pair.

    Args:
        df (pd.DataFrame): Frame with 'interview_question' and
            'interview_answer' columns. It is modified in place.
        new_column_name (str, optional): Name of the column that receives the
            recognised names. Defaults to 'recognized_names'.
        tokenizer (optional): Already-loaded tokenizer. When omitted, the
            tokenizer of "internlm/internlm2_5-7b" is loaded.
        model (optional): Already-loaded generation model. When omitted, the
            base model plus the "Umean/B2NER-Internlm2.5-7B-LoRA" adapter is
            loaded. Pass both `tokenizer` and `model` to avoid reloading the
            weights on every call.

    Returns:
        pd.DataFrame: The same `df` object, with the new column added.

    Raises:
        ValueError: If only one of `tokenizer` / `model` is supplied.
    """
    if (tokenizer is None) != (model is None):
        raise ValueError("tokenizer and model must be supplied together")

    if model is None:
        base_model_path = "internlm/internlm2_5-7b"
        lora_path = "Umean/B2NER-Internlm2.5-7B-LoRA"

        tokenizer = AutoTokenizer.from_pretrained(base_model_path,
                                                  trust_remote_code=True,
                                                  cache_dir=cache_dir)
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_path,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",
            cache_dir=cache_dir
        )

        model = PeftModel.from_pretrained(
            base_model, lora_path, torch_dtype=torch.float16, device_map="auto"
        )
        model.eval()

    def extract_names(text, chunk_size=1000):
        """Run the model over `text` in `chunk_size`-character pieces."""
        collected = []

        for start in range(0, len(text), chunk_size):
            chunk = text[start:start + chunk_size]
            prompt = ("Recognize all people names in the following text. "
                      "Format the answer as: person name: entity; person name: entity. \n\n"
                      f"Text: {chunk} \nAnswer:")

            inputs = tokenizer(
                [prompt],
                return_tensors="pt",
                padding=True,
            ).to(model.device)

            # NOTE(review): no max_new_tokens / max_length is passed, so the
            # answer length is governed by the model's generation config -
            # confirm that limit is adequate for these prompts.
            with torch.no_grad():
                output = model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    use_cache=False
                )

            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
            # The decoded text echoes the prompt; keep only what follows the
            # final "Answer:" marker.
            response = generated_text.split("Answer:")[-1].strip()
            # Test the answer only: the echoed prompt contains the source
            # text, which may itself contain the word "None".
            if response and "None" not in response:
                collected.append(response)

        return "; ".join(collected).strip()

    def as_text(value):
        """Render a cell as text, treating missing values as empty."""
        return "" if pd.isna(value) else str(value)

    questions = df["interview_question"].map(as_text)
    answers = df["interview_answer"].map(as_text)
    pairs = list(zip(questions, answers))

    # Key on the full (question, answer) pair: the same question can appear
    # with different answers, and each combination needs its own result.
    names_by_pair = {}
    for pair in pairs:
        if pair not in names_by_pair:
            # Join with a space so the last word of the question and the
            # first word of the answer do not run together.
            combined = " ".join(part for part in pair if part)
            names_by_pair[pair] = extract_names(combined)

    df[new_column_name] = [names_by_pair[pair] for pair in pairs]
    return df


# Annotate the training set with recognised names and save it to a new CSV.
# `recognise_names` adds the column to `df` in place, so its return value is
# not needed here.
# NOTE(review): `to_csv` is called with its defaults, which also write the
# row index as a column - confirm that is intended.
df = pd.read_csv('preprocessed_data/train_set.csv')
recognise_names(df=df, new_column_name="recognised_names")

file_path = "./preprocessed_data/named_train_set.csv"
df.to_csv(path_or_buf=file_path)

In [None]:
def chunk_text(text: str, size: int) -> List[str]:
    """
    Splits a given text into consecutive chunks of at most `size` characters.

    Chunks are cut purely by character count, so a word (or a name) that
    straddles a boundary ends up split across two chunks. Concatenating the
    returned chunks reproduces `text` exactly.

    Args:
        text (str): The input text to be chunked.
        size (int): The maximum size of each chunk; must be positive.

    Returns:
        List[str]: The text chunks in order; empty when `text` is empty.

    Raises:
        ValueError: If `size` is not a positive integer.
    """
    # A negative step would make range() yield nothing and silently drop the
    # whole text, and a zero step fails with an unrelated range() message.
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    return [text[start:start + size] for start in range(0, len(text), size)]


def extract_names(text: str,
                  tokenizer: AutoTokenizer,
                  model: torch.nn.Module,
                  chunk_size: int = 1000) -> str:
    """
    Extracts person names from the input text using a language model.

    The text is split with `chunk_text`, each chunk is wrapped in a fixed
    prompt and sent to `model.generate`, and the non-empty answers are joined
    with "; ".

    Args:
        text (str): The input text containing potential person names.
        tokenizer (AutoTokenizer): Tokenizer associated with the language
        model; used both to encode the prompt and to decode the output.
        model (torch.nn.Module): The language model used for name extraction.
        It must expose `generate` and `device`.
        chunk_size (int, optional): Maximum characters per chunk for processing.
        Defaults to 1000.

    Returns:
        str: The per-chunk answers joined with "; ". Empty when no chunk
        produced a usable answer.
    """
    collected = []

    for chunk in chunk_text(text, chunk_size):
        prompt = (
            "Recognize all people names in the following text. "
            "Format the answer as: person name: entity; person name: entity. \n\n"
            f"Text: {chunk} \nAnswer:"
        )

        inputs = tokenizer(
            [prompt],
            return_tensors="pt",
            padding=True
        ).to(model.device)

        # NOTE(review): no max_new_tokens / max_length is passed, so the
        # answer length is governed by the model's generation config -
        # confirm that limit is adequate for these prompts.
        with torch.no_grad():
            output = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                use_cache=False
            )

        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        # The decoded text echoes the prompt; keep only what follows the
        # final "Answer:" marker.
        response = generated_text.split("Answer:")[-1].strip()
        # Test the answer only: the echoed prompt contains the source text,
        # which may itself contain the word "None". Empty answers are skipped
        # so they do not leave stray "; " separators in the result.
        if response and "None" not in response:
            collected.append(response)

    return "; ".join(collected).strip()


def recognise_names(df: pd.DataFrame,
                    new_column_name: str = "recognized_names",
                    tokenizer=None,
                    model=None) -> pd.DataFrame:
    """
    Recognizes person names from interview questions and answers using a fine-tuned LLM.

    Every distinct (interview_question, interview_answer) pair is passed to
    `extract_names` once; the result is then written to every row that holds
    that pair.

    Args:
        df (pd.DataFrame): DataFrame containing 'interview_question' and 'interview_answer'.
            It is modified in place.
        new_column_name (str, optional): Name of the new column to store recognized names. Defaults to 'recognized_names'.
        tokenizer (optional): Already-loaded tokenizer. When omitted, the
            tokenizer of "internlm/internlm2_5-7b" is loaded.
        model (optional): Already-loaded generation model. When omitted, the
            base model plus the "Umean/B2NER-Internlm2.5-7B-LoRA" adapter is
            loaded. Pass both `tokenizer` and `model` to avoid reloading the
            weights on every call.

    Returns:
        pd.DataFrame: The original DataFrame with an additional column of recognized names.

    Raises:
        ValueError: If only one of `tokenizer` / `model` is supplied.
    """
    if (tokenizer is None) != (model is None):
        raise ValueError("tokenizer and model must be supplied together")

    if model is None:
        base_model_path = "internlm/internlm2_5-7b"
        lora_path = "Umean/B2NER-Internlm2.5-7B-LoRA"

        tokenizer = AutoTokenizer.from_pretrained(base_model_path,
                                                  trust_remote_code=True,
                                                  cache_dir=cache_dir)
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_path,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",
            cache_dir=cache_dir
        )

        model = PeftModel.from_pretrained(
            base_model, lora_path, torch_dtype=torch.float16, device_map="auto"
        )
        model.eval()

    def as_text(value):
        """Render a cell as text, treating missing values as empty."""
        return "" if pd.isna(value) else str(value)

    questions = df["interview_question"].map(as_text)
    answers = df["interview_answer"].map(as_text)
    pairs = list(zip(questions, answers))

    # Key on the full (question, answer) pair: the same question can appear
    # with different answers, and each combination needs its own result.
    names_by_pair = {}
    for pair in pairs:
        if pair not in names_by_pair:
            # Join with a space so the last word of the question and the
            # first word of the answer do not run together.
            combined = " ".join(part for part in pair if part)
            names_by_pair[pair] = extract_names(combined, tokenizer, model)

    df[new_column_name] = [names_by_pair[pair] for pair in pairs]
    return df

In [None]:
# Annotate both data splits with recognised names and save each to its own
# CSV. `recognise_names` adds the column to the frame it is given, so the
# return value is not needed here.
# NOTE(review): `to_csv` is called with its defaults, which also write the
# row index as a column - confirm that is intended.

# --- training split ---
train_df = pd.read_csv('preprocessed_data/train_set.csv')
recognise_names(df=train_df, new_column_name="recognised_names")
file_path = "./preprocessed_data/named_train_set.csv"
train_df.to_csv(path_or_buf=file_path)

# --- test split ---
test_df = pd.read_csv('preprocessed_data/test_set.csv')
recognise_names(df=test_df, new_column_name="recognised_names")
file_path = "./preprocessed_data/named_test_set.csv"
test_df.to_csv(path_or_buf=file_path)