In [None]:
import pandas as pd
from pathlib import Path

import re
from typing import Optional

In [None]:
class BackchannelIdentifier:
    """Two-stage backchannel detector.

    Stage 1 (``filter``) is a cheap rule-based check: utterances that are too
    long are rejected, and short utterances that match a dictionary of anchored
    patterns are accepted. Anything else is "ambiguous".

    Stage 2 (``judge``) asks an Ollama chat model to decide ambiguous cases,
    using the previous turn as context.

    ``is_backchannel`` chains the two stages.
    """

    # High-confidence anchored patterns for Stage 1
    HIGH_CONFIDENCE_PATTERNS = [
        r"^yes$", r"^yeah$", r"^yup$", r"^ya$", r"^uh\s*huh$", r"^uh$", r"^huh$",
        r"^mhm$", r"^mm$", r"^m$", r"^um$", r"^umm$", r"^mm-hm$",
        r"^oh$", r"^oh\s+boy$", r"^oh\s+yeah$", r"^cool$", r"^nice$", r"^right$",
        r"^what\??$"
    ]

    def __init__(
        self,
        max_words: int = 3,
        max_length: int = 15,
        custom_patterns: Optional[list[str]] = None,
        model_name: str = "llama3",
    ):
        """Configure the detector.

        Args:
            max_words: Utterances with more whitespace-separated words than
                this are rejected by Stage 1.
            max_length: Utterances with more characters (after stripping) than
                this are rejected by Stage 1.
            custom_patterns: Extra regular expressions appended to
                ``HIGH_CONFIDENCE_PATTERNS``.
            model_name: Model identifier passed to ``ollama.chat`` in Stage 2.
        """
        self.max_words = max_words
        self.max_length = max_length
        self.model_name = model_name
        # Compile Stage 1 patterns into a single alternation.
        all_patterns = self.HIGH_CONFIDENCE_PATTERNS + (custom_patterns or [])
        self.filter_pattern = re.compile("|".join(all_patterns), re.IGNORECASE)

    # ---------------- Stage 1 ----------------
    def filter(self, target_utt: str) -> Optional[bool]:
        """Stage 1: high-confidence backchannel detection.

        Args:
            target_utt: The utterance to classify.

        Returns:
            ``True`` for a dictionary match, ``False`` for missing, blank,
            non-string (e.g. NaN) or over-long input, and ``None`` when the
            utterance is short but not in the dictionary (ambiguous).
        """
        # Non-strings (None, NaN floats, numbers) cannot be backchannels and
        # would otherwise fail on .strip().
        if not isinstance(target_utt, str):
            return False

        utt = target_utt.strip()
        if not utt:
            # Blank text: nothing to classify, do not spend an LLM call on it.
            return False

        # Length filter
        if len(utt.split()) > self.max_words or len(utt) > self.max_length:
            return False  # too long to be a backchannel

        # Anchored dictionary match
        if self.filter_pattern.fullmatch(utt.lower()):
            return True  # high-confidence backchannel

        # Ambiguous / edge case
        return None

    # ---------------- judge ----------------
    @staticmethod
    def _reply_is_yes(reply: str) -> bool:
        """Return True when the reply's first word is "yes" (any case).

        Leading punctuation/whitespace and anything after the first word are
        ignored, so "Yes.", "'yes'" and "yes, it is" all count; "yesterday"
        does not.
        """
        if not isinstance(reply, str):
            return False
        return re.match(r"\W*yes\b", reply, re.IGNORECASE) is not None

    def judge(self, target_utt: str, previous_utt: str = "") -> bool:
        """Stage 2: ask the chat model whether the target is a backchannel.

        Args:
            target_utt: The utterance to classify.
            previous_utt: The preceding turn, given to the model as context.

        Returns:
            ``True`` if the model's reply starts with "yes", else ``False``.
        """
        # Imported lazily so Stage 1 works without the ollama package.
        import ollama

        role_prompt = (
            "You are an expert in dialogue analysis. Your task is to determine whether the target utterance "
            "is a backchannel behavior. Backchannel behaviors are brief listener responses that signal attention, "
            "agreement, or understanding (e.g., 'uh-huh', 'yeah', 'right').\n"
            "Only respond with 'yes' if it is a backchannel, and 'no' otherwise. "
            "Do not provide explanations.\n"
            "You will be given the previous turn as context. "
            "The target utterance will always be labeled as <Target>."
        )

        task_prompt = f"<Context> {previous_utt}, <Target>: {target_utt}"

        messages = [
            {
                'role': 'user',
                'content': f"{role_prompt} {task_prompt}",
            }
        ]
        response = ollama.chat(model=self.model_name, messages=messages)
        # The raw reply is free text; reduce it to the boolean callers expect.
        return self._reply_is_yes(response['message']['content'])

    # ---------------- Combined method ----------------
    def is_backchannel(self, target_utt: str, previous_utt: str = "") -> bool:
        """Run the full two-stage pipeline on one utterance.

        Stage 1's verdict is returned when it is decisive; otherwise the
        decision is delegated to ``judge``.
        """
        filter_result = self.filter(target_utt)
        if filter_result is not None:
            return filter_result  # high-confidence decision

        # Stage 1 uncertain -> judge (LLM or semantic judge)
        return self.judge(target_utt, previous_utt)


In [None]:
import pandas as pd
from typing import Optional
import re

class BackchannelIdentifier:
    """Two-stage backchannel detector with an LLM judge for whole dataframes.

    Stage 1 (``filter``) is a rule-based check on length and a dictionary of
    anchored patterns. Stage 2 (``judge_batch``) sends every utterance that
    Stage 1 could not decide to an Ollama chat model. ``annotate_dataframe``
    runs both stages over a dataframe.
    """

    HIGH_CONFIDENCE_PATTERNS = [
        r"^yes$", r"^yeah$", r"^yup$", r"^ya$", r"^uh\s*huh$", r"^uh$", r"^huh$",
        r"^mhm$", r"^mm$", r"^m$", r"^um$", r"^umm$", r"^mm-hm$",
        r"^oh$", r"^oh\s+boy$", r"^oh\s+yeah$", r"^cool$", r"^nice$", r"^right$",
        r"^what\??$"
    ]

    def __init__(
        self,
        max_words: int = 3,
        max_length: int = 15,
        custom_patterns: Optional[list[str]] = None,
        model_name: str = "llama3",
    ):
        """Configure the detector.

        Args:
            max_words: Utterances with more whitespace-separated words than
                this are rejected by Stage 1.
            max_length: Utterances with more characters (after stripping) than
                this are rejected by Stage 1.
            custom_patterns: Extra regular expressions appended to
                ``HIGH_CONFIDENCE_PATTERNS``.
            model_name: Model identifier passed to ``ollama.chat`` in Stage 2.
        """
        self.max_words = max_words
        self.max_length = max_length
        self.model_name = model_name
        all_patterns = self.HIGH_CONFIDENCE_PATTERNS + (custom_patterns or [])
        self.filter_pattern = re.compile("|".join(all_patterns), re.IGNORECASE)

    # ---------------- Stage 1 ----------------
    def filter(self, target_utt: str) -> Optional[bool]:
        """Stage 1: high-confidence backchannel detection.

        Returns:
            ``True`` for a dictionary match, ``False`` for missing, blank,
            non-string (e.g. NaN) or over-long input, and ``None`` when the
            utterance is short but not in the dictionary (ambiguous).
        """
        # Non-strings (None, NaN floats, numbers) would fail on .strip().
        if not isinstance(target_utt, str):
            return False

        utt = target_utt.strip()
        if not utt:
            return False  # blank text: nothing to classify

        if len(utt.split()) > self.max_words or len(utt) > self.max_length:
            return False  # too long to be a backchannel

        if self.filter_pattern.fullmatch(utt.lower()):
            return True  # high-confidence backchannel

        return None  # ambiguous

    # ---------------- Stage 2 ----------------
    @staticmethod
    def _reply_is_yes(reply: str) -> bool:
        """Return True when the reply's first word is "yes" (any case)."""
        if not isinstance(reply, str):
            return False
        return re.match(r"\W*yes\b", reply, re.IGNORECASE) is not None

    def _ask_llm(self, prompt: str) -> str:
        """Send one user message to the configured model and return its text."""
        # Imported lazily so Stage 1 works without the ollama package.
        import ollama

        response = ollama.chat(
            model=self.model_name,
            messages=[{'role': 'user', 'content': prompt}],
        )
        return response['message']['content']

    def judge_batch(self, previous_utts: pd.Series, target_utts: pd.Series) -> pd.Series:
        """Stage 2: semantic judgment of many utterances with the LLM.

        One chat request is issued per (previous, target) pair, because a
        chat call yields a single reply rather than one reply per prompt.

        Args:
            previous_utts: Context turns, one per target.
            target_utts: Utterances to classify, in the same order.

        Returns:
            A boolean Series indexed like ``previous_utts``; empty input
            yields an empty boolean Series without contacting the model.
        """
        role_prompt = (
            "You are an expert in dialogue analysis. Your task is to determine whether the target utterance "
            "is a backchannel behavior. Backchannel behaviors are brief listener responses that signal attention, "
            "agreement, or understanding (e.g., 'uh-huh', 'yeah', 'right').\n"
            "Only respond with 'yes' if it is a backchannel, and 'no' otherwise. "
            "Do not provide explanations.\n"
            "The target utterance will always be labeled as <Target>."
        )

        verdicts = []
        for prev, targ in zip(previous_utts, target_utts):
            task_prompt = f"<Context> {prev}, <Target>: {targ}"
            reply = self._ask_llm(f"{role_prompt} {task_prompt}")
            verdicts.append(self._reply_is_yes(reply))

        return pd.Series(verdicts, index=previous_utts.index, dtype=bool)

    # ---------------- Full pipeline ----------------
    def annotate_dataframe(self, df: pd.DataFrame, target_col='target_utt', previous_col='previous_utt', out_col='is_backchannel') -> pd.DataFrame:
        """Annotate a dataframe with backchannel detection.

        Args:
            df: Input dataframe; it is not modified.
            target_col: Column holding the utterances to classify.
            previous_col: Column holding the preceding turn for each row.
            out_col: Name of the boolean result column.

        Returns:
            A copy of ``df`` with ``out_col`` added (True/False per row).
        """
        df = df.copy()
        # Stage 1 filter: True / False / None (ambiguous) per row.
        stage1_results = df[target_col].apply(self.filter)

        # Identify ambiguous rows
        ambiguous_mask = stage1_results.isna()
        if ambiguous_mask.any():
            # Stage 2 LLM judgment for the undecided rows only
            llm_results = self.judge_batch(
                df.loc[ambiguous_mask, previous_col],
                df.loc[ambiguous_mask, target_col]
            )
            # Assign positionally so duplicate index labels cannot misalign.
            stage1_results.loc[ambiguous_mask] = llm_results.to_numpy()

        # Ensure boolean type
        df[out_col] = stage1_results.astype(bool)
        return df


In [None]:
import ollama

# Model tag, presumably meant for the ollama.chat calls in this notebook.
# NOTE(review): the visible code reads `self.model_name`, not this global —
# confirm how this value is supposed to reach the judge.
model_name="llama3"

In [None]:
def judge(self) -> bool:
    """Ask the chat model whether ``self.target_utt`` is a backchannel.

    ``self`` must provide ``previous_utt`` (context turn), ``target_utt``
    (utterance to classify) and ``model_name`` (passed to ``ollama.chat``).

    Returns:
        ``True`` if the model's reply starts with "yes" (any case, leading
        punctuation ignored), otherwise ``False``.
    """
    import re  # local import keeps this cell self-contained

    role_prompt = (
        "You are an expert in dialogue analysis. Your task is to determine whether the target utterance "
        "is a backchannel behavior. Backchannel behaviors are brief listener responses that signal attention, "
        "agreement, or understanding (e.g., 'uh-huh', 'yeah', 'right').\n"
        "Only respond with 'yes' if it is a backchannel, and 'no' otherwise. "
        "Do not provide explanations.\n"
        "You will be given the previous turn as context. "
        "The target utterance will always be labeled as <Target>."
    )

    task_prompt = f"<Context> {self.previous_utt}, <Target>: {self.target_utt}"

    messages = [
        {
            'role': 'user',
            'content': f"{role_prompt} {task_prompt}",
        }
    ]
    response = ollama.chat(model=self.model_name, messages=messages)
    reply = response['message']['content']
    # The reply is free text; returning it directly would make "no" truthy.
    return isinstance(reply, str) and re.match(r"\W*yes\b", reply, re.IGNORECASE) is not None

