**Loading of libaries and dataset**

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
# from datasets import load_dataset
import pandas as pd
import re
from typing import List
import requests
from bs4 import BeautifulSoup

**Main preprocessing procedure**

In [None]:
def pattern_cleaning(
    df: pd.DataFrame,
    exceptions: List[int]
    ) -> pd.DataFrame:
    """
    Function that cleans 4 unwanted patterns from the dataset
    regarding, indexing of questions, special characters, speaker's name
    and description of questions.

    Arguments:
    df – Dataframe to be cleaned
    exceptions - exception list of indexes where the disception of the
    question is needed

    Returns:
    df – Cleaned dataframe
    """

    """
    Regex explanation:

    ^ matches the start of the string
    (\d+\.|Part \d+:|Q\d*:|\d+\. Q\d*: ) is a capturing group that
    matches one of the following:
        \d+\. : one or more digits followed by a period

        Part \d+: : the string "Part " followed by one or more digits,
        a colon, and an optional space

        Q\d*: : the string "Q" followed by one or more digits, a colon,
        and an optional space

        \d+\. Q\d*: : one or more digits followed by a period, a space,
        "Q", one or more digits, a colon, and an optional space

        - : start sentence with "-"
    """

    # 1) Remove indexing from questions
    index_pattern = r'^(\d+\. Q\d+:|\d+\.|Part \d+:|Q\d+:|-)'
    df['question'] = df['question'].str.replace(
        index_pattern,
        '',
        regex=True
        )

    # 2) Remove quotes and new line espace characters
    df['question'] = df['question'].str.replace(
        r'["\n]',
        '',
        regex=True
        )
    df['interview_answer'] = df['interview_answer'].str.replace(
        r'\n',
        '',
        regex=True
        )

    # 3) Remove first sentence from answer (indicates which present is
    # speaking)
    sentence_pattern = r'^[^.]+\.?'
    df['interview_answer'] = df['interview_answer'].str.replace(
        sentence_pattern,
        '',
        regex=True
        )

    # 4) Remove description from questions
    df.loc[~df.index.isin(exceptions), 'question'] = df.loc[
        ~df.index.isin(exceptions), 'question'].apply(
        lambda x: re.sub(r'^[^:]+: ', '', x))
    return df


def get_italic_sentences(url: str) -> list:
    """
    Function to get italic sentences from a url, optimized with error
    handling

    Arguments:
    url - Link of the text

    Returns:
    Text with italics except specific phrases
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise exception for bad responses
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract text from the <div> with class "field-docs-content"
        div_content = soup.find('div', class_='field-docs-content')

        # Return an empty list if the div is not found
        if div_content is None:
            return []

        exception_list = {
            "The President.",
            "Q.",
            "Inaudible",
            "inaudible"
            }

        # Extract unique sentences from <i> or <em> tags, excluding
        # specific phrases
        italic_sentences = {
            i.get_text(strip=True)
            for i in div_content.find_all(['i', 'em'])
            }
        return [
            sentence
            for sentence in italic_sentences
            if sentence not in exception_list
            ]

    except (requests.RequestException, AttributeError) as e:
        print(f"Error retrieving or parsing {url}: {e}")
        return []


def clean_interview_answer(row: pd.Series, url_sentences: set) -> str:
    """
    Remove unnecessary sentences from a interview_answer in a
    vectorized manner

    Arguments:
    row: row of a dataframe
    url_sentences: set of unique sentences to be removed
    from interview answer of a text coming from a particular
    url

    Returns:
    Interview answer string with removed sentences
    """
    unique_sentences = url_sentences.get(row['url'], [])
    interview_answer = row['interview_answer']
    for sentence in unique_sentences:
        interview_answer = interview_answer.replace(sentence, '')
    return interview_answer


def remove_unrelated_text(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to remove italic sentences from the 'interview_answer' column.

    Arguments:
    df – Dataframe to be cleaned

    Returns:
    df – Cleaned dataframe
    """

    # Create a dictionary to store unique sentences for each URL
    url_sentences = {}

    # Create a dictionary to store unique sentences for each URL
    unique_urls = df['url'].unique()

    # Get sentences for each URL (optionally use parallel processing for
    # speedup)
    for url in unique_urls:
        url_sentences[url] = get_italic_sentences(url)

    df['interview_answer'] = df.apply(
        lambda x: clean_interview_answer(x, url_sentences), axis=1)

    # Optional: Clean up whitespace after sentence removal
    df['interview_answer'] = df['interview_answer'].str.replace(
        r'\s+', ' ',
        regex=True
        ).str.strip()

    return df

def extra_labels(df: pd.DataFrame) -> pd.DataFrame:
  """
  Add inadible and multiple question labels to the dataset

  Arguments:
  df – Dataframe

  Returns:
  df – Labeled dataframe
  """
  df_train["inaudible"] = df_train['interview_answer'].str.contains('inaudible', case=False)
  df_train["multiple_questions"] = df_train['question'].str.count('\?') > 1
  df_train["affirmative_questions"] = ~df_train['question'].str.contains('\?')
  return df_train

In [None]:
# Load train dataset
ds = load_dataset("ailsntua/QEvasion")

# Convert to pandas and keep only useful columns
df_train = ds["train"].to_pandas()[["question","interview_question",
                                    "interview_answer", "label","url"]]

# Remove unwanted patterns
exception_list = [142,493,699,809,1052,1053,1446,
                  2417,2631,2821,3181,3390]
df_train = pattern_cleaning(df_train, exception_list)

# Extract noise from the end of interview answer
df_train = remove_unrelated_text(df_train)

# Add 2 more labels for multiple questions and inadible speech
df_train = extra_labels(df_train)

df_train.to_csv('train_set.csv', index=False)

train.csv:   0%|          | 0.00/14.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3448 [00:00<?, ? examples/s]

---

**Exploring unhandled affirmative questions**

In [None]:
df_train.to_csv('output.csv', index=False)

In [None]:
df_train = pd.read_csv('output.csv')
df_train['label'].unique()

array(['Explicit', 'General', 'Partial/half-answer', 'Dodging',
       'Implicit', 'Deflection', 'Declining to answer',
       'Claims ignorance', 'Clarification'], dtype=object)

In [None]:
filtered_no_quest_df = df_train[~df_train['question'].str.contains('\?')]
len(filtered_no_quest_df)

772

In [None]:
len(df_train[df_train['question'].str.contains('\?')])
len(df_train)

3448

In [None]:
filtered_no_quest_df

Unnamed: 0,question,interview_question,interview_answer,label
8,1. Concerns about the lack of communication be...,"\nQ. Well, let me ask you about—you've spent l...",\nThe President. It's not a wedge issue of the...,Explicit
9,2. Inquiry about the reaction of Kyiv regardin...,"\nQ. Well, let me ask you about—you've spent l...",\nThe President. It's not a wedge issue of the...,Explicit
16,ensuring Finland that the U.S. will remain a r...,"\nQ. In Washington, a bipartisan group of Sena...",\nPresident Biden. I absolutely guarantee it. ...,Explicit
18,Concerns about the comments motivating Putin ...,"\nQ. Thank you, Mr. President. You've said tha...","\nPresident Biden. First of all, no one can jo...",Deflection
19,The risk of the war dragging on for years.,"\nQ. Thank you, Mr. President. You've said tha...","\nPresident Biden. First of all, no one can jo...",Dodging
...,...,...,...,...
3403,I wonder what your reaction is to that,\nQ. But the results are being interpreted as ...,"\nThe President. You know, I really haven't—I'...",Explicit
3415,Asking for an explanation of not knowing somet...,\nQ. How could you not know that and not be ou...,"\nThe President. You didn't know it, either.",Dodging
3416,Adjustments to the agenda regarding Social Sec...,"\nQ. Mr. President, you mentioned entitlements...","\nThe President. I told—Ken, I told Hank Pauls...",General
3430,Secretary Rumsfeld Accountability,"\nQ. When you first ran for President, sir, yo...","\nThe President. Peter, you're asking me why I...",Dodging


In [None]:
len(df_train[df_train['affirmative_questions']==True])

772

In [None]:
df_train[df_train['affirmative_questions']==True]

Unnamed: 0,question,interview_question,interview_answer,label,url,inaudible,multiple_questions,affirmative_questions
8,Concerns about the lack of communication betw...,"\nQ. Well, let me ask you about—you've spent l...",It's not a wedge issue of the Global South. It...,Explicit,https://www.presidency.ucsb.edu/documents/the-...,False,False,True
9,Inquiry about the reaction of Kyiv regarding ...,"\nQ. Well, let me ask you about—you've spent l...",It's not a wedge issue of the Global South. It...,Explicit,https://www.presidency.ucsb.edu/documents/the-...,False,False,True
16,ensuring Finland that the U.S. will remain a r...,"\nQ. In Washington, a bipartisan group of Sena...",I absolutely guarantee it. There is no questio...,Explicit,https://www.presidency.ucsb.edu/documents/the-...,False,False,True
18,Concerns about the comments motivating Putin ...,"\nQ. Thank you, Mr. President. You've said tha...","First of all, no one can join NATO while the w...",Deflection,https://www.presidency.ucsb.edu/documents/the-...,False,False,True
19,The risk of the war dragging on for years.,"\nQ. Thank you, Mr. President. You've said tha...","First of all, no one can join NATO while the w...",Dodging,https://www.presidency.ucsb.edu/documents/the-...,False,False,True
...,...,...,...,...,...,...,...,...
3403,I wonder what your reaction is to that,\nQ. But the results are being interpreted as ...,"You know, I really haven't—I'm still going to ...",Explicit,https://www.presidency.ucsb.edu/documents/the-...,False,False,True
3415,Asking for an explanation of not knowing somet...,\nQ. How could you not know that and not be ou...,"You didn't know it, either.",Dodging,https://www.presidency.ucsb.edu/documents/the-...,False,False,True
3416,Adjustments to the agenda regarding Social Sec...,"\nQ. Mr. President, you mentioned entitlements...","I told—Ken, I told Hank Paulson to tell the Me...",General,https://www.presidency.ucsb.edu/documents/the-...,False,False,True
3430,Secretary Rumsfeld Accountability,"\nQ. When you first ran for President, sir, yo...","Peter, you're asking me why I believe Secretar...",Dodging,https://www.presidency.ucsb.edu/documents/the-...,False,False,True




---



**Preprocessing of LLM rephrazed question**

In [None]:
df2 = pd.read_csv('output2.csv')

In [None]:
# Remove LLM explanation and quotes
df2['rephrazed_question'] = df2['rephrazed_question'].str.replace(r'^[^:]*:\s*', '', regex=True)
df2['rephrazed_question'] = df2['rephrazed_question'].str.replace(
        r'["\n]',
        '',
        regex=True
        )

In [None]:
df2.to_csv('output3.csv')



---



**Test data preprocessing**

In [None]:
df_test = pd.read_csv('test_set_with_label.csv')

column_mapping = {
    'Question': 'question',
    'Interview Answer': 'interview_answer'
    }

df_test = df_test.rename(columns=column_mapping)

exceptions = [160, 176, 207, 309]
df_test = pattern_cleaning(df_test, exceptions)

# Extract noise from the end of interview answer
df_test = remove_unrelated_text(df_test)

# Add 2 more labels for multiple questions and inadible speech
df_test = extra_labels(df_test)

df_test.to_csv('test_set.csv')

In [3]:
df_test1 = pd.read_csv('test_set_with_label.csv')
df_test1

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Interview Question,Interview Answer,Question,Annotator1,Annotator2,Annotator3,Label
0,0,0,"\r\nQ. What about the redline, sir?","\r\nThe President. Well, the world has made it...",Inquiring about the status or information reg...,2.1 Dodging,2.4 General,2.1 Dodging,Indirect
1,1,1,\r\nQ. Will you invite them to the White House...,\r\nPresident Obama. I think that anytime and ...,Will you invite them to the White House to neg...,2.2 Deflection,2.4 General,2.4 General,Indirect
2,2,2,"\r\nQ. Harsh. Mr. President, Japan has dropped...",\r\nThe President. I think that the purpose of...,Why was it necessary for Japan to drop the thr...,1.1 Explicit,1.2 Implicit,1.2 Implicit,Indirect
3,3,3,\r\nQ. The Lebanese Prime Minister is demandin...,\r\nThe President. I'll let Condi talk about t...,When will we see this resolution?,1.1 Explicit,2.4 General,2.4 General,Indirect
4,4,4,"\r\nQ. Thank you, Mr. President. Back on Iraq,...","\r\nThe President. No, I don't consider it a c...",Updating the figure of Iraqi deaths,2.1 Dodging,1.2 Implicit,2.1 Dodging,Indirect
...,...,...,...,...,...,...,...,...,...
312,312,312,"\r\nQ. If so, why? And do you believe that the...",\r\nThe President. I believe that the situatio...,"If so, why?",1.2 Implicit,2.1 Dodging,1.2 Implicit,Indirect
313,313,313,"\r\nQ. Yes, indeed. In reading the 1559 resolu...","\r\nThe President. Well, the people who should...",Request for the speaker's understanding and re...,2.2 Deflection,2.2 Deflection,2.2 Deflection,Indirect
314,314,314,"\r\nQ. Mr. President, we know that you talked ...",\r\nPresident Bush. We strategized on both iss...,"Mr. President, we know that you talked about I...",1.1 Explicit,1.1 Explicit,1.1 Explicit,Direct Reply
315,315,315,\r\nQ. Is there anything that's disappointed y...,\r\nThe President. I was concerned at first ab...,Is there anything that's disappointed you abo...,2.2 Deflection,1.2 Implicit,1.2 Implicit,Indirect


In [7]:
df_test2 = pd.read_csv('test_set.csv')
df_test2

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Interview Question,Interview Answer,Question,Annotator1,Annotator2,Annotator3,link
0,0,0,"\nQ. What about the redline, sir?","\nThe President. Well, the world has made it c...",Inquiring about the status or information reg...,2.1 Dodging,2.4 General,2.1 Dodging,https://www.presidency.ucsb.edu/documents/the-...
1,1,1,\nQ. Will you invite them to the White House t...,\nPresident Obama. I think that anytime and an...,Will you invite them to the White House to neg...,2.2 Deflection,2.4 General,2.4 General,https://www.presidency.ucsb.edu/documents/the-...
2,2,2,"\nQ. Harsh. Mr. President, Japan has dropped t...",\nThe President. I think that the purpose of t...,Why was it necessary for Japan to drop the thr...,1.1 Explicit,1.2 Implicit,1.2 Implicit,https://www.presidency.ucsb.edu/documents/the-...
3,3,3,\nQ. The Lebanese Prime Minister is demanding ...,\nThe President. I'll let Condi talk about the...,When will we see this resolution?,1.1 Explicit,2.4 General,2.4 General,https://www.presidency.ucsb.edu/documents/the-...
4,4,4,"\nQ. Thank you, Mr. President. Back on Iraq, a...","\nThe President. No, I don't consider it a cre...",Updating the figure of Iraqi deaths,2.1 Dodging,1.2 Implicit,2.1 Dodging,https://www.presidency.ucsb.edu/documents/the-...
...,...,...,...,...,...,...,...,...,...
312,312,312,"\nQ. If so, why? And do you believe that the b...",\nThe President. I believe that the situation ...,"If so, why?",1.2 Implicit,2.1 Dodging,1.2 Implicit,https://www.presidency.ucsb.edu/documents/the-...
313,313,313,"\nQ. Yes, indeed. In reading the 1559 resoluti...","\nThe President. Well, the people who should g...",Request for the speaker's understanding and re...,2.2 Deflection,2.2 Deflection,2.2 Deflection,https://www.presidency.ucsb.edu/documents/the-...
314,314,314,"\nQ. Mr. President, we know that you talked ab...",\nPresident Bush. We strategized on both issue...,"Mr. President, we know that you talked about I...",1.1 Explicit,1.1 Explicit,1.1 Explicit,https://www.presidency.ucsb.edu/documents/the-...
315,315,315,\nQ. Is there anything that's disappointed you...,\nThe President. I was concerned at first abou...,Is there anything that's disappointed you abo...,2.2 Deflection,1.2 Implicit,1.2 Implicit,https://www.presidency.ucsb.edu/documents/the-...


In [40]:
df_test1 = pd.read_csv('test_set_with_label.csv')
df_test2 = pd.read_csv('test_set_with_link.csv')[["Unnamed: 0.1","link"]]
df_merge_col = pd.merge(df_test1, df_test2, on='Unnamed: 0.1')
df_merge_col = df_merge_col.drop(columns=["Unnamed: 0.1", "Unnamed: 0"])
df_merge_col.dropna(subset=["link"], inplace=True)
df_merge_col.reset_index(drop=True, inplace=True)
df_merge_col.to_csv('test_set.csv')