In [6]:
import random
from datasets import load_dataset
import nltk

# Fetch the Punkt sentence-tokenizer data; nltk.sent_tokenize is used by the helpers below.
# NOTE(review): newer NLTK releases may look for the "punkt_tab" resource instead —
# confirm against the installed NLTK version.
nltk.download("punkt")

def _fix_sentences(sentences):
    merged_sentences = []

    for i, sentence in enumerate(sentences):
        if i > 0 and sentence.startswith(("'", '"', "“", "!", "—")):
            merged_sentences[-1] += " " + sentence
        else:
            merged_sentences.append(sentence)

    return merged_sentences


def count_sentences(text):
    """Return the number of sentences in ``text``.

    The text is tokenized with ``nltk.sent_tokenize`` and the result is passed
    through ``_fix_sentences`` before counting, so merged continuation
    fragments are not counted as separate sentences.

    Args:
        text: The string to analyse.

    Returns:
        The sentence count as an ``int``.
    """
    raw_sentences = nltk.sent_tokenize(text)
    repaired = _fix_sentences(raw_sentences)
    return len(repaired)


def split_sentences(text, rng=None):
    """Split ``text`` into two pieces at a sentence boundary.

    The text is tokenized with ``nltk.sent_tokenize`` and cleaned up with
    ``_fix_sentences``. What happens next depends on the sentence count:

    * fewer than two sentences: the text cannot be split, so ``(text, "")``
      is returned;
    * exactly two sentences: the two sentences are returned as they are
      (no random number is drawn);
    * more than two sentences: a boundary is drawn uniformly at random so
      that both pieces contain at least one sentence.

    Args:
        text: The string to split.
        rng: Optional random source exposing ``randint(a, b)`` (for example a
            ``random.Random(seed)`` instance). When ``None`` (the default) the
            global ``random`` module is used, so results are reproducible only
            if the caller seeds it with ``random.seed(...)`` beforehand.

    Returns:
        A ``(first_piece, second_piece)`` tuple of strings.
    """
    merged_sentences = _fix_sentences(nltk.sent_tokenize(text))

    if len(merged_sentences) < 2:
        return text, ""
    if len(merged_sentences) == 2:
        # Kept as a separate branch on purpose: it does not consume a random
        # number, so the random stream seen by later calls is unchanged.
        return merged_sentences[0], merged_sentences[1]

    chooser = random if rng is None else rng
    # randint is inclusive on both ends: 1..len-1 keeps both pieces non-empty.
    split_index = chooser.randint(1, len(merged_sentences) - 1)
    first_piece = " ".join(merged_sentences[:split_index]).strip()
    second_piece = " ".join(merged_sentences[split_index:]).strip()

    return first_piece, second_piece

[nltk_data] Downloading package punkt to /Users/mandlc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Load the BABE train split and the pool of texts used as in-context examples.
df = load_dataset("mediabiasgroup/BABE")["train"].to_pandas()
icl_pool = load_dataset("mediabiasgroup/BABE-icl-pool")["train"].to_pandas()

# Anti-join on "text": keep only rows whose text does not occur in the ICL pool.
df = df.merge(icl_pool["text"], on="text", how="left", indicator=True)
df = df[df["_merge"] == "left_only"].drop(columns="_merge")


In [8]:
# Count the (merged) sentences of every remaining text.
df["sentence_count"] = df["text"].apply(count_sentences)

# Only texts with more than one sentence are kept for splitting.
df_sentences = df.loc[df["sentence_count"] > 1].reset_index(drop=True)

# Summary statistics over all texts (not just the multi-sentence subset).
min_sentence_count = df["sentence_count"].min()
max_sentence_count = df["sentence_count"].max()
mean_sentence_count = df["sentence_count"].mean()

print("Minimum sentence count:", min_sentence_count)
print("Maximum sentence count:", max_sentence_count)
print("Mean sentence count:", mean_sentence_count)


Minimum sentence count: 1
Maximum sentence count: 5
Mean sentence count: 1.101952995696789


In [9]:
import pandas as pd


# split_sentences draws its split point from the global `random` module.
# Seed it here so the pieces written to the CSV are the same on every
# "Restart & Run All", in line with the seeded row sampling below.
random.seed(42)

# Split every multi-sentence text into a first and a second piece.
df_new = df_sentences[["text", "label"]].copy()
df_new[["first_piece", "second_piece"]] = df_new["text"].apply(lambda x: pd.Series(split_sentences(x)))

# Draw 10 rows per label with a fixed seed.
df_new = df_new.groupby("label").sample(10, random_state=42)

# map label (same as in their paper)
df_new["label"] = df_new["label"].replace({0: "0 (not biased)", 1: "1 (biased)"})
df_new.to_csv("babe_contamination.csv", index=False)

In [10]:
# Display the sampled rows (the same frame that was written to babe_contamination.csv).
df_new

Unnamed: 0,text,label,first_piece,second_piece
107,"In California, where personal income taxes acc...",0 (not biased),"In California, where personal income taxes acc...","Palmer, a spokesman for the state’s finance de..."
191,Victims of the violence endured potentially li...,0 (not biased),Victims of the violence endured potentially li...,D.C. Police Chief Peter Newsham told protester...
230,"But last Wednesday, when the Heritage Clinic f...",0 (not biased),"But last Wednesday, when the Heritage Clinic f...","Ignoring the no-trespassing signs, they began ..."
66,President Donald Trump and Congress are taking...,0 (not biased),President Donald Trump and Congress are taking...,"Billy Graham last Wednesday, through a preside..."
194,The protestors reportedly are demanding that ...,0 (not biased),The protestors reportedly are demanding that ...,Andrew Cuomo hike taxes on the state’s billion...
23,Before we get to work dispensing these argumen...,0 (not biased),Before we get to work dispensing these argumen...,"A lot of Americans, especially those who got t..."
101,President Donald Trump says that voting by mai...,0 (not biased),President Donald Trump says that voting by mai...,But county-level election supervisors in his h...
241,"Two years ago, America’s white nationalist mov...",0 (not biased),"Two years ago, America’s white nationalist mov...","Neo-Nazi demonstrations in Charlottesville, Vi..."
20,Two Democrats -- state Sens. Creigh Deeds and ...,0 (not biased),Two Democrats -- state Sens.,Creigh Deeds and Chap Petersen -- joined with ...
156,"Japan is facing a major challenge, with the pu...",0 (not biased),"Japan is facing a major challenge, with the pu...","Japan has already invested billions, and the d..."
