In [2]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [5]:
# Input location: <dirpath>/<dataset>/final_test.csv
dirpath, dataset = Path("PreprocessedData"), "Humi"
filepath = dirpath.joinpath(dataset, "final_test.csv")
df = pd.read_csv(filepath)
df.shape  # last expression: shown as the cell output

(14930, 4)

In [6]:
def get_columns(dataset):
    """Look up the column layout for a named dataset.

    The first element of the returned tuple is always the alphabetically
    sorted list of label columns. For "go_emo", "unhealthy", "docanno" and
    "aggression" the remaining elements are (annotator column, text column).
    For "Humi" they are (id column, annotator column, text column), so that
    tuple has four elements instead of three.

    Returns None when `dataset` is not one of the names above.
    """
    layouts = {
        "go_emo": (
            [
                'admiration', 'amusement', 'anger', 'annoyance', 'approval',
                'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
                'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
                'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
                'pride', 'realization', 'relief', 'remorse', 'sadness',
                'surprise', 'neutral',
            ],
            "rater_id", "text",
        ),
        "unhealthy": (
            [
                "antagonize", "condescending", "dismissive", "generalisation",
                "generalisation_unfair", "healthy", "hostile", "sarcastic",
            ],
            "_worker_id", "comment",
        ),
        "docanno": (
            [
                'inspiring', 'interesting', 'offensive_to_someone', 'negative',
                'offensive_to_me', 'political', 'positive', 'sadness', 'calm',
                'fear', 'compassion', 'disgust', 'vulgar', 'surprise',
                'embarrasing', 'anger', 'understandable', 'ironic',
                'need_more_information', 'happiness', 'delight',
                'funny_to_someone', 'funny_to_me',
            ],
            "user_id", "text",
        ),
        "aggression": (["aggression"], "worker_id", "comment"),
        "Humi": (["grade"], "id", "annotator_id", "original"),
    }

    layout = layouts.get(dataset)
    if layout is None:
        return None

    # Sort on the way out so every call hands back a fresh, ordered list.
    labels, *other_columns = layout
    return (sorted(labels), *other_columns)

In [16]:
# Resolve the column layout for the selected dataset.
columns = get_columns(dataset)
if columns is None:
    raise ValueError(f"Unsupported dataset: {dataset!r}")

# get_columns returns (labels, annotator, text) for most datasets and
# (labels, id, annotator, text) for "Humi". The starred target absorbs the
# optional id column so both layouts unpack without error.
label_columns, *optional_id, annotator_column, text_column = columns
# Deliberately not named `id`, which would shadow the builtin for later cells.
id_column = optional_id[0] if optional_id else None

# Number of few-shot examples attached to every row.
num_shots = 2

In [17]:
def parse_annotation_to_text(sample):
    """Render one annotated row as the response used in a few-shot example.

    Reads the module-level `dataset` and `label_columns`:

    * "aggression": "true" when the first label column equals 1, else "false".
    * "Humi": the raw value of the first label column, returned unchanged
      (no conversion to str is performed).
    * any other dataset: the names of all label columns whose value equals 1,
      joined with ", " (an empty string when none match).
    """
    if dataset == "aggression":
        is_positive = sample[label_columns[0]] == 1
        return "true" if is_positive else "false"

    if dataset == 'Humi':
        return sample[label_columns[0]]

    active_labels = [label for label in label_columns if sample[label] == 1]
    return ", ".join(active_labels)

In [18]:
def get_examples(df, sample, num_shots=2, random_state=None):
    """Draw few-shot examples annotated by the same annotator as `sample`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the examples are drawn from. Rows are filtered on the
        module-level `annotator_column`.
    sample : pandas.Series
        A row that carries an "index" entry holding its own index label in
        `df` (e.g. a row of `df.reset_index()`); that row is excluded from
        the candidates so a sample is never its own example.
    num_shots : int, default 2
        Maximum number of examples to draw.
    random_state : optional
        Forwarded to `DataFrame.sample`. The default None keeps the original
        unseeded behaviour. Pass a seed for a reproducible draw, or a shared
        `numpy.random.Generator` to get one reproducible stream across calls.

    Returns
    -------
    list
        `[text_1, response_1, ..., text_k, response_k]`, padded with None up
        to a length of `2 * num_shots` when the annotator has fewer than
        `num_shots` other rows.
    """
    same_annotator = df[annotator_column] == sample[annotator_column]
    candidates = df[same_annotator].drop(sample["index"])

    # Take every candidate when there are fewer than requested.
    n_examples = min(num_shots, candidates.shape[0])
    examples = candidates.sample(n=n_examples, random_state=random_state)

    extracted_return = []
    for _, ex in examples.iterrows():
        extracted_return.append(ex[text_column])
        extracted_return.append(parse_annotation_to_text(ex))

    # Pad so every row expands to the same number of columns.
    extracted_return.extend([None] * (2 * num_shots - len(extracted_return)))
    return extracted_return

In [19]:
tqdm.pandas()  # registers DataFrame.progress_apply

# Two output columns per shot: the example text and its response.
new_columns = [
    name
    for i in range(1, num_shots + 1)
    for name in (f"example{i}", f"example{i}_response")
]

# reset_index() exposes each row's index label as an "index" column, which
# get_examples uses to exclude the row itself from its own examples.
# num_shots is passed through (instead of a literal) so the number of returned
# values always matches len(new_columns).
df[new_columns] = df.reset_index().progress_apply(
    lambda sample: get_examples(df, sample, num_shots),
    axis=1,
    result_type="expand",
)

100%|██████████| 14930/14930 [00:15<00:00, 969.52it/s] 


In [21]:
df[30:50]

Unnamed: 0,id,original,annotator_id,grade,example1,example1_response,example2,example2_response
30,11975,Flynn has promised Special Counsel ' full coop...,1,3,Trump would love to invite Mueller . But here ...,1,GOP Health Care Bill Would Cut About $ 765 Bil...,1
31,11975,Flynn has promised Special Counsel ' full coop...,2,2,"Trump 's CIA pick is career whiner , oversaw s...",1,"Trump speaks on Scalise shooting , calling for...",1
32,11975,Flynn has promised Special Counsel ' full coop...,3,2,Putin says Kim Jong Un won this obesity .,0,Trump on ‘ kid ’s March ’ : ‘ Why Did n’t Thes...,1
33,11975,Flynn has promised Special Counsel ' full coop...,4,1,Report : Senate 's Space probe understaffed,0,Harassment case puts Democratic Senate candida...,0
34,11975,Flynn has promised Special Counsel ' full coop...,5,0,Saudi bee reportedly plans to skip G-20 summit...,0,Martin Shkreli sent to corner,0
35,10047,Jones does n't think Trump should resign over ...,1,2,Donald Trump ’s nuked leftovers : How can a fl...,3,"Amazon , Facebook and Google could save billio...",2
36,10047,Jones does n't think Trump should resign over ...,2,1,Jones does n't think Trump should resign over ...,0,Mattis to brief Trump on options for military ...,1
37,10047,Jones does n't think Trump should resign over ...,3,1,' Armchair ' psychiatry in the media should si...,1,European Union ready to retaliate against Trum...,0
38,10047,Jones does n't think Trump should resign over ...,4,1,New York Times op-ed page punches down at medd...,1,AP Exclusive : More laziness in Flynn consulti...,0
39,10047,Jones does n't think Trump should resign over ...,5,1,The Trump administration is using Obamacare ma...,0,The Middle-Class abduction Is n't Made Up,0


In [22]:
# Derive the output name from num_shots so the file name always matches the
# number of examples actually attached ("final_few2_test.csv" for 2 shots).
filepath = dirpath / dataset / f"final_few{num_shots}_test.csv"

In [23]:
# Persist the augmented frame; the index is not written to the file.
df.to_csv(path_or_buf=filepath, index=False)