In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Input location: <dirpath>/<dataset>/Jester_test_300.csv
# NOTE(review): get_columns() in this notebook has no branch for "jester_tmp",
# so the column lookup further down cannot succeed with this value — confirm
# which dataset key is intended here.
dataset = "jester_tmp"
dirpath = Path("PreprocessedData")
filepath = dirpath.joinpath(dataset, "Jester_test_300.csv")
df = pd.read_csv(filepath)
# Last expression of the cell: show (rows, columns) of the loaded frame.
df.shape

(14930, 5)

In [11]:
def get_columns(dataset):
    """Look up the column layout registered for a dataset name.

    The result is a tuple whose first element is a freshly sorted list of the
    dataset's label columns, followed by the remaining column names.

    Note that the tuple length is not uniform: ``'Humi'`` and ``'Humi_multi'``
    produce four elements (labels, ``"id"``, annotator column, text column),
    while the other known datasets produce three (labels, annotator column,
    text column). An unrecognised name yields ``None``.
    """
    # (dataset name, label columns, remaining columns) — checked in order.
    layouts = (
        (
            "go_emo",
            [
                'admiration', 'amusement', 'anger', 'annoyance', 'approval',
                'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
                'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
                'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
                'pride', 'realization', 'relief', 'remorse', 'sadness',
                'surprise', 'neutral',
            ],
            ("rater_id", "text"),
        ),
        (
            "unhealthy",
            [
                "antagonize", "condescending", "dismissive", "generalisation",
                "generalisation_unfair", "healthy", "hostile", "sarcastic",
            ],
            ("_worker_id", "comment"),
        ),
        (
            "docanno",
            [
                'inspiring', 'interesting', 'offensive_to_someone', 'negative',
                'offensive_to_me', 'political', 'positive', 'sadness', 'calm',
                'fear', 'compassion', 'disgust', 'vulgar', 'surprise',
                'embarrasing', 'anger', 'understandable', 'ironic',
                'need_more_information', 'happiness', 'delight',
                'funny_to_someone', 'funny_to_me',
            ],
            ("user_id", "text"),
        ),
        ("aggression", ["aggression"], ("worker_id", "comment")),
        ('Humi', ["grade"], ("id", "annotator_id", "original")),
        (
            'Humi_multi',
            ["is_humorous", "is_not_humorous"],
            ("id", "annotator_id", "original"),
        ),
    )
    for name, labels, other_columns in layouts:
        if dataset == name:
            # sorted() builds a new list on every call, so callers never share state.
            return (sorted(labels), *other_columns)
    return None

In [7]:
# Resolve the column layout for the configured dataset and fail early with a
# readable message when it cannot be unpacked into the four names used below
# (get_columns returns None for unknown names and a 3-tuple for some datasets).
column_spec = get_columns(dataset)
if column_spec is None or len(column_spec) != 4:
    raise ValueError(
        f"get_columns({dataset!r}) returned {column_spec!r}; expected a 4-tuple "
        "(label_columns, id, annotator_column, text_column)"
    )
# NOTE(review): `id` shadows the built-in of the same name; the name is kept
# because other cells may refer to it.
label_columns, id, annotator_column, text_column = column_spec
num_shots = 2  # number of examples sampled for each row

In [8]:
def parse_annotation_to_text(sample):
    """Render the annotation stored in ``sample`` as a response value.

    Relies on the notebook-level ``dataset`` and ``label_columns``:

    * ``"aggression"``: ``"true"`` when the first label column equals 1,
      otherwise ``"false"``.
    * ``'Humi'``: the value of the first label column, returned unchanged.
    * any other dataset: the names of all label columns whose value equals 1,
      joined with ``", "`` (an empty string when none are set).
    """
    if dataset == "aggression":
        if sample[label_columns[0]] == 1:
            return "true"
        return "false"
    if dataset == 'Humi':
        return sample[label_columns[0]]
    active_labels = []
    for label in label_columns:
        if sample[label] == 1:
            active_labels.append(label)
    return ", ".join(active_labels)

In [9]:
def get_examples(df, sample, num_shots=2, random_state=None):
    """Sample up to ``num_shots`` other rows by the same annotator as ``sample``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to draw examples from; must contain the notebook-level
        ``annotator_column`` and ``text_column`` plus whatever columns
        ``parse_annotation_to_text`` reads.
    sample : pandas.Series
        The row to build examples for. Its own row is removed from the
        candidates, identified by ``sample["index"]`` (as produced by
        ``df.reset_index()``) or, when that field is absent, by ``sample.name``.
    num_shots : int, default 2
        Number of (text, response) pairs to return.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous behaviour. An integer re-seeds on every call, so to vary the
        draw across rows pass one shared ``numpy.random.RandomState`` /
        ``Generator`` instead.

    Returns
    -------
    list
        ``[text1, response1, text2, response2, ...]`` of length
        ``2 * num_shots``; missing pairs are filled with ``None`` when the
        annotator has fewer than ``num_shots`` other rows.
    """
    # Prefer the explicit "index" field; fall back to the Series name so the
    # function also works on rows that did not go through reset_index().
    row_label = sample["index"] if "index" in sample else sample.name
    same_annotator = df[annotator_column] == sample[annotator_column]
    annotator_texts = df[same_annotator].drop(row_label)

    if annotator_texts.shape[0] < num_shots:
        # Too few candidates: take all of them, in shuffled order.
        examples = annotator_texts.sample(frac=1, random_state=random_state)
    else:
        examples = annotator_texts.sample(num_shots, random_state=random_state)

    extracted_return = []
    for _, ex in examples.iterrows():
        extracted_return.extend((ex[text_column], parse_annotation_to_text(ex)))

    # Pad so every row expands to the same number of columns.
    extracted_return.extend([None] * (2 * num_shots - len(extracted_return)))
    return extracted_return

In [10]:
tqdm.pandas()  # registers DataFrame.progress_apply

# DataFrame.sample falls back to NumPy's global RNG when no random_state is
# given, so seeding it here makes the sampled examples repeatable across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# example1, example1_response, example2, example2_response, ...
new_columns = [
    column
    for i in range(num_shots)
    for column in (f"example{i+1}", f"example{i+1}_response")
]
# Pass num_shots (not a literal) so the number of returned values always
# matches the number of target columns. reset_index() exposes each row's
# label as sample["index"], which get_examples uses to exclude the row itself.
df[new_columns] = df.reset_index().progress_apply(
    lambda sample: get_examples(df, sample, num_shots),
    axis=1,
    result_type="expand",
)

100%|██████████| 14930/14930 [00:17<00:00, 842.79it/s]


In [12]:
# Preview rows at positions 30-49, including the newly added example columns.
df.iloc[30:50]

Unnamed: 0,id,original,annotator_id,is_humorous,is_not_humorous,example1,example1_response,example2,example2_response
30,11975,Flynn has promised Special Counsel ' full coop...,1,1,0,Pakistan Calls On donkey to Help Restore Order...,is_humorous,Quebec ’s legislators last week unanimously pa...,is_humorous
31,11975,Flynn has promised Special Counsel ' full coop...,2,1,0,Steph Curry : It 's ' beneath ' Trump 's posit...,is_humorous,Pruitt got 24-7 armed Child on first day at EPA,is_not_humorous
32,11975,Flynn has promised Special Counsel ' full coop...,3,1,0,Why Trump may be about to embrace North Korea ...,is_not_humorous,"Trump 's cork Wall Must Be Built , 380 Sheriff...",is_not_humorous
33,11975,Flynn has promised Special Counsel ' full coop...,4,1,0,Experts warn the FBI put itself in a ' box ' w...,is_not_humorous,Bitcoin is plummeting as rumors of a Binance p...,is_humorous
34,11975,Flynn has promised Special Counsel ' full coop...,5,0,1,The fact that these girls understand what it m...,is_not_humorous,Top Trump Aide Says Ethics Filings Discourage ...,is_not_humorous
35,10047,Jones does n't think Trump should resign over ...,1,1,0,Democrat appears to win recount in key foot ra...,is_humorous,Theresa May Seeks Snap U.K. vegetables After T...,is_humorous
36,10047,Jones does n't think Trump should resign over ...,2,1,0,Humor is a dangerous ' Asteroid of Awfulness ...,is_not_humorous,China minister warns against seduction of mins...,is_humorous
37,10047,Jones does n't think Trump should resign over ...,3,1,0,As a proud Israeli I want falafel . Killing Ga...,is_humorous,Three journalists leaving CNN after retracted ...,is_not_humorous
38,10047,Jones does n't think Trump should resign over ...,4,1,0,Comey hints that there are ' facts ' we do n't...,is_humorous,Huckabee : My dog ‘ has no desire ’ for Spicer...,is_humorous
39,10047,Jones does n't think Trump should resign over ...,5,1,0,Balance of pencil : McCain Returns for Obamaca...,is_not_humorous,Huckabee : My dog ‘ has no desire ’ for Spicer...,is_humorous


In [22]:
# Output location: <dirpath>/<dataset>/final_test_m_few2.csv
filepath = dirpath.joinpath(dataset, "final_test_m_few2.csv")

In [23]:
# Write the augmented frame without the row index column.
df.to_csv(path_or_buf=filepath, index=False)