In [1]:
import pandas as pd
import random
import numpy as np
import os

In [2]:
def select_answer_inputs(df,HitNeeded,AvoidDup = True):
    """Randomly group rows of ``df`` into HITs of five questions each.

    Rows are drawn without replacement, so a row ends up in at most one HIT.
    The global ``random`` generator is re-seeded with 0 on every call, which
    makes the grouping reproducible for a given input (and resets the global
    random state as a side effect).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per question, with string column names. Must contain a
        ``"video"`` column when ``AvoidDup`` is true. Rows are addressed by
        position, so any index is accepted.
    HitNeeded : int
        Number of HITs to build. Requires ``5 * HitNeeded <= len(df)``.
    AvoidDup : bool, default True
        When true, a HIT is redrawn until its five rows have five distinct
        ``"video"`` values.

    Returns
    -------
    dict
        Maps ``"<column><slot>"`` (slot = 1..5) to a list with one entry per
        HIT, i.e. the column-oriented layout ``pd.DataFrame.from_dict`` takes.

    Raises
    ------
    ValueError
        If ``df`` has fewer than ``5 * HitNeeded`` rows.
    RuntimeError
        If a duplicate-free HIT is still not found after the initial draw
        plus 11 redraws.
    """
    hit_len = 5
    rows_needed = HitNeeded * hit_len
    if rows_needed > len(df):
        # random.sample would raise the same exception type half-way through;
        # fail before doing any work and say why.
        raise ValueError(
            f"Need {rows_needed} rows to build {HitNeeded} HITs, but df has only {len(df)}")

    ## Select Questions
    random.seed(0)
    Keys = df.columns
    OutDic = {k+f"{i}":[] for k in Keys for i in range(1,hit_len+1)}
    # Positional row numbers that have not been placed in a HIT yet.
    ridxs = np.arange(len(df))

    for _ in range(HitNeeded):
        # selected_ids index into ridxs; selected_rows are positions in df.
        selected_ids = random.sample(range(len(ridxs)),hit_len)
        selected_rows = ridxs[selected_ids]
        if AvoidDup: # avoid duplicated videos in same HIT
            trytime = 0
            # .iloc keeps this check positional, consistent with the row
            # lookup below, so it also works when df's index is not 0..n-1.
            while len(set(df["video"].iloc[selected_rows])) < hit_len:
                if trytime > 10:
                    raise RuntimeError("Hard to find unduplicated videos")
                selected_ids = random.sample(range(len(ridxs)),hit_len)
                selected_rows = ridxs[selected_ids]
                trytime += 1
        for slot, row_pos in enumerate(selected_rows, start=1):
            crow = df.iloc[row_pos]
            for k in Keys:
                OutDic[k+f"{slot}"].append(crow[k])
        # Only the rows of the accepted draw are consumed.
        ridxs = np.delete(ridxs,selected_ids)
    return OutDic


def escapeHtml(unsafe):
    """Return ``unsafe`` with ``& < > " '`` replaced by HTML entities.

    The single quote becomes ``&#039;``. Every character is mapped exactly
    once, so an ``&`` that is already part of an entity in the input is
    escaped again (``"&amp;"`` -> ``"&amp;amp;"``).
    """
    entity_table = str.maketrans({
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        "\"": '&quot;',
        "'": "&#039;",
    })
    return unsafe.translate(entity_table)

In [3]:
# Upper bound on the number of HITs (rows) written to a single output CSV.
HitSize = 8

# all questions from phase 1
phase1_csv_path = "organize_stage1_annotaton/processed_reviewed_rm_del.csv"
df_all_expert_answer_anno = pd.read_csv(phase1_csv_path)

In [4]:
# Collect the (video, question, stdAnswer) triples that already appeared in
# earlier phase2 input files. When there are no previous inputs, used_anno is
# simply empty, so this cell can always be run.
def load_previous_inputs(input_dir):
    """Read every previous phase2 input CSV in ``input_dir`` into one DataFrame.

    Hidden entries (names starting with ".") and sub-directories are skipped,
    so things like ``.ipynb_checkpoints`` are not passed to ``read_csv``.
    Files are read in sorted name order.

    Parameters
    ----------
    input_dir : str
        Directory holding the earlier input CSV files.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all files. When the directory is missing or holds
        no readable entries, an empty frame that still has the
        ``video{i}`` / ``question{i}`` / ``stdAnswer{i}`` columns (i = 1..5).
    """
    expected_cols = [f"{name}{i}" for i in range(1,6)
                     for name in ("video","question","stdAnswer")]
    if not os.path.isdir(input_dir):
        # Say so explicitly: a wrong working directory would otherwise
        # silently mark every question as unused.
        print(f"No previous phase2 inputs found at {input_dir!r}; "
              "treating every question as unused")
        return pd.DataFrame(columns=expected_cols)

    frames = []
    for pre_name in sorted(os.listdir(input_dir)):
        pre_path = os.path.join(input_dir, pre_name)
        if pre_name.startswith(".") or not os.path.isfile(pre_path):
            continue
        frames.append(pd.read_csv(pre_path))

    if not frames:
        # pd.concat([]) raises, so return the empty layout instead.
        return pd.DataFrame(columns=expected_cols)
    return pd.concat(frames)


df_pre_concat = load_previous_inputs("previous_phase2_inputs")
# Each HIT row carries five questions; flatten all five slots into one set.
used_anno = {tuple(ite) for i in range(1,6) for ite in 
 df_pre_concat[[f"video{i}",f"question{i}",f"stdAnswer{i}"]].to_dict("split")["data"]}

In [5]:
# Keep only the phase-1 questions that are not in used_anno yet.
# A question counts as used when its (video_link, modified_question, answer)
# triple is in used_anno, with the answer taken from either modified_answer
# or correct_answer; the correct_answer check is to fix some early stage
# annotation issues.
# The mask is built explicitly rather than with DataFrame.apply(axis=1), which
# does not return a boolean Series when the frame has no rows.
is_unused = np.array(
    [(link, question, mod_answer) not in used_anno
     and (link, question, cor_answer) not in used_anno
     for link, question, mod_answer, cor_answer in zip(
         df_all_expert_answer_anno["video_link"],
         df_all_expert_answer_anno["modified_question"],
         df_all_expert_answer_anno["modified_answer"],
         df_all_expert_answer_anno["correct_answer"])],
    dtype=bool)

feasible_anno = df_all_expert_answer_anno[is_unused].reset_index(drop=True)

In [6]:
# Keep the five columns that go into the HIT inputs and rename them to the
# names the rest of the notebook uses (question / video / stdAnswer / stdEvidences).
column_renames = {
    "modified_question": "question",
    "video_link": "video",
    "modified_answer": "stdAnswer",
    "evidences": "stdEvidences",
}
feasible_anno = (
    feasible_anno[['domain','modified_question','video_link','modified_answer','evidences']]
    .rename(columns=column_renames)
)

# Only the question text is HTML-escaped; the same call for stdAnswer is disabled.
feasible_anno["question"] = feasible_anno["question"].apply(escapeHtml)
# feasible_anno["stdAnswer"] = feasible_anno["stdAnswer"].apply(escapeHtml)

In [7]:
# Split the remaining questions into the "Military" domain and everything
# else; each part gets a fresh 0..n-1 index.
mil_feasible_anno = (
    feasible_anno.loc[feasible_anno["domain"].eq("Military")]
    .reset_index(drop=True)
)
no_mil_feasible_anno = (
    feasible_anno.loc[feasible_anno["domain"].ne("Military")]
    .reset_index(drop=True)
)

In [8]:
# Summary of how many phase-1 questions exist, were already used, and remain.
total_count = len(df_all_expert_answer_anno)
left_count = len(feasible_anno)
print(f"{total_count} questions in total;"
      f"{total_count - left_count} questions used;"
      f"{left_count} questions left")

# Breakdown of the remaining questions by Military / non-Military domain.
domain_summary = "{} Mil questions,{} no-Mil;{} in all"
print(domain_summary.format(len(mil_feasible_anno), len(no_mil_feasible_anno), left_count))

916 questions in total;104 questions used;812 questions left
201 Mil questions,611 no-Mil;812 in all


In [9]:
# Build as many full five-question HITs as each domain allows; leftover
# questions (fewer than five) are not placed in any HIT.
mil_hit_count = len(mil_feasible_anno)//5
mil_hit_columns = select_answer_inputs(mil_feasible_anno, HitNeeded=mil_hit_count)
mil_hits = pd.DataFrame.from_dict(mil_hit_columns)

no_mil_hit_count = len(no_mil_feasible_anno)//5
no_mil_hit_columns = select_answer_inputs(no_mil_feasible_anno, HitNeeded=no_mil_hit_count)
no_mil_hits = pd.DataFrame.from_dict(no_mil_hit_columns)

In [10]:
def write_hit_batches(hits, batch_size, file_prefix):
    """Write ``hits`` to consecutive CSV files of at most ``batch_size`` rows.

    Files are named ``<file_prefix>_<i>.csv`` with ``i`` counting from 0 and
    are written without the index column.

    Parameters
    ----------
    hits : pandas.DataFrame
        One row per HIT.
    batch_size : int
        Maximum number of rows per file; must be positive unless ``hits`` is
        empty.
    file_prefix : str
        Path prefix of the output files.

    Returns
    -------
    list of str
        Names of the files written, in order. Empty when ``hits`` has no
        rows, in which case nothing is written.

    Raises
    ------
    ValueError
        If ``hits`` is non-empty and ``batch_size`` is not positive.
    """
    if len(hits) == 0:
        # No HITs at all: the old ceil-division by a zero batch size raised
        # ZeroDivisionError here.
        return []
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    written = []
    for i, start in enumerate(range(0, len(hits), batch_size)):
        out_name = f"{file_prefix}_{i}.csv"
        # iloc slicing clamps at the end, so the last file may be shorter.
        hits.iloc[start:start+batch_size].to_csv(out_name,index=False)
        written.append(out_name)
    return written


MilHitSize = min(len(mil_hits),HitSize)
mil_files = write_hit_batches(mil_hits, MilHitSize, "new_mil_answer_input")

NoMilHitSize = min(len(no_mil_hits),HitSize)
no_mil_files = write_hit_batches(no_mil_hits, NoMilHitSize, "new_no_mil_answer_input")
