In [23]:
import numpy as np
import pandas as pd

QuestionNumber = 5  # question per page

# Functions

In [24]:
def arrange_answers(answers, question_number=None):
    """Group the raw form fields of one assignment by question.

    Parameters
    ----------
    answers : dict
        Maps form-field names to the submitted values of one assignment.
        Fields whose name contains "less-2-checkbox" are skipped; every
        other value must be a string (it is stripped and lower-cased).
    question_number : int, optional
        Number of questions on the page. Defaults to the notebook-level
        ``QuestionNumber``.

    Returns
    -------
    (list of list of str, str)
        ``arranged[i]`` holds the normalized answers given to question
        ``i + 1`` (fields named ``answer<i+1>-...`` or
        ``in-answer-box<i+1>...``), without duplicates and in first-seen
        order. The second item is the normalized "comments" field, or ""
        when there is none.

    Notes
    -----
    Field names are matched by substring, so with ten or more questions
    "in-answer-box1" would also match "in-answer-box10".
    """
    if question_number is None:
        question_number = QuestionNumber
    arranged_answers = [[] for _ in range(question_number)]
    comment = ""
    for name, raw_value in answers.items():
        if "less-2-checkbox" in name:
            continue
        answer = raw_value.strip().lower()    # remove space & convert to lower
        if name == "comments":                # does not depend on the question index
            comment = answer
        for i in range(question_number):
            number = str(i + 1)
            if "answer" + number + "-" in name or "in-answer-box" + number in name:
                arranged_answers[i].append(answer)    # belongs to question i
    # dict.fromkeys drops repeated answers while keeping the answer order
    return [list(dict.fromkeys(per_question)) for per_question in arranged_answers], comment


In [25]:
# Functions related to grading

def combine_score(precision,recall):
    """Return the combined score ``recall + 0.67 * precision``."""
    return recall + 0.67 * precision


def _score_against_peers(anss, active):
    """Score each active answer against the pooled items of the other active answers.

    ``active[i]`` tells whether answer ``i`` takes part in this round; every
    active answer must be non-empty. Inactive answers get 0 for all three
    values. Returns ``(precisions, recalls, scores)``.
    """
    precis, recalls, scores = [], [], []
    for ind, ans in enumerate(anss):
        if not active[ind]:
            precis.append(0)
            recalls.append(0)
            scores.append(0)
            continue
        peer_items = {ite for i, other in enumerate(anss)
                      if i != ind and active[i] for ite in other}
        preci = sum(ite in peer_items for ite in ans) / len(ans)
        # With no peer items there is nothing to recall; define recall as 0
        # instead of dividing by zero.
        recall = sum(ite in ans for ite in peer_items) / len(peer_items) if peer_items else 0.0
        precis.append(preci)
        recalls.append(recall)
        scores.append(combine_score(preci, recall))
    return precis, recalls, scores


def _standard_answer_stats(StandAnswer, answer_items):
    """Return ``[precision, recall, score]`` of the standard answer.

    Precision is 1.0 when the standard answer is one of ``answer_items`` and
    0.0 otherwise; recall is the share of ``answer_items`` equal to the
    standard answer. All three values are NaN when ``answer_items`` is empty.
    """
    if not answer_items:
        nan = float("nan")
        return [nan, nan, combine_score(nan, nan)]
    stdpreci = 1.0 * (StandAnswer in answer_items)
    stdrecall = sum(ite == StandAnswer for ite in answer_items) / len(answer_items)
    return [stdpreci, stdrecall, combine_score(stdpreci, stdrecall)]


def precision_recall_score(anss,StandAnswer=None,RemoveZero=False):
    """
    Calculate Precision & Recall & score for answers of one question

    Each answer is compared with the union of the items of all *other*
    answers: precision is the share of its items found in that union, recall
    is the share of the union it covers.

    Input:
    anss: All answers for one question (one list of items per worker)
    StandAnswer: Standard answer; If 'None', then do not calculate the values of standard answer
    RemoveZero: Using RemoveZero method or not. If True, answers that score 0
                in a first round are ignored (as answers and as peers) and the
                remaining answers are scored again.

    Output:
    (precisions, recalls, scores), one value per answer, in input order.
    When StandAnswer is given, a fourth item [std precision, std recall,
    std score] is always appended (NaN values when no answer item is left
    to compare with).

    Special cases:
    - a single answer gets full marks if it is non-empty, 0 otherwise;
    - an empty answer always gets 0;
    - a non-empty answer whose peers are all empty gets 0.
    """
    if len(anss) == 1:          # only one answer for this question
        if len(anss[0]) != 0:   # this one answer is not empty -> full score
            precis, recalls, scores = [1], [1], [combine_score(1, 1)]
        else:                   # this one answer is empty
            precis, recalls, scores = [0], [0], [0]
        first_round_scores = scores
    else:
        precis, recalls, scores = _score_against_peers(anss, [len(ans) != 0 for ans in anss])
        first_round_scores = scores
        if RemoveZero:          # second round without the zero score answers
            precis, recalls, scores = _score_against_peers(
                anss, [score != 0 for score in first_round_scores])

    if StandAnswer is None:
        return precis, recalls, scores

    if RemoveZero:              # only answers that survived the first round count
        pooled_items = {ite for ans, score in zip(anss, first_round_scores)
                        if score != 0 for ite in ans}
    else:
        pooled_items = {ite for ans in anss for ite in ans}
    return precis, recalls, scores, _standard_answer_stats(StandAnswer, pooled_items)

In [26]:
def CollectAnswerInformation_Write2file(original_result,remove_zero=False,filename="answers_information.txt"):
    '''
    Scan the batch result table;
    calculate Precision & Recall & Score for each answer;
    generate the corresponding txt report.

    Input:
    original_result: DataFrame of the batch results. Reads the columns "HITId",
                     "Answer.taskAnswers", "Input.video<k>_id", "Input.question<k>"
                     and "Input.label<k>" for k = 1..QuestionNumber. Consecutive
                     rows with the same HITId are treated as one HIT.
    remove_zero: passed to precision_recall_score as RemoveZero
    filename: path of the txt report (overwritten)

    Output:
    answers_collect, preci_matrix, recall_matrix, score_matrix, size_matrix, std_matrix
    Each is a list with one entry per HIT, in row order. answers_collect[h][q] is the
    list of answers to question q; the precision/recall/score/size entries are
    [answers amount * question amount] arrays; std_matrix entries are
    [3 * question amount] arrays (std precision, std recall, std score).
    '''
    answers_collect = []   # collect answers for all HITs, all answers
    size_matrix = []       # collect answer size for all HITs, all answers
    preci_matrix = []      # precision for all HITs, all answers
    recall_matrix = []     # recall for all HITs, all answers
    score_matrix = []      # scores for all HITs, all answers
    std_matrix = []        # Precision & Recall & Score for the standard answer

    # Split the rows into runs of consecutive rows sharing one HITId. Grouping
    # first guarantees that the last HIT is processed even when it starts on
    # the final row.
    hit_groups = []
    for _, row in original_result.iterrows():
        if hit_groups and row["HITId"] == hit_groups[-1][0]["HITId"]:
            hit_groups[-1].append(row)
        else:
            hit_groups.append([row])

    # utf8 so that non-ASCII questions/answers can always be written
    with open(filename,"w",encoding="utf8") as f:
        f.write(
            "# FORMAT:\n\n"+
            "VideoId :\n"+
            "Question:\n"+
            "Precision Recall Score(0.67*precision+recall) Standard Answer:\n\n"+
            "Precision Recall Score(0.67*precision+recall) [MTURK_Answer1]\n"+
            "Precision Recall Score(0.67*precision+recall) [MTURK_Answer2]\n"+
            "...\n============================================\n\n")

        for hit_rows in hit_groups:
            # question information is taken from the first row of the HIT
            first_row = hit_rows[0]
            video_ids = [first_row["Input.video"+str(wid)+"_id"] for wid in range(1,QuestionNumber+1)]
            video_questions = [first_row["Input.question"+str(wid)] for wid in range(1,QuestionNumber+1)]
            std_answers = [first_row["Input.label"+str(wid+1)] for wid in range(QuestionNumber)]
            # Worker answers are stripped and lower-cased by arrange_answers, so the
            # standard answers are normalized the same way before comparison.
            std_for_scoring = [label.strip().lower() if isinstance(label, str) else label
                               for label in std_answers]

            answers = [[] for _ in range(QuestionNumber)]   # answers for one HIT
            for row in hit_rows:
                # NOTE(review): eval() executes whatever the CSV cell contains; if the
                # column is plain JSON, json.loads would be the safe replacement.
                line_answer,comment = arrange_answers(eval(row["Answer.taskAnswers"])[0])
                for i in range(QuestionNumber):
                    answers[i].append(line_answer[i])
            answers_collect.append(answers)

            # generate precision, recall, score, size matrix for this HIT
            answer_amount = len(hit_rows)
            hit_preci = np.zeros((answer_amount,QuestionNumber))   # [answers amount * question amount]
            hit_recall = np.zeros((answer_amount,QuestionNumber))  # [answers amount * question amount]
            hit_score = np.zeros((answer_amount,QuestionNumber))   # [answers amount * question amount]
            hit_size = np.zeros((answer_amount,QuestionNumber))    # [answers amount * question amount]
            hit_stand = np.zeros((3,QuestionNumber))               # [std preci,std recall,std score]
            for qid in range(QuestionNumber):
                result = precision_recall_score(answers[qid],std_for_scoring[qid],RemoveZero=remove_zero)
                hit_preci[:,qid],hit_recall[:,qid],hit_score[:,qid] = result[:3]
                # tolerate a scorer that leaves out the standard-answer statistics
                hit_stand[:,qid] = result[3] if len(result) > 3 else np.nan
                hit_size[:,qid] = [len(lenans) for lenans in answers[qid]]
            score_matrix.append(hit_score)
            recall_matrix.append(hit_recall)
            preci_matrix.append(hit_preci)
            size_matrix.append(hit_size)
            std_matrix.append(hit_stand)

            # write arranged answers into txt file
            for fwid in range(QuestionNumber):
                f.write("https://www.youtube.com/watch?v=%s\n%s\n\n"%(video_ids[fwid],video_questions[fwid]))  # Question Information
                f.write("preci  recall score\n")
                f.write("%.3f  %.3f  %.3f  %s\n\n"%(hit_stand[0,fwid],hit_stand[1,fwid],         # Standard Answer Information
                                                     hit_stand[2,fwid],std_answers[fwid]))
                for canid in range(len(answers[fwid])):                                          # Workers Answers Information
                    f.write('%.3f  %.3f  %.3f  %s\n' %(hit_preci[canid,fwid],hit_recall[canid,fwid],
                                                       hit_score[canid,fwid],str(answers[fwid][canid])))
                f.write("============================================\n\n")
    return answers_collect,preci_matrix,recall_matrix,score_matrix,size_matrix,std_matrix

# Calculate Answers Information

In [28]:
original_result = pd.read_csv("Batch_4282865_batch_results.csv")

# First pass: zero-score answers stay in the peer pool ("not remove zero" method)
(answers_collect, preci_matrix, recall_matrix,
 score_matrix, size_matrix, std_matrix) = CollectAnswerInformation_Write2file(
    original_result,
    remove_zero=False,
    filename="answers_information.txt",
)

# Second pass: zero-score answers are dropped before re-scoring ("remove zero" method)
(answers_collect2, preci_matrix2, recall_matrix2,
 score_matrix2, size_matrix2, std_matrix2) = CollectAnswerInformation_Write2file(
    original_result,
    remove_zero=True,
    filename="answers_remove_zero_information.txt",
)

# Answers Statistics & Plots

In [7]:
import matplotlib.pyplot as plt
%matplotlib notebook

## Answer Level

In [8]:
## preci-recall plot
# Flatten every per-HIT matrix into one flat list with one value per (answer, question).
# Suffix 1 = "not remove zero" results, suffix 2 = "remove zero" results.
all_preci1 = [ite3 for ite1 in preci_matrix for ite2 in ite1 for ite3 in ite2]
all_recall1 = [ite3 for ite1 in recall_matrix for ite2 in ite1 for ite3 in ite2]
all_size1 = [ite3 for ite1 in size_matrix for ite2 in ite1 for ite3 in ite2]

all_preci2 = [ite3 for ite1 in preci_matrix2 for ite2 in ite1 for ite3 in ite2]
all_recall2 = [ite3 for ite1 in recall_matrix2 for ite2 in ite1 for ite3 in ite2]
all_size2 = [ite3 for ite1 in size_matrix2 for ite2 in ite1 for ite3 in ite2]

# Keep only the entries with non-zero recall, because the y axis is log(recall).
# Every series of one method is filtered with that method's own recall list.
X1 = [all_preci1[i] for i in range(len(all_recall1)) if all_recall1[i]!=0]
X2 = [all_preci2[i] for i in range(len(all_recall2)) if all_recall2[i]!=0]
Y1 = np.log([i for i in all_recall1 if i!=0])
Y2 = np.log([i for i in all_recall2 if i!=0])
S1 = [all_size1[i] for i in range(len(all_recall1)) if all_recall1[i]!=0]
S2 = [all_size2[i] for i in range(len(all_recall2)) if all_recall2[i]!=0]

# marker area grows with the number of items in the answer
plt.scatter(X2,Y2,s=7*np.power(1.4,S2),alpha=0.4,label="Remove zero")
# Uncomment to overlay the "not remove zero" results:
# plt.scatter(X1,Y1,s=7*np.power(1.4,S1),alpha=0.6,label="Do not remove zero")
plt.grid()
plt.title("Precision-Log(no-zero-Recall)")
plt.legend()
plt.xlabel("Precision",fontsize=12)
plt.ylabel("log(Recall)",fontsize=12)
plt.show()

<IPython.core.display.Javascript object>

## Instance Level

In [10]:
# Instance level: one point per assignment (one row of the batch results).
# Each per-HIT matrix is [answers amount * question amount]; averaging over axis=1
# gives one mean value per answer row, taken over the questions of the page.
instance_preci_mean_matrix = [np.mean(ite,axis=1) for ite in preci_matrix2]
instance_recall_mean_matrix = [np.mean(ite,axis=1) for ite in recall_matrix2]
instance_size_mean_matrix = [np.mean(ite,axis=1) for ite in size_matrix2]
# Flatten the per-HIT vectors into one list over all HITs (HIT order, then answer order).
all_instance_preci_mean = [ite2 for ite1 in instance_preci_mean_matrix for ite2 in ite1]
all_instance_recall_mean = [ite2 for ite1 in instance_recall_mean_matrix for ite2 in ite1]
all_instance_size_mean = [ite2 for ite1 in instance_size_mean_matrix for ite2 in ite1]

X = all_instance_preci_mean
Y = all_instance_recall_mean
# marker area grows with the mean answer size of the instance
plt.scatter(X,Y,s=20*np.power(1.4,all_instance_size_mean),alpha=0.4,label="Remove zero")
plt.xticks(np.arange(0,1.05,0.05),fontsize=12)
# 20 evenly spaced y ticks between min(Y) and max(Y)+0.05
plt.yticks(np.arange(min(Y),max(Y)+0.05,(max(Y)+0.05-min(Y))/20),fontsize=12)
plt.grid()

# potential dividing line
xps = np.arange(-0.1,1,0.01)
# potential standard for first batch
bt1ps1 = -0.5*xps+0.2
bt1ps2 = -0.6*xps+0.315
# potential standard for second batch
# (each line is recall = -w*precision + t, i.e. the boundary of recall + w*precision >= t)
bt2ps1 = -0.67*xps+0.215
bt2ps2 = -0.66*xps+0.315
# plt.plot(xps,bt1ps1,c="grey",linestyle="--",alpha=0.65,label="lower standard for first batch")
# plt.plot(xps,bt1ps2,c="grey",linestyle="-.",alpha=0.65,label="upper standard for first batch")
plt.plot(xps,bt2ps1,c="k",linestyle="--",alpha=0.8,label="recall + 0.67*preci >= 0.215")
plt.plot(xps,bt2ps2,c="k",linestyle="-.",alpha=0.8,label="recall + 0.66*preci >= 0.315")

# fit the view to the data with a small margin
plt.ylim(min(Y)-0.03,max(Y)+0.03)
plt.xlim(min(X)-0.03,max(X)+0.03)

plt.title("Instance Precission-Recall")
# plt.legend(bbox_to_anchor=(0.35,0.98))
plt.legend()
plt.xlabel("Precision",fontsize=12)
plt.ylabel("Recall",fontsize=12)
plt.show()

<IPython.core.display.Javascript object>

## Worker Level

In [11]:
# Map every worker id to the list of row indices of that worker's assignments
worker_answer_dict = dict()
for ind in range(len(original_result["WorkerId"])):
    wid = original_result["WorkerId"][ind]
    # setdefault creates the list the first time a worker is seen
    worker_answer_dict.setdefault(wid, []).append(ind)

In [12]:
# How many assignments each worker submitted: mean, spread and histogram
worker_question_amount = list(map(len, worker_answer_dict.values()))
amount_mean = np.mean(worker_question_amount)
amount_std = np.std(worker_question_amount)
print("Worker question amount mean: %.1f questions/worker" % amount_mean)
print("Worker question amount std_error: %.1f questions/worker" % amount_std)
plt.hist(worker_question_amount, 11, alpha=0.85, edgecolor="white")
plt.title("Number of workers' questions")
plt.show()

Worker question amount mean: 1.3 questions/worker
Worker question amount std_error: 0.4 questions/worker


<IPython.core.display.Javascript object>

In [13]:
# worker's precision - recall
# One point per worker: the mean of the instance-level values over the rows
# listed for that worker in worker_answer_dict.
workers_precision = []
workers_recall = []
workers_len = []
for worker_answers in worker_answer_dict.values():
    # worker_answers is a list of indices into the flattened instance-level lists
    workers_precision.append(np.mean(np.array(all_instance_preci_mean)[worker_answers]))
    workers_recall.append(np.mean(np.array(all_instance_recall_mean)[worker_answers]))
    workers_len.append(np.mean(np.array(all_instance_size_mean)[worker_answers]))

X = workers_precision
Y = workers_recall
# marker area grows with the worker's mean answer size; colour encodes
# worker_question_amount, which is defined in an earlier cell
plt.scatter(X,Y,s=20*np.power(1.4,workers_len),alpha=0.5,label="Remove zero",c=worker_question_amount,cmap="seismic")
plt.xticks(np.arange(0,1.05,0.05),fontsize=12)
# 20 evenly spaced y ticks between min(Y) and max(Y)+0.05
plt.yticks(np.arange(min(Y),max(Y)+0.05,(max(Y)+0.05-min(Y))/20),fontsize=12)
plt.colorbar()
plt.grid()

# potential dividing line
xps = np.arange(-0.1,1,0.01)
# potential standard for first batch
# bt1ps1 = -0.5*xps+0.2
# bt1ps2 = -0.5*xps+0.3
# potential standard for second batch
# NOTE: this rebinds bt2ps1/bt2ps2 with worker-level coefficients
bt2ps1 = -0.65*xps+0.25
bt2ps2 = -0.55*xps+0.32
# plt.plot(xps,bt1ps1,c="grey",linestyle="--",alpha=0.8,label="lower standard for first batch")
# plt.plot(xps,bt1ps2,c="grey",linestyle="-.",alpha=0.8,label="upper standard for first batch")
plt.plot(xps,bt2ps1,c="k",linestyle="--",alpha=0.75,label="recall + 0.65*preci >= 0.25")
plt.plot(xps,bt2ps2,c="k",linestyle="-.",alpha=0.75,label="recall + 0.55*preci >= 0.32")

# fit the view to the data with a small margin
plt.ylim(min(Y)-0.03,max(Y)+0.03)
plt.xlim(min(X)-0.03,max(X)+0.03)

plt.title("Worker Precission-Recall")
# plt.legend(loc="upper left")
# plt.legend(bbox_to_anchor=(0.35,0.98))
plt.legend()
plt.xlabel("Precision",fontsize=12)
plt.ylabel("Recall",fontsize=12)

plt.show()

<IPython.core.display.Javascript object>

## Time Level

In [14]:
## time statistics
# Working time of each assignment, in minutes, spread evenly over its questions
minutes_per_question = original_result["WorkTimeInSeconds"]/60/QuestionNumber
print("Work time mean: %.1f min/question" % (np.mean(minutes_per_question)))
print("Work time std_error: %.1f min/question" % (np.std(minutes_per_question)))
plt.hist(minutes_per_question, 40, alpha=0.85, edgecolor="grey")
plt.xticks(range(0, 23, 2))
plt.xlabel("Min")
plt.title("Work Time for one question")
plt.show()

Work time mean: 4.2 min/question
Work time std_error: 4.8 min/question


<IPython.core.display.Javascript object>

## Question Level

In [15]:
# question size statistics
# axis=0 averages over the answers of a HIT: one mean answer size per question
question_words_mean_matrix = [np.mean(ite,axis=0) for ite in size_matrix2]
all_question_words_mean = []
for hit_question_means in question_words_mean_matrix:
    all_question_words_mean.extend(hit_question_means)
words_mean = np.mean(all_question_words_mean)
words_std = np.std(all_question_words_mean)
print("question words mean: %.1f words/question" % words_mean)
print("question words std_error: %.1f words/quesiton" % words_std)
plt.hist(all_question_words_mean, 6, alpha=0.85, edgecolor="white")
plt.xticks(range(0, 6, 1))
plt.title("Average words amount for one question")
plt.show()

question words mean: 2.4 words/question
question words std_error: 0.6 words/quesiton


<IPython.core.display.Javascript object>

In [16]:
# question precision statistics
# axis=0 averages over the answers of a HIT: one mean precision per question
question_preci_mean_matrix = [np.mean(ite,axis=0) for ite in preci_matrix2]
all_question_preci_mean = []
for hit_question_means in question_preci_mean_matrix:
    all_question_preci_mean.extend(hit_question_means)
preci_mean = np.mean(all_question_preci_mean)
preci_std = np.std(all_question_preci_mean)
print("question preci mean: %.1f /question" % preci_mean)
print("question preci std_error: %.1f /quesiton" % preci_std)
plt.hist(all_question_preci_mean, 12, alpha=0.85, edgecolor="white")
plt.title("Average precision for one question")
plt.show()

question preci mean: 0.4 /question
question preci std_error: 0.2 /quesiton


<IPython.core.display.Javascript object>

In [17]:
# question recall statistics
# axis=0 averages over the answers of a HIT: one mean recall per question
question_recall_mean_matrix = [np.mean(ite,axis=0) for ite in recall_matrix2]
all_question_recall_mean = []
for hit_question_means in question_recall_mean_matrix:
    all_question_recall_mean.extend(hit_question_means)
recall_mean = np.mean(all_question_recall_mean)
recall_std = np.std(all_question_recall_mean)
print("question recall mean: %.1f /question" % recall_mean)
print("question recall std_error: %.1f /quesiton" % recall_std)
plt.hist(all_question_recall_mean, 17, alpha=0.85, edgecolor="white")
plt.title("Average recall for one question")
plt.show()

question recall mean: 0.1 /question
question recall std_error: 0.1 /quesiton


<IPython.core.display.Javascript object>

## Standard Answer Level

In [18]:
# Row 0 of every standard-answer matrix holds the standard answers' precision.
# The range test also drops NaN entries, which compare false to everything.
all_std_answer_preci = []
for hit_std in std_matrix2:
    for std_preci in hit_std[0,:]:
        if 0 <= std_preci <= 1:
            all_std_answer_preci.append(std_preci)
print("%d/%d standard answers appear in workers' answers"
      % (sum(all_std_answer_preci), len(all_std_answer_preci)))

2/10 standard answers appear in workers' answers


# Generate Review Files

## Review Result txt

Generate a .txt review result for researchers to check manually. The final organized answers will be based on this file. There are two versions of the txt review result: 'Instance-Level Only', which is based only on the instance-level statistics, and 'Worker-Level & Instance-Level', which also uses the worker-level statistics.

### Instance-Level Only

In [31]:
# Boolean masks, one entry per assignment:
# below the lower line -> reject, at or above the upper line -> approve
instance_recall = np.array(all_instance_recall_mean)
instance_preci = np.array(all_instance_preci_mean)
lower_approve = instance_recall + 0.67*instance_preci >= 0.215
upper_approve = instance_recall + 0.66*instance_preci >= 0.315

In [35]:
# Write one line per assignment, grouped under its HIT, with the
# instance-level verdict (Appr / Midd / Reje) in front of the raw answers.
last_HIT = None
with open("Review_result_instance.txt","w",encoding="utf8") as f:
    f.write("Appr: Recall + 0.66*Precision >= 0.315\n"+
           "Reje: Recall + 0.67*Precision < 0.215\n"+
           "Midd: In the middle\n\n")
    for ite in range(len(original_result)):
        curr_hit = original_result.iloc[ite]["HITId"]
        if (last_HIT is None) or (last_HIT != curr_hit):
            # HIT header; every HIT but the first is preceded by two blank lines
            hit_separator = "" if last_HIT is None else "\n\n"
            f.write(hit_separator + "HITId: %s\n"%curr_hit)
            # print standard answers
            f.write("Standard answers: ")
            row = original_result.iloc[ite]
            video_ids = [row["Input.video"+str(wid)+"_id"] for wid in range(1,QuestionNumber+1)]
            video_questions = [row["Input.question"+str(wid)] for wid in range(1,QuestionNumber+1)]
            std_answers = [row["Input.label"+str(wid+1)] for wid in range(QuestionNumber)]
            f.write("%s\n"%str(std_answers))
        if upper_approve[ite]:        # approve
            verdict = "Appr   "
        elif lower_approve[ite]:      # middle
            verdict = "Midd   "
        else:                         # reject
            verdict = "Reje   "
        f.write(verdict)
        f.write(str(original_result.iloc[ite]["Answer.taskAnswers"]))
        f.write("\n")
        last_HIT = curr_hit

### Worker-Level & Instance-Level

In [36]:
# Instance-level approval mask (one entry per assignment)
instance_recall = np.array(all_instance_recall_mean)
instance_preci = np.array(all_instance_preci_mean)
upper_approve = instance_recall + 0.66*instance_preci >= 0.315
# Worker-level rejection mask, in the key order of worker_answer_dict
rej_workers = np.array(workers_recall) + 0.65 * np.array(workers_precision) < 0.2
rej_workers_id = [worker_id for i, worker_id in enumerate(worker_answer_dict.keys()) if rej_workers[i]]

In [37]:
# Write one line per assignment, grouped under its HIT. Rejection is decided
# per worker, approval per instance; everything else is "Midd".
last_HIT = None
with open("Review_result_instance_worker.txt","w",encoding="utf8") as f:
    f.write("Appr: (Instance-Level) Recall + 0.66*Precision >= 0.315\n"+
           "Reje: (Worker-Level) Recall + 0.65*Precision < 0.2\n"+
           "Midd: In the middle\n\n")
    for ite in range(len(original_result)):
        curr_hit = original_result.iloc[ite]["HITId"]
        if (last_HIT is None) or (last_HIT != curr_hit):
            # HIT header; every HIT but the first is preceded by two blank lines
            hit_separator = "" if last_HIT is None else "\n\n"
            f.write(hit_separator + "HITId: %s\n"%curr_hit)
            # print standard answers
            f.write("Standard answers: ")
            row = original_result.iloc[ite]
            video_ids = [row["Input.video"+str(wid)+"_id"] for wid in range(1,QuestionNumber+1)]
            video_questions = [row["Input.question"+str(wid)] for wid in range(1,QuestionNumber+1)]
            std_answers = [row["Input.label"+str(wid+1)] for wid in range(QuestionNumber)]
            f.write("%s\n"%str(std_answers))
        if original_result.iloc[ite]["WorkerId"] in rej_workers_id:      # reject
            verdict = "Reje   "
        elif upper_approve[ite]:                                         # approve
            verdict = "Appr   "
        else:                                                            # middle
            verdict = "Midd   "
        f.write(verdict)
        f.write(str(original_result.iloc[ite]["Answer.taskAnswers"]))
        f.write("\n")
        last_HIT = curr_hit

## Mturk Review CSV

Generate the CSV file that will be uploaded to MTurk to approve or reject workers' answers. Researchers can also check this CSV file manually.

### Instance-Level Only

In [38]:
# Instance-level decision: approve when recall + 0.67*precision reaches 0.215
instance_sores = np.array(all_instance_recall_mean)+0.67*np.array(all_instance_preci_mean)
approval_sign = instance_sores >= 0.215

# feedback text stored in the "Reject" column of every rejected assignment
rejection_feedback = ("Thank you for your answer! " +
                      "But We are sorry to say that this answer is rejected " +
                      "because it is inaccurate or incomprehensible." +
                      " Please read the instructions at first.")

# an approved row gets "x" / "", a rejected row gets "" / feedback text
approve_marks = ["x" if approved else "" for approved in approval_sign]
reject_marks = ["" if approved else rejection_feedback for approved in approval_sign]

original_result["Approve"] = approve_marks
original_result["Reject"] = reject_marks

original_result.to_csv("MturkReview.csv",index=False)