In [2]:
import json
import jsonlines
# from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.stats.inter_rater

import random
random.seed(1)

import copy
import pickle
import os
import re

In [9]:
import sys 
sys.path.insert(1, '/ihome/xli/joh227/developer/ads/workspace/src/')

from utils import (
    load_json,
    write_json,
    load_jsonl,
    write_jsonl
)

In [None]:
curr_dir = '/ihome/xli/joh227/developer/ads/workspace/'
data_dir = curr_dir + 'data/'
mturk_data_dir = curr_dir + 'data/mturk_data/'

# 1. Load MTurk Data

In [51]:
# mturk_raw_data = pd.read_csv(mturk_data_dir + 'subset_0.5/mturk_raw_results.0.5.1.csv')
# mturk_raw_data.shape #, mturk_groundtruth.shape

file_lst = [
    mturk_data_dir + 'subset_0.5/mturk_raw_results.0.5.1.csv',
    mturk_data_dir + 'subset_0.5/mturk_raw_results.0.5.csv',
    mturk_data_dir + 'subset_1.2/mturk_raw_results.1.2.csv',
    mturk_data_dir + 'subset_1.2/mturk_raw_results.1.2.1.csv',
    mturk_data_dir + 'subset_1.1/mturk_raw_results.1.1.csv'
    # mturk_data_dir + 'subset_1.1/Batch_5263776_batch_results.csv'
]
mturk_raw_data = []
for file in file_lst:
    tmp_df = pd.read_csv(file)
    mturk_raw_data.append(tmp_df)
mturk_raw_data = pd.concat(mturk_raw_data)
mturk_raw_data.shape

(2500, 105)

In [29]:
ids_to_drop = []
# ids_to_drop = ["0/52390.jpg", "0/85300.jpg"]
mturk_raw_data = mturk_raw_data.loc[mturk_raw_data['Input.ads_id'].apply(lambda x: x not in ids_to_drop)].reset_index(drop=True)
mturk_raw_data.shape

(2500, 105)

# 2. Clearning and Score Scaling

## 2.1 Cleaning

In [32]:
# assigning unique_id
mturk_raw_data['unique_id'] = range(mturk_raw_data.shape[0])

- Drop some columns

In [None]:
col_to_drop = [
    'HITTypeId',
    'Title',
    'Description',
    'Keywords',
    'Reward',
    'CreationTime',
    'MaxAssignments',
    'RequesterAnnotation',
    'AssignmentDurationInSeconds',
    'AutoApprovalDelayInSeconds',
    'Expiration',
    'NumberOfSimilarHITs',
    'LifetimeInSeconds',
    # 'AssignmentId',
    'AssignmentStatus',
    'AcceptTime',
    'SubmitTime',
    'AutoApprovalTime',
    'ApprovalTime',
    'RejectionTime',
    'RequesterFeedback',
    'WorkTimeInSeconds',
    'LifetimeApprovalRate',
    'Last30DaysApprovalRate',
    'Last7DaysApprovalRate',
    'Input.example_1',
    'Input.example_2',
    'Input.image_url',
    'Approve',
    'Reject'
]
mturk_raw_data = mturk_raw_data.drop(columns=col_to_drop)
mturk_raw_data.shape

- Effective Score

In [35]:
mturk_raw_data['effectiveness_correct'] = mturk_raw_data.apply(lambda x: x['Answer.effectiveness.' + x['Input.action_correct']], axis = 1)
effectiveness_accuracy = sum(mturk_raw_data['effectiveness_correct']) / mturk_raw_data.shape[0]
effectiveness_accuracy

0.9688

In [36]:
sum(mturk_raw_data['effectiveness_correct']) 

2422

In [37]:
len(mturk_raw_data['WorkerId'].unique())

43

## 2.2 Score Scaling

In [38]:
tmp_columns = ['pc', 'atypicality', 'artistic']
answer_choices = {
    'disagree': -1, 
    'neutral': 0, 
    'agree': 1
}

def helper_normalize_lst(lst, bin = 7): # normalize to bin = 7 (i.e. same as -3~3)
    if max(lst) == min(lst): return [0] * len(lst)
    
    mean = np.mean(lst)
    std = np.std(lst)
    lst_norm = [(i - mean) / std for i in lst]
    lst_norm_max = max(lst_norm)
    lst_norm_min = min(lst_norm)
    lst_norm = [
        int((i - lst_norm_min) / (lst_norm_max - lst_norm_min) * bin - bin / 2)
        for i in lst_norm
    ]
    return lst_norm
    
def find_acc_value(tmp_all_values, method, normalize_bin = 0):
    if method == 'single_pos':
        acc_values = [] 
        for i in range(len(tmp_all_values[0].values)):
            item_value = 0
            for lst in tmp_all_values:
                lst = lst.values
                if lst[i] == 1:
                    item_value = 1
            acc_values.append(item_value)
        return acc_values
    
    tmp_sum_lst = []
    for i in range(len(tmp_all_values[0].values)):
        item_value = 0
        tmp_sum = sum([lst.values[i] for lst in tmp_all_values])
        tmp_sum_lst.append(tmp_sum)
    
    if normalize_bin:
        tmp_sum_lst = helper_normalize_lst(tmp_sum_lst, normalize_bin)

    acc_values = []
    for tmp_sum in tmp_sum_lst:
        if method == 'threshold_0' and tmp_sum > 0: item_value = 1
        if method == 'threshold_3' and tmp_sum == 3: item_value = 1
        if method == 'raw': item_value = tmp_sum
        acc_values.append(item_value)
    return acc_values

normalize_bin = 3
processed_data = []
for worker_id in mturk_raw_data.WorkerId.unique():
    tmp_df = copy.deepcopy(mturk_raw_data.query('WorkerId == "{}"'.format(worker_id)))
    for col in tmp_columns:
        # 1. value translation
        tmp_all_values = []
        for sub_question in ['_1', '_2', '_3']:
            tmp_answer_values = []
            for ans in answer_choices:
                tmp_col = 'Answer.' + col + sub_question + '.' + ans
                tmp_values = tmp_df[tmp_col].apply(lambda x: answer_choices[ans] if x else 0)
                tmp_answer_values.append(tmp_values)
            tmp_df['value_' + col + sub_question] = sum(tmp_answer_values)
            tmp_all_values.append(sum(tmp_answer_values))
        
        # 2. value accumulation
        tmp_df['acc_raw_' + col] = find_acc_value(tmp_all_values, method = 'raw')
        tmp_df['acc_th0_' + col] = find_acc_value(tmp_all_values, method = 'threshold_0')
        tmp_df['acc_th3_' + col] = find_acc_value(tmp_all_values, method = 'threshold_3')
        tmp_df['acc_single_' + col] = find_acc_value(tmp_all_values, method = 'single_pos')

        # 3. normalization 

        tmp_df['acc_norm_raw_' + col] = find_acc_value(tmp_all_values, method = 'raw', normalize_bin = normalize_bin)
        tmp_df['acc_norm_th0_' + col] = find_acc_value(tmp_all_values, method = 'threshold_0', normalize_bin = normalize_bin)
        tmp_df['acc_norm_th3_' + col] = find_acc_value(tmp_all_values, method = 'threshold_3', normalize_bin = normalize_bin)
    processed_data.append(tmp_df)

# mturk_raw_data.tail(2)
mturk_processed_df = pd.concat(processed_data).sort_values(by = 'unique_id')

effectiveness_map = {
    'a': 'a',
    'b': 'b',
    'c': 'c',
    'd': 'd',
    'e': 'e'
}
age_map = {
    'age_18': 18, 
    'age_18_24': 24, 
    'age_25_34': 30, 
    'age_35_44': 40, 
    'age_45_54': 50, 
    'age_55_64': 60, 
    'age_65': 70, 
    'unknown': -1, 
}
overall_map = {str(i): i for i in range(1, 6)}

def find_multi_choice(s, feature_map, feature):
    for e in feature_map:
        if s['Answer.{}.{}'.format(feature, e)]:
            return feature_map[e]
    return ''
mturk_processed_df['value_effectiveness'] = mturk_processed_df.apply(lambda x: find_multi_choice(x, effectiveness_map, 'effectiveness'), axis = 1)
mturk_processed_df['value_overall'] = mturk_processed_df.apply(lambda x: find_multi_choice(x, overall_map, 'overall'), axis = 1)

# 3. demo data
col_location = 'Answer.worker_location_live'
mturk_processed_df['demo_location'] = mturk_processed_df[col_location]
mturk_processed_df['demo_age'] = mturk_processed_df.apply(lambda x: find_multi_choice(x, age_map, 'age'), axis = 1)

mturk_processed_df.shape

(2500, 112)

In [39]:
all_overall_data = []
mturk_processed_df_overall = mturk_processed_df[['unique_id', 'WorkerId', 'value_overall']]
for worker_id in mturk_processed_df_overall.WorkerId.unique():
    tmp_df = copy.deepcopy(mturk_processed_df_overall.query('WorkerId == "{}"'.format(worker_id)))
    tmp_df['value_overall'] = helper_normalize_lst(tmp_df['value_overall'].values, bin = normalize_bin)
    all_overall_data.append(tmp_df)
all_overall_df = pd.concat(all_overall_data).sort_values(by = 'unique_id')

all_overall_df.head(10)

Unnamed: 0,unique_id,WorkerId,value_overall
0,0,A1DMXEJGJY02E1,0
1,1,A3RVHUY67SVXQV,0
2,2,A41APS6V2Z1FJ,0
3,3,A15X8ATAWSRXIF,0
4,4,A2XZLEY2RCF5VM,0
5,5,A5WWHKD82I8UE,0
6,6,A34YDGVZKRJ0LZ,1
7,7,A3B7TNVOISSZ2O,0
8,8,A3PUUVUDORJS8W,0
9,9,A3W0SCW5UYEB0F,0


In [40]:
mturk_processed_df['value_overall_norm'] = all_overall_df['value_overall']

In [41]:
mturk_processed_df['value_overall_norm'].value_counts()

value_overall_norm
 0    1750
 1     381
-1     369
Name: count, dtype: int64

## 2.3 (CAUTIOUS!!) Save to local file 

In [61]:
mturk_processed_df.to_csv(data_dir + 'mturk_cleaned_1201/mturk_processed_df.csv', index = False)

# 3. Calculate Agreements

In [42]:
def mturk_get_kappa_data(mturk_raw_data, column, data_maping = None):
    # unique_values = accumulate_data[column + '_agree'].unique()
    def count_kappa(data, data_maping):
        if data_maping is None:
            unique_values = data.unique()
        else:
            unique_values = list(data_maping.values())
        result = {k: 0 for k in unique_values}
        for v in data.values:
            # v = int(v)
            if data_maping is None:
                if v in result: result[v] += 1
            else:
                if data_maping[v] in result: result[data_maping[v]] += 1
        return result
    kappa_data = mturk_raw_data[[column, 'Input.ads_id']].groupby('Input.ads_id').agg(lambda data: count_kappa(data, data_maping = data_maping))[column].values
    kappa_data = pd.DataFrame(list(kappa_data)).fillna(0)
    return kappa_data

In [46]:
fleiss_value_lst = []
for col in sorted(list(mturk_processed_df.columns)):
    if 'acc_' in col:
        data_maping = None
    elif 'value_' in col:
        if 'effectiveness' in col or 'overall' in col:
            data_maping = None
        else:
            # data_maping = {
            #     0: 0,
            #     -1: 0,
            #     1: 1
            # }
            data_maping = None
    else:
        continue 
    tmp_kappa_data = mturk_get_kappa_data(mturk_processed_df, col, data_maping)
    # display(tmp_kappa_data)
    # fleiss_value = round(statsmodels.stats.inter_rater.fleiss_kappa(tmp_kappa_data, 'randolph'), 4)
    fleiss_value = round(statsmodels.stats.inter_rater.fleiss_kappa(tmp_kappa_data), 4)
    # print(col, fleiss_value)
    fleiss_value_lst.append({
        'col': col,
        'fleiss': fleiss_value,
        'norm': 'norm' in col,
        'acc': 'acc_' in col
    })
fleiss_value_df = pd.DataFrame(fleiss_value_lst)

  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)
  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)
  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)


In [47]:
fleiss_value_df.query('norm == True').sort_values(by = 'col')

Unnamed: 0,col,fleiss,norm,acc
0,acc_norm_raw_artistic,0.1394,True,True
1,acc_norm_raw_atypicality,0.2469,True,True
2,acc_norm_raw_pc,0.2408,True,True
3,acc_norm_th0_artistic,0.1228,True,True
4,acc_norm_th0_atypicality,0.097,True,True
5,acc_norm_th0_pc,0.0713,True,True
6,acc_norm_th3_artistic,,True,True
7,acc_norm_th3_atypicality,,True,True
8,acc_norm_th3_pc,,True,True
29,value_overall_norm,0.1053,True,False


In [48]:
fleiss_value_df.query('(norm == False) & (acc == True)').sort_values(by = 'col')

Unnamed: 0,col,fleiss,norm,acc
9,acc_raw_artistic,0.0639,False,True
10,acc_raw_atypicality,0.2113,False,True
11,acc_raw_pc,0.1509,False,True
12,acc_single_artistic,0.2284,False,True
13,acc_single_atypicality,0.3548,False,True
14,acc_single_pc,0.3668,False,True
15,acc_th0_artistic,0.0859,False,True
16,acc_th0_atypicality,0.1562,False,True
17,acc_th0_pc,0.1197,False,True
18,acc_th3_artistic,0.1066,False,True


# 4. Create Training Data

## 4.1 Intrinsic Data

In [62]:
mturk_processed_df = pd.read_csv(data_dir + 'mturk_cleaned_1201/mturk_processed_df.csv')
mturk_processed_df.shape

(2500, 113)

In [63]:
instrinsic_task_cols = {
    "value_overall_norm": "creativity",
    "acc_norm_raw_atypicality": "atypicality",
    "acc_norm_raw_pc": "originality",
}

In [64]:
# process single task 
tmp_task_col = 'value_overall_norm' 

def get_score_distribution(s, offset = 2):
    return [i + offset for i in s.values]

def get_value_counts(lst):
    unique_values, counts = np.unique(lst, return_counts=True)
    return dict(zip(unique_values, counts))

def get_majority(value_counts):
    return sorted(value_counts.items(), key = lambda x: x[1], reverse = True)[0][0] # majority label 

def get_majority_percentage(value_counts):
    majority_count = sorted(value_counts.items(), key = lambda x: x[1], reverse = True)[0][1] 
    return round(majority_count / sum(value_counts.values()), 2)


all_instrinsic_data = None
for tmp_task_col in instrinsic_task_cols:
    tmp_distribution_data = mturk_processed_df[['Input.ads_id', tmp_task_col]].groupby('Input.ads_id').agg(get_score_distribution)[tmp_task_col]
    tmp_value_count_data = tmp_distribution_data.apply(get_value_counts)
    tmp_average_data = tmp_distribution_data.apply(lambda x: np.mean(x))
    tmp_disagreement_data = tmp_distribution_data.apply(lambda x: np.var(x))
    tmp_majority_data = tmp_value_count_data.apply(get_majority)
    tmp_majority_percentage_data = tmp_value_count_data.apply(get_majority_percentage)

    tmp_instrinsic_data = pd.DataFrame({
        'ads_id': tmp_distribution_data.index,
        instrinsic_task_cols[tmp_task_col] + '_distribution': tmp_distribution_data.values,
        instrinsic_task_cols[tmp_task_col] + '_value_count': tmp_value_count_data.values,
        instrinsic_task_cols[tmp_task_col] + '_average': tmp_average_data.values ,
        instrinsic_task_cols[tmp_task_col] + '_disagreement': tmp_disagreement_data.values,
        instrinsic_task_cols[tmp_task_col] + '_majority': tmp_majority_data.values,
        instrinsic_task_cols[tmp_task_col] + '_majority_percentage': tmp_majority_percentage_data.values
    })
    if all_instrinsic_data is None:
        all_instrinsic_data = tmp_instrinsic_data
    else:
        all_instrinsic_data = all_instrinsic_data.merge(tmp_instrinsic_data, on = 'ads_id')
    # all_instrinsic_data.append(tmp_instrinsic_data)

In [65]:
all_instrinsic_data.creativity_majority_percentage.mean()

0.7172000000000002

In [66]:
all_instrinsic_data.atypicality_majority_percentage.mean()

0.6676000000000002

In [67]:
all_instrinsic_data.originality_majority_percentage.mean()

0.6464000000000002

In [None]:
#### IMPORTANT: saving data file ####

# all_instrinsic_data.to_csv(data_dir + 'mturk_cleaned_1201/modeling_instrinsic_data.csv', index = False)

## 4.2 Pairwise Data

In [None]:
pairwise_task_cols = {
    "creativity_average": "creativity",
    "atypicality_average": "atypicality",
    "originality_average": "originality",
}

def rebalance_df(df):
    df.sample(frac=1).reset_index(drop = True)

def arrange_sample(ads_id_1, ads_id_2, diff):
    if diff < 0:
        ads_id_1, ads_id_2 = ads_id_2, ads_id_1
        diff = (-1) * diff 
    if random.random() > 0.5:
        ads_id_1, ads_id_2 = ads_id_2, ads_id_1
        diff = (-1) * diff 
    return ads_id_1, ads_id_2, diff

# tmp_task_col = "creativity_average"
pairwise_threshold = 0.5
# all_instrinsic_data[["creativity_average", "ads_id"]]
all_pairwise_data = []
for tmp_task_col in pairwise_task_cols:
    tmp_average_data = all_instrinsic_data[tmp_task_col].values 
    tmp_ads_id = all_instrinsic_data["ads_id"].values 

    tmp_all_diff = []
    counter = 0
    for i in range(len(tmp_ads_id)):
        for j in range(i + 1, len(tmp_ads_id)):
            ads_id_1, ads_id_2 = tmp_ads_id[i], tmp_ads_id[j]
            diff = tmp_average_data[i] - tmp_average_data[j]
            ads_id_1, ads_id_2, diff = arrange_sample(ads_id_1, ads_id_2, diff)

            tmp_all_diff.append({
                'ads_pair': ads_id_1 + ', ' + ads_id_2,
                tmp_task_col + '_diff': diff
            })
            if abs(diff) > pairwise_threshold:
                counter += 1
    tmp_diff_df = pd.DataFrame(tmp_all_diff)
    print(tmp_task_col, counter, tmp_diff_df.query('{} > 0'.format(tmp_task_col + '_diff')).shape[0] / tmp_diff_df.shape[0])
    

    #### IMPORTANT: saving data file ####
    # tmp_diff_df.to_csv(data_dir + 'mturk_cleaned_1201/modeling_{}_diff.csv'.format(tmp_task_col), index = False)

creativity_average 938 0.46464646464646464
atypicality_average 2631 0.49252525252525253
originality_average 2708 0.4862626262626263
