In [65]:
import json
import jsonlines
# from datetime import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import random
random.seed(1)
from tqdm import tqdm 

import math 
import statsmodels.stats.inter_rater
from scipy import stats
from sklearn.metrics import accuracy_score

import copy
import pickle
import os
import re

import sys 
sys.path.insert(1, '../src/')
from utils import (
    load_json,
    write_json,
    load_jsonl,
    write_jsonl,
    openai_call
)
# Dataset folders. Paths elsewhere are built by plain string concatenation,
# so every folder keeps its trailing slash.
data_dir = '../../pitt_ads/'
mturk_data_dir = data_dir + 'mturk_data/'
annotation_dir = data_dir + 'annotations_images/image/'

# 1 Load Data

In [93]:
# Sampled ads (later used to build `test_ads_id`). The file is opened in a
# `with` block so the handle is closed as soon as the pickle has been read.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open(mturk_data_dir + 'sampled_ads_id_200.pkl', 'rb') as sampled_ids_file:
    sampled_ads_id_200 = pickle.load(sampled_ids_file)

# Atypicality annotations; rows without an image URL are dropped because the
# ads_id is derived from that column further down.
atypicality_df = pd.read_csv(data_dir + 'atypicality_annotations_final_csv.csv').dropna(subset = ['image_url'])

In [94]:
# atypicality_df.ads_id.nunique()

In [68]:
# Action annotations, also available as annotation_data['action'] below.
qa_action_json = load_json(annotation_dir + 'QA_Action.json')
# qa_action_json.keys()

# One entry per annotation type, each loaded from its JSON file in
# `annotation_dir` (insertion order matches the listing below).
annotation_data = {
    annotation_type: load_json(annotation_dir + file_name)
    for annotation_type, file_name in (
        ('sentiment', 'Sentiments.json'),
        ('symbol', 'Symbols.json'),
        ('topic', 'Topics.json'),
        ('action_reason', 'QA_Combined_Action_Reason.json'),
        ('action', 'QA_Action.json'),
        ('reason', 'QA_Reason.json'),
        ('slogan', 'Slogans.json'),
    )
}

def parse_list(list_file):
    """Parse a numbered list file into an ``{index: label}`` dictionary.

    Every non-blank line is expected to look like ``"<index>. <label>"``.
    The line is split on the first ``". "`` only, so a label that itself
    contains ``". "`` is kept whole instead of raising.

    Args:
        list_file: Path of the text file; it is read with the
            ``unicode_escape`` codec.

    Returns:
        dict mapping the text before the first ``". "`` to the rest of the
        line. The rest is stored verbatim, including any trailing newline,
        so callers should strip it where that matters.

    Raises:
        ValueError: If a non-blank line contains no ``". "`` separator.
    """
    data_dict = {}
    with open(list_file, 'r', encoding='unicode_escape') as f:
        for line in f:
            if line.strip() == '':
                continue
            # Split once: everything after the first '. ' belongs to the label.
            idx, separator, label = line.partition('. ')
            if not separator:
                raise ValueError(
                    'Expected "<index>. <label>" but got: {!r}'.format(line)
                )
            data_dict[idx] = label
    return data_dict

def get_most_frequent(lst):
    """Return the most common entry of ``lst`` with surrounding whitespace removed.

    Ties go to the entry that sorts first: ``np.unique`` yields the distinct
    entries in sorted order and the ranking sort below is stable.
    """
    distinct, occurrences = np.unique(lst, return_counts=True)
    ranking = list(zip(distinct, occurrences))
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    winner, _ = ranking[0]
    return winner.strip()

def get_longest(lst):
    """Return the longest entry of ``lst`` with surrounding whitespace removed.

    Length is measured before stripping; among equally long entries the one
    that appears first in ``lst`` is returned (the sort is stable).
    """
    sizes = [len(entry) for entry in lst]
    ranking = list(zip(lst, sizes))
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    longest, _ = ranking[0]
    return longest.strip()

def get_ads_data(ads_key):
    """Collect every available annotation for one ad.

    Reads the module-level ``annotation_data`` mapping and decodes sentiment
    and topic annotations through ``sentiment_key`` / ``topic_key``.

    Args:
        ads_key: Key used to index each annotation mapping.

    Returns:
        dict with ``'ads_id'`` plus one entry per annotation type that has
        data for ``ads_key``. Sentiment entries are decoded using the first
        element of each annotation, topic entries using each annotation
        directly. If decoding fails, the raw annotation is kept.
    """
    ads_data = {'ads_id': ads_key}
    for key, annotations in annotation_data.items():
        try:
            ads_data[key] = annotations[ads_key]
            if key == 'sentiment':
                ads_data[key] = [sentiment_key[l[0]] for l in ads_data[key]]
            if key == 'topic':
                ads_data[key] = [topic_key[i] for i in ads_data[key]]
        except (LookupError, TypeError):
            # Best effort: a missing ad or an annotation that cannot be decoded
            # is skipped. Only lookup/type failures are tolerated, so
            # interrupts and unrelated errors still surface.
            continue
    return ads_data
# get_ads_data("10/170741.png")

# Lookup tables built by parse_list (text before the first '. ' -> rest of the
# line); get_ads_data uses them to decode sentiment and topic annotations.
sentiment_key = parse_list(annotation_dir + 'Sentiments_List.txt')
topic_key = parse_list(annotation_dir + 'topics.txt')
# topic_key
# sentiment_key

# 2 Atypicality Data 

## 2.1 Intrinsic

In [116]:
# ads_id is the image URL with the hosting prefix removed (literal match, no regex).
atypicality_df['ads_id'] = atypicality_df['image_url'].str.replace(
    'https://people.cs.pitt.edu/~mzhang/image_ads/', '', regex=False
)
# Binary label: 0 for 'Regular_Object', 1 for every other category value.
atypicality_df['atypicality'] = (atypicality_df.category != 'Regular_Object').astype('int64')
atypicality_df.category.unique()

array(['2', 'Regular_Object', '8', '3', '9', '5', '1', '6', '4', '7'],
      dtype=object)

In [117]:
# First element of every sampled entry is treated as the id of a test ad.
test_ads_id = [sample[0] for sample in sampled_ads_id_200]
# All annotated ads that are not part of the test sample.
all_ads_id = [
    ads_id
    for ads_id in list(atypicality_df.ads_id.unique())
    if ads_id not in test_ads_id
]
len(all_ads_id)

4072

In [118]:
# Candidate training ads: exactly three non-null category annotations and not
# part of the test sample.
train_ads_id = [
    ads_id
    for ads_id in (
        atypicality_df
        .groupby('ads_id')['category']
        .count()
        .loc[lambda counts: counts == 3]
        .index
    )
    if ads_id not in test_ads_id
]
len(train_ads_id)

2574

In [119]:
# Draw 300 training ads from the candidates.
# NOTE(review): this overwrites its own input, so re-running the cell samples
# from the already-reduced list, and the draw depends on the state of the
# global `random` generator (seeded with 1 in the import cell) at the moment
# the cell runs.
train_ads_id = random.sample(train_ads_id, 300)
# train_ads_id

In [121]:
# Annotation rows that belong to the sampled training ads, re-indexed from 0.
atypicality_train_df = (
    atypicality_df[atypicality_df.ads_id.isin(train_ads_id)]
    .reset_index(drop = True)
)
atypicality_train_df.shape

(900, 7)

In [155]:
# Share of individual annotations labelled atypical. Taking the mean instead
# of dividing by a hard-coded 900 keeps this correct if the sample size changes.
atypicality_train_df.atypicality.mean()

0.4088888888888889

In [152]:
# Ads whose atypicality labels sum to 0, i.e. no annotation marked them atypical.
(
    atypicality_train_df
    .groupby('ads_id')[['atypicality']]
    .sum()
    .loc[lambda votes: votes.atypicality == 0]
    .shape
)

(115, 1)

In [154]:
# Share of training ads with at least one atypical annotation, computed from
# the data instead of re-typing the counts shown in earlier outputs.
(atypicality_train_df.groupby('ads_id')['atypicality'].sum() > 0).mean()

0.6166666666666667

In [150]:
# Display the training annotations (one row per annotation).
atypicality_train_df

Unnamed: 0,image_url,category,first_explaination,second_explanation,bounding_box,ads_id,atypicality
0,https://people.cs.pitt.edu/~mzhang/image_ads/0...,Regular_Object,,,{},0/100080.jpg,0
1,https://people.cs.pitt.edu/~mzhang/image_ads/0...,Regular_Object,,,{},0/100080.jpg,0
2,https://people.cs.pitt.edu/~mzhang/image_ads/0...,Regular_Object,,,"[{""left"":311-""top"":136-""width"":90-""height"":90-...",0/100080.jpg,0
3,https://people.cs.pitt.edu/~mzhang/image_ads/0...,8,a chain inside a knee/leg,the knee,"[{""left"":57-""top"":102-""width"":139-""height"":110...",0/100790.jpg,1
4,https://people.cs.pitt.edu/~mzhang/image_ads/0...,5,leg- scales,,"[{""left"":55-""top"":103-""width"":128-""height"":99-...",0/100790.jpg,1
...,...,...,...,...,...,...,...
895,https://people.cs.pitt.edu/~mzhang/image_ads/2...,Regular_Object,,,{},2/9102.jpg,0
896,https://people.cs.pitt.edu/~mzhang/image_ads/2...,Regular_Object,,,{},2/9102.jpg,0
897,https://people.cs.pitt.edu/~mzhang/image_ads/2...,Regular_Object,,,{},2/93222.jpg,0
898,https://people.cs.pitt.edu/~mzhang/image_ads/2...,Regular_Object,,,"[{""left"":10-""top"":6-""width"":490-""height"":442-""...",2/93222.jpg,0


In [124]:
# Process a single task: name of the label column that is aggregated per ad below.
tmp_task_col = 'atypicality' 

def get_score_distribution(s, offset = 3):
    """Return the values of ``s`` as a list, each shifted by ``offset``."""
    shifted = []
    for value in s.values:
        shifted.append(value + offset)
    return shifted

def get_value_counts(lst):
    """Map each distinct value in ``lst`` to the number of times it occurs."""
    distinct, occurrences = np.unique(lst, return_counts=True)
    return {value: count for value, count in zip(distinct, occurrences)}

def get_majority(value_counts):
    """Return the label with the highest count.

    On a tie, the label that comes first in ``value_counts`` wins because the
    sort is stable.
    """
    ranking = list(value_counts.items())
    ranking.sort(key=lambda item: item[1], reverse=True)
    majority_label, _ = ranking[0]
    return majority_label

def get_majority_percentage(value_counts):
    """Return the highest count as a fraction of all counts, rounded to 2 decimals."""
    ranking = list(value_counts.items())
    ranking.sort(key=lambda item: item[1], reverse=True)
    _, majority_count = ranking[0]
    total = sum(value_counts.values())
    return round(majority_count / total, 2)


# all_instrinsic_data = None

# Per-ad statistics for the column named by tmp_task_col.
# Distribution: list of that ad's labels (offset 0, so values are unchanged).
tmp_distribution_data = atypicality_train_df[['ads_id', tmp_task_col]].groupby('ads_id').agg(lambda x: get_score_distribution(x, 0))[tmp_task_col]
# Value counts: {label: number of occurrences} for each ad.
tmp_value_count_data = tmp_distribution_data.apply(get_value_counts)
# Average: mean of the labels.
tmp_average_data = tmp_distribution_data.apply(lambda x: np.mean(x))
# Disagreement: variance of the labels (np.var with its default ddof=0).
tmp_disagreement_data = tmp_distribution_data.apply(lambda x: np.var(x))
# Majority label and the fraction of labels that agree with it.
tmp_majority_data = tmp_value_count_data.apply(get_majority)
tmp_majority_percentage_data = tmp_value_count_data.apply(get_majority_percentage)

# One row per ad; column names are prefixed with tmp_task_col.
tmp_instrinsic_data = pd.DataFrame({
    'ads_id': tmp_distribution_data.index,
    tmp_task_col + '_distribution': tmp_distribution_data.values,
    tmp_task_col + '_value_count': tmp_value_count_data.values,
    tmp_task_col + '_average': tmp_average_data.values ,
    tmp_task_col + '_disagreement': tmp_disagreement_data.values,
    tmp_task_col + '_majority': tmp_majority_data.values,
    tmp_task_col + '_majority_percentage': tmp_majority_percentage_data.values
})
# if all_instrinsic_data is None:
#     all_instrinsic_data = tmp_instrinsic_data
# else:
#     all_instrinsic_data = all_instrinsic_data.merge(tmp_instrinsic_data, on = 'ads_id')
# all_instrinsic_data.append(tmp_instrinsic_data)

tmp_instrinsic_data.head()

Unnamed: 0,ads_id,atypicality_distribution,atypicality_value_count,atypicality_average,atypicality_disagreement,atypicality_majority,atypicality_majority_percentage
0,0/100080.jpg,"[0, 0, 0]",{0: 3},0.0,0.0,0,1.0
1,0/100790.jpg,"[1, 1, 1]",{1: 3},1.0,0.0,1,1.0
2,0/10200.jpg,"[0, 0, 0]",{0: 3},0.0,0.0,0,1.0
3,0/104190.jpg,"[1, 1, 1]",{1: 3},1.0,0.0,1,1.0
4,0/10430.jpg,"[0, 1, 0]","{0: 2, 1: 1}",0.333333,0.222222,0,0.67


In [157]:
# Average, over ads, of the fraction of labels that agree with the majority label.
tmp_instrinsic_data.atypicality_majority_percentage.mean()

0.8713

In [78]:
# Re-seed the global `random` generator.
# NOTE(review): the import cell already seeds with 1, and no call to `random`
# is visible after this cell in this part of the notebook — confirm it is needed.
random.seed(1)

In [125]:
# Flag ads that belong to the sampled test set, then keep only training ads.
# The entries of sampled_ads_id_200 are indexed with [0] where test_ads_id is
# built, so testing an ads_id string against the raw entries would never match
# if they are sequences. Build the id set explicitly: string entries are used
# as-is, any other entry contributes its first element.
sampled_test_ids = {
    entry if isinstance(entry, str) else entry[0]
    for entry in sampled_ads_id_200
}
tmp_instrinsic_data['dataset'] = tmp_instrinsic_data.ads_id.apply(lambda x: "test" if x in sampled_test_ids else "train")
tmp_instrinsic_data = tmp_instrinsic_data.query('dataset == "train"')
# tmp_instrinsic_data = tmp_instrinsic_data.sample(100).reset_index(drop = True)
tmp_instrinsic_data.head(2)

Unnamed: 0,ads_id,atypicality_distribution,atypicality_value_count,atypicality_average,atypicality_disagreement,atypicality_majority,atypicality_majority_percentage,dataset
0,0/100080.jpg,"[0, 0, 0]",{0: 3},0.0,0.0,0,1.0,train
1,0/100790.jpg,"[1, 1, 1]",{1: 3},1.0,0.0,1,1.0,train


In [143]:
# Distinct per-ad disagreement (variance) values.
tmp_instrinsic_data.atypicality_disagreement.unique()

array([0.        , 0.22222222])

In [126]:
# Sanity check: (number of ads, number of columns).
tmp_instrinsic_data.shape

(300, 8)

In [128]:
# Save the per-ad statistics to the MTurk data folder (row index not written).
tmp_instrinsic_data.to_csv(mturk_data_dir + 'modeling_instrinsic_atypicality_train.csv', index = False)

In [146]:
# atypicality training data
import os
import shutil

# Copy every training image into atypicality_train/, mirroring the sub-folder
# taken from the first component of the ads_id. shutil.copy replaces the former
# `cp` shell call: no shell is involved, so unusual characters in a path cannot
# break or alter the command, and a missing source file is reported instead of
# being silently ignored.
os.makedirs(mturk_data_dir + 'atypicality_train/', exist_ok=True)
missing_images = []
for ads_id in tmp_instrinsic_data.ads_id.values:
    tmp_dir = mturk_data_dir + 'atypicality_train/' + ads_id.split('/')[0]
    os.makedirs(tmp_dir, exist_ok=True)
    source_path = mturk_data_dir + '../original_data/' + ads_id
    if not os.path.isfile(source_path):
        # Keep going (as the shell call did) but remember what was skipped.
        missing_images.append(ads_id)
        continue
    shutil.copy(source_path, mturk_data_dir + 'atypicality_train/' + ads_id)
if missing_images:
    print('{} source images not found, skipped: {}'.format(len(missing_images), missing_images))

In [144]:
# tmp_instrinsic_data.atypicality_average.unique()

In [145]:
# Display the per-ad statistics for the training ads.
tmp_instrinsic_data

Unnamed: 0,ads_id,atypicality_distribution,atypicality_value_count,atypicality_average,atypicality_disagreement,atypicality_majority,atypicality_majority_percentage,dataset
0,0/100080.jpg,"[0, 0, 0]",{0: 3},0.000000,0.000000,0,1.00,train
1,0/100790.jpg,"[1, 1, 1]",{1: 3},1.000000,0.000000,1,1.00,train
2,0/10200.jpg,"[0, 0, 0]",{0: 3},0.000000,0.000000,0,1.00,train
3,0/104190.jpg,"[1, 1, 1]",{1: 3},1.000000,0.000000,1,1.00,train
4,0/10430.jpg,"[0, 1, 0]","{0: 2, 1: 1}",0.333333,0.222222,0,0.67,train
...,...,...,...,...,...,...,...,...
295,2/87792.jpg,"[1, 1, 1]",{1: 3},1.000000,0.000000,1,1.00,train
296,2/88052.jpg,"[0, 0, 0]",{0: 3},0.000000,0.000000,0,1.00,train
297,2/90392.jpg,"[1, 0, 1]","{0: 1, 1: 2}",0.666667,0.222222,1,0.67,train
298,2/9102.jpg,"[0, 0, 0]",{0: 3},0.000000,0.000000,0,1.00,train


## 2.2 Pairwise

In [137]:
all_pairwise_data = []

tmp_task_col = 'atypicality_average'
tmp_average_data = tmp_instrinsic_data[tmp_task_col].values
tmp_ads_id = tmp_instrinsic_data["ads_id"].values

# One record per unordered pair of ads (first < second, in table order):
# the two ids joined by ', ' and the signed difference of their averages.
tmp_all_diff = [
    {
        'ads_pair': tmp_ads_id[first] + ', ' + tmp_ads_id[second],
        tmp_task_col + '_diff': tmp_average_data[first] - tmp_average_data[second],
    }
    for first in range(len(tmp_ads_id))
    for second in range(first + 1, len(tmp_ads_id))
]
# Number of pairs whose averages differ by more than 1.
counter = sum(
    1 for record in tmp_all_diff
    if abs(record[tmp_task_col + '_diff']) > 1
)
print(tmp_task_col, counter)
tmp_diff_df = pd.DataFrame(tmp_all_diff)

#### IMPORTANT: saving data file ####
tmp_diff_df.to_csv(mturk_data_dir + 'modeling_atypicality_diff_train.csv', index = False)
    

atypicality_average 0


In [138]:
# Sanity check: (number of ad pairs, number of columns).
tmp_diff_df.shape

(44850, 2)

In [139]:
# tmp_diff_df

In [140]:
# Distinct difference values among pairs whose averages differ by more than 0.5.
tmp_diff_df.loc[
    tmp_diff_df.atypicality_average_diff.abs() > 0.5,
    'atypicality_average_diff',
].unique()

array([-1.        , -0.66666667,  1.        ,  0.66666667, -0.66666667,
        0.66666667])

In [141]:
# Number of pairs whose averages differ by more than 0.5.
tmp_diff_df[tmp_diff_df.atypicality_average_diff.abs() > 0.5].shape

(17985, 2)

In [142]:
# Share of pairs whose averages differ by more than 0.5, computed from the
# data instead of re-typing the counts shown in earlier outputs.
(tmp_diff_df.atypicality_average_diff.abs() > 0.5).mean()

0.40100334448160535