In [1]:
import pandas as pd
import json 
import numpy as np

def load_json(filename):
    """
    Parse the JSON document stored at `filename` and return its content.
    A missing file is not an error: an empty dictionary is returned instead.
    """
    try:
        with open(filename, 'r') as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        parsed = {}
    return parsed
    
def get_majority(lst):
    """
    Return the most frequent value in `lst`.
    Ties are resolved in favour of the smallest value, because np.unique
    yields the distinct values in sorted order and the sort below is stable.
    The result is the numpy scalar produced by np.unique.
    """
    distinct, freqs = np.unique(lst, return_counts = True)
    ranked = list(zip(distinct, freqs))
    ranked.sort(key = lambda pair: pair[1], reverse = True)
    winner, _ = ranked[0]
    return winner

def get_lst_overlap(lst1, lst2):
    """
    Return the elements of `lst1` that also occur in `lst2`.

    The order of `lst1` is preserved, and an element repeated in `lst1`
    is repeated in the result. Membership is checked against a set built
    from `lst2`, so the cost is linear instead of len(lst1) * len(lst2).
    If either list holds unhashable items, the function falls back to
    scanning `lst2` directly.
    """
    try:
        lookup = set(lst2)
    except TypeError:
        # lst2 contains unhashable items: keep the plain sequence scan
        lookup = lst2
    try:
        return [value for value in lst1 if value in lookup]
    except TypeError:
        # an unhashable item in lst1 cannot be probed against a set
        return [value for value in lst1 if value in lst2]

# get_majority([0, 0, 1, 2, 2, 2])

# 1 Load Existing Data

- Existing Annotated Ads

In [2]:
## collect the ids of ads that are already part of an annotated set
existing_id_pool = set()

# the five mturk subsets (subset_0.1 ... subset_0.5)
for i in range(1, 6):
    tmp_df = pd.read_csv('../../pitt_ads/mturk_data/subset_0.{}/subset_ads_data_0.{}.csv'.format(i, i))
    existing_id_pool.update(tmp_df.ads_id)

# plus the ads listed in the modeling train file
modeling_instrinsic_atypicality_train = pd.read_csv('../../pitt_ads/mturk_data/modeling_instrinsic_atypicality_train.csv')
existing_id_pool.update(modeling_instrinsic_atypicality_train.ads_id.values)

# unique ids as a list (ordering is arbitrary)
existing_ids = list(existing_id_pool)
len(existing_ids)

393

- Current Topic Distribution

In [3]:
# topic annotations, indexed by ads_id
topic_data = load_json('../../pitt_ads/annotations_images/image/Topics.json')

# ads of subset 0.5, each labelled with the majority topic of its annotations
mturk_data = pd.read_csv('../../pitt_ads/mturk_data/subset_0.5/subset_ads_data_0.5.csv')
majority_topics = mturk_data.ads_id.apply(lambda ads_id: get_majority(topic_data[ads_id]))
mturk_data['topic'] = majority_topics

# how many ads fall under each topic
mturk_data['topic'].value_counts()

topic
19    4
9     3
17    3
1     2
10    2
26    1
25    1
8     1
3     1
16    1
31    1
Name: count, dtype: int64

In [17]:
# The same per-topic counts as above, converted to a plain {topic: count} dict.
dict(mturk_data['topic'].value_counts())

{'19': 4,
 '9': 3,
 '17': 3,
 '1': 2,
 '10': 2,
 '26': 1,
 '25': 1,
 '8': 1,
 '3': 1,
 '16': 1,
 '31': 1}

- Load Other Annotations (strategies and sentiments)

In [4]:
# Load the strategy and sentiment annotation files.
# load_json returns {} when a file is missing, so a length of 0 below signals a wrong path.
strategy_data = load_json('../../pitt_ads/annotations_images/image/Strategies.json')
sentiments_data = load_json('../../pitt_ads/annotations_images/image/Sentiments.json')
# Number of entries in each annotation source: (strategies, sentiments, topics).
len(strategy_data), len(sentiments_data), len(topic_data)

(4000, 30340, 64340)

In [5]:
# ids annotated in all three sources; the ordering of the strategy keys is kept
strategy_and_topic_ids = get_lst_overlap(list(strategy_data), list(topic_data))
overlapping_ads_id = get_lst_overlap(strategy_and_topic_ids, list(sentiments_data))
len(overlapping_ads_id)

4000

- Available Ads (ads that have strategy, topic, and sentiment annotations; the atypicality-annotation source is currently commented out)

In [6]:
## get available ads
# Alternative source (disabled): ads taken from the atypicality annotation csv.
# atypicality_annotation_df = pd.read_csv('../../pitt_ads/atypicality_annotations_final_csv.csv')
# atypicality_annotation_df = atypicality_annotation_df.dropna(subset=['image_url'])
# atypicality_annotation_df['ads_id'] = atypicality_annotation_df.image_url.apply(lambda x: '/'.join(x.split('/')[-2:]))

# Take a copy rather than an alias: the filtering step further down removes
# entries from `available_ads` in place, and without the copy it would also
# silently shrink `overlapping_ads_id`.
available_ads = list(overlapping_ads_id) # list(set(atypicality_annotation_df.ads_id.values))
len(available_ads)

4000

In [7]:
# Remove ads that were already annotated (present in existing_ids) or that
# have no entry in topic_data, and count how many fall into each group.
existing_id_lookup = set(existing_ids)  # O(1) membership instead of scanning the list for every ad

duplicate_count = 0
no_data_count = 0
to_remove = []
for ads_id in available_ads:
    if ads_id in existing_id_lookup:
        duplicate_count += 1
        to_remove.append(ads_id)
    if ads_id not in topic_data:
        no_data_count += 1
        to_remove.append(ads_id)

# Filter in a single pass instead of calling list.remove once per id.
# Slice assignment keeps the update in place, as the original loop did.
removal_lookup = set(to_remove)
available_ads[:] = [ads_id for ads_id in available_ads if ads_id not in removal_lookup]

duplicate_count, no_data_count, len(available_ads)

(0, 0, 4000)

In [8]:
# Majority topic label per available ad: topic_data[ads_id] is handed to get_majority,
# which returns the most frequent label among that ad's annotations.
available_ads_topics = [get_majority(topic_data[ads_id]) for ads_id in available_ads]
# One row per available ad: its id and its (still uncleaned) majority topic label.
available_ads_df = pd.DataFrame({'ads_id': available_ads, 'topic': available_ads_topics})
def clean_topic(t):
    """
    Convert a topic label to an integer topic id.

    Returns int(t) when the label is numeric, and None when it cannot be
    converted (e.g. free-text labels or missing values). Only conversion
    errors are caught, so unrelated failures and KeyboardInterrupt are no
    longer silently swallowed.
    """
    try:
        return int(t)
    except (TypeError, ValueError, OverflowError):
        return None
# Convert topic labels to integers; labels clean_topic cannot convert come back as None.
available_ads_df.topic = available_ads_df.topic.apply(clean_topic)
# Drop rows with any missing value (this removes the unconverted topics) and renumber the index from 0.
available_ads_df = available_ads_df.dropna().reset_index(drop = True)
available_ads_df.head()

Unnamed: 0,ads_id,topic
0,10/170489.png,2
1,10/173962.png,7
2,10/170037.png,19
3,10/171489.png,18
4,10/170720.png,1


In [9]:
# (number of rows, number of columns) of the available-ads table.
available_ads_df.shape

(4000, 2)

# 2 Topic Sampling
food
1. "Restaurants, cafe, fast food" (ABBREVIATION: "restaurant")
2. "Chocolate, cookies, candy, ice cream" (ABBREVIATION: "chocolate")
3. "Chips, snacks, nuts, fruit, gum, cereal, yogurt, soups" (ABBREVIATION: "chips")
4. "Seasoning, condiments, ketchup" (ABBREVIATION: "seasoning")

5. "Pet food" (ABBREVIATION: "petfood")

drinks
6. "Alcohol" (ABBREVIATION: "alcohol")
7. "Coffee, tea" (ABBREVIATION: "coffee")
8. "Soda, juice, milk, energy drinks, water" (ABBREVIATION: "soda")

cars
9. "Cars, automobiles (car sales, auto parts, car insurance, car repair, gas, motor oil, etc.)" (ABBREVIATION: "cars")

electronics
10. "Electronics (computers, laptops, tablets, cellphones, TVs, etc.)" (ABBREVIATION: "electronics")

services
11. "Phone, TV and internet service providers" (ABBREVIATION: "phone_tv_internet_providers")
12. "Financial services (banks, credit cards, investment firms, etc.)" (ABBREVIATION: "financial")
14. "Security and safety services (anti-theft, safety courses, etc.)" (ABBREVIATION: "security")
15. "Software (internet radio, streaming, job search website, grammar correction, travel planning, etc.)" (ABBREVIATION: "software")
16. "Other services (dating, tax, legal, loan, religious, printing, catering, etc.)" (ABBREVIATION: "other_service")

education
13. "Education (universities, colleges, kindergarten, online degrees, etc.)" (ABBREVIATION: "education")

beauty
17. "Beauty products and cosmetics (deodorants, toothpaste, makeup, hair products, laser hair removal, etc.)" (ABBREVIATION: "beauty")

healthcare
18. "Healthcare and medications (hospitals, health insurance, allergy, cold remedy, home tests, vitamins)" (ABBREVIATION: "healthcare")

clothing
19. "Clothing and accessories (jeans, shoes, eye glasses, handbags, watches, jewelry)" (ABBREVIATION: "clothing")

home
22. "Cleaning products (detergents, fabric softeners, soap, tissues, paper towels, etc.)" (ABBREVIATION: "cleaning")
23. "Home improvements and repairs (furniture, decoration, lawn care, plumbing, etc.)" (ABBREVIATION: "home_improvement")
24. "Home appliances (coffee makers, dishwashers, cookware, vacuum cleaners, heaters, music players, etc.)" (ABBREVIATION: "home_appliance")
20. "Baby products (baby food, sippy cups, diapers, etc.)" (ABBREVIATION: "baby")

leisure
21. "Games and toys (including video and mobile games)" (ABBREVIATION: "game")
25. "Vacation and travel (airlines, cruises, theme parks, hotels, travel agents, etc.)" (ABBREVIATION: "travel")
26. "Media and arts (TV shows, movies, musicals, books, audio books, etc.)" (ABBREVIATION: "media")
27. "Sports equipment and activities" (ABBREVIATION: "sports")
29. "Gambling (lotteries, casinos, etc.)" (ABBREVIATION: "gambling")

shopping
28. "Shopping (department stores, drug stores, groceries, etc.)" (ABBREVIATION: "shopping")

social goods
30. "Environment, nature, pollution, wildlife" (ABBREVIATION: "environment")
31. "Animal rights, animal abuse" (ABBREVIATION: "animal_right")
32. "Human rights" (ABBREVIATION: "human_right")
33. "Safety, safe driving, fire safety" (ABBREVIATION: "safety")
34. "Smoking, alcohol abuse" (ABBREVIATION: "smoking_alcohol_abuse")
35. "Domestic violence" (ABBREVIATION: "domestic_violence")
36. "Self esteem, bullying, cyber bullying" (ABBREVIATION: "self_esteem")
38. "Charities" (ABBREVIATION: "charities")

37. "Political candidates (support or opposition)" (ABBREVIATION: "political")
39. "Unclear"

In [10]:
# super-topic name -> ids of the fine-grained topics grouped under it
cat2topics = {
    'food':           [1, 2, 3, 4],
    'pet':            [5],
    'drinks':         [6, 7, 8],
    'automobile':     [9],
    'electronics':    [10],
    'service':        [11, 12, 14, 15, 16],
    'education':      [13],
    'beauty':         [17],
    'healthcare':     [18],
    'clothing':       [19],
    'home':           [20, 22, 23, 24],
    'leisure':        [21, 25, 26, 27, 29],
    'shopping':       [28],
    'non-commercial': [30, 31, 32, 33, 34, 35, 36, 37, 38],
    'Unclear':        [39],
}

# inverse lookup: topic id -> super-topic name
topics2cat = {
    topic_id: super_topic
    for super_topic, topic_ids in cat2topics.items()
    for topic_id in topic_ids
}
# topics2cat

In [11]:
# Build {topic id: topic description} from topics.txt.
# Each line is parsed as "<id>.<description> (ABBREVIATION: ...)"; presumably the
# lines look like the numbered topic list in the markdown above — confirm against the file.
topic2text = {}
with open('../../pitt_ads/annotations_images/image/topics.txt', 'r') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing on int('')
        # Split on the FIRST period only: descriptions contain periods themselves
        # ("etc."), and splitting on every period cut those descriptions short.
        topic_id_str, remainder = line.split('.', 1)
        topic = int(topic_id_str)
        topic_text = remainder.split('(ABBREVIATION:')[0].strip()
        topic2text[topic] = topic_text
# topic2text

In [12]:
# Print every super-topic followed by the short names of its topics.
for cat in cat2topics:
    # keep only the part of each description that precedes the first '('
    short_names = [topic2text[topic].split('(')[0] for topic in cat2topics[cat]]
    # join() places the separators, so a category without topics prints "cat: []"
    # instead of a line mangled by trimming a separator that was never added
    cat_str = cat + ": [" + ', '.join(short_names) + ']'
    print(cat_str)

food: ["Restaurants, cafe, fast food", "Chocolate, cookies, candy, ice cream", "Chips, snacks, nuts, fruit, gum, cereal, yogurt, soups", "Seasoning, condiments, ketchup"]
pet: ["Pet food"]
drinks: ["Alcohol", "Coffee, tea", "Soda, juice, milk, energy drinks, water"]
automobile: ["Cars, automobiles ]
electronics: ["Electronics ]
service: ["Phone, TV and internet service providers", "Financial services , "Security and safety services , "Software , "Other services ]
education: ["Education ]
beauty: ["Beauty products and cosmetics ]
healthcare: ["Healthcare and medications ]
clothing: ["Clothing and accessories ]
home: ["Baby products , "Cleaning products , "Home improvements and repairs , "Home appliances ]
leisure: ["Games and toys , "Vacation and travel , "Media and arts , "Sports equipment and activities", "Gambling ]
shopping: ["Shopping ]
non-commercial: ["Environment, nature, pollution, wildlife", "Animal rights, animal abuse", "Human rights", "Safety, safe driving, fire safety", "S

In [13]:
# existing super-topic stats
mturk_data['super_topic'] = mturk_data['topic'].apply(lambda x: topics2cat[int(x)])
existing_data_count = dict(mturk_data.super_topic.value_counts())

In [14]:
# Attach two lookup columns derived from the topic id:
# the topic description (topic2text) and the super-topic name (topics2cat).
available_ads_df['topic_text'] = available_ads_df['topic'].apply(lambda x: topic2text[int(x)])
available_ads_df['super_topic'] = available_ads_df['topic'].apply(lambda x: topics2cat[int(x)])

In [15]:
# The first ten rows of the super-topic counts (value_counts orders by count, largest first).
available_ads_df.super_topic.value_counts().iloc[:10]

super_topic
food              1521
drinks             408
clothing           402
non-commercial     370
automobile         339
beauty             333
service            187
leisure            105
electronics         96
healthcare          84
Name: count, dtype: int64

In [16]:
# super_topic_list
# pitt_ads/mturk_data/sampled_ads_id_200.pkl
# available_ads_df.to_csv('available_ads_102024.csv', index = False)

In [132]:
N_SUPER_TOPICS = 10          # how many of the most frequent super-topics get annotated
TARGET_PER_SUPER_TOPIC = 10  # desired total of annotated ads per super-topic

# the N most frequent super-topics among the available ads
super_topic_list = list(available_ads_df.super_topic.value_counts().iloc[:N_SUPER_TOPICS].index)

# Ads still needed per super-topic = target minus what already exists.
# Clamped at 0 so an over-represented super-topic does not yield a negative
# sample size (random.sample would raise on it).
annotation_plan = {
    k: max(0, TARGET_PER_SUPER_TOPIC - existing_data_count.get(k, 0))
    for k in super_topic_list
}
sum(annotation_plan.values())

80

In [133]:
# {super_topic: number of ads still to be sampled for it}
annotation_plan

{'food': 7,
 'drinks': 9,
 'clothing': 6,
 'non-commercial': 9,
 'automobile': 7,
 'beauty': 7,
 'service': 9,
 'leisure': 8,
 'electronics': 8,
 'healthcare': 10}

In [136]:
import random

In [139]:
# Draw the planned number of ads per super-topic.
# A dedicated, seeded generator makes the selection reproducible on re-run
# without touching the global `random` state.
SAMPLING_SEED = 42
sampling_rng = random.Random(SAMPLING_SEED)

selected_ids = []
for k in annotation_plan:
    # boolean mask instead of a formatted query string (safe for any super-topic name)
    tmp_available_ids = list(available_ads_df.loc[available_ads_df.super_topic == k, 'ads_id'].values)
    # plan values may be numpy integers; sample() is given a plain non-negative int
    n_requested = max(int(annotation_plan[k]), 0)
    n_to_draw = min(n_requested, len(tmp_available_ids))
    if n_to_draw < n_requested:
        print('only {} ads available for "{}", {} requested'.format(len(tmp_available_ids), k, n_requested))
    selected_ids.extend(sampling_rng.sample(tmp_available_ids, k = n_to_draw))
len(selected_ids)

80

In [141]:
# Number of distinct sampled ids; it equals len(selected_ids) when no ad was drawn twice.
len(set(selected_ids))

80

# 3 Ads Image Pair Sampling