In [1]:
import json
import os

from collections import defaultdict

In [2]:
DATA_FILE = '../data/official/CP1_train_ads.json' # 8226390d0e8ce5f95dd72f54efa43aa4

In [3]:
!md5sum $DATA_FILE

8226390d0e8ce5f95dd72f54efa43aa4  ../data/official/CP1_train_ads.json


# Curating Positive Training Data

In [4]:
cdr_ids = set()

with open(DATA_FILE, 'rb') as infile:
    for line in infile:
        ad = json.loads(line.strip())
        cdr_ids.add(ad['doc_id'])
        
with open('../data/CP1_cdr_ids.txt', 'wb') as outfile:
    for cdr_id in cdr_ids:
        outfile.write('%s\n' % cdr_id)

In [8]:
!time parallel -j4 --joblog ../data/misc/get_es_child_documents.log \
                   --arg-file ../data/CP1_cdr_ids.txt \
                   --retries 3 \
                   --max-args 100 \
                   python ../scripts/get_es_child_documents.py >> ../data/CP1_image_documents.json
                
# View any jobs that failed:
# awk '$7 != 0' ../data/misc/get_es_child_documents.log


real	10m1.335s
user	5m15.614s
sys	1m3.469s


In [9]:
!sort -u ../data/CP1_image_documents.json > tmp.json
!mv tmp.json ../data/CP1_image_documents.json

In [10]:
# Figure out unique URLs in the image documents and map them to their doc_ids for a sane downloaded name
image_url_to_doc_ids = defaultdict(set)

with open('../data/CP1_image_documents.json', 'rb') as infile:
    for line in infile:
        image_doc = json.loads(line)
        
        if image_doc['obj_stored_url']:
            image_url_to_doc_ids[image_doc['obj_stored_url']].add(image_doc['doc_id'])
        
# Construct file for parallel downloading
with open('../data/CP1_image_urls.txt', 'wb') as outfile:
    for (url, doc_ids) in image_url_to_doc_ids.iteritems():
        outfile.write(' '.join([url] + list(doc_ids)) + '\n') 

In [13]:
# Retries are important here since servers can get bogged down
# The joblog will reveal with the exit status of the script to determine
# if anything may be able to be retrieved
!time parallel -j10 \
               --joblog ../data/misc/dl_images.log \
               --arg-file ../data/CP1_image_urls.txt \
               --retries 3 \
               --colsep ' ' \
               python ../scripts/download_url_for_doc_ids.py > ../data/CP1_url_sha.txt
            
!find ../data/CP1_imageset -type f -size 0 -delete

^C


In [14]:
!sort -u ../data/CP1_url_sha.txt > tmp.txt
!mv tmp.txt ../data/CP1_url_sha.txt 

## Formalize CSV 

In [15]:
# Curate a standard CSV
# cluster_id,ad_id,sha1

# CP1_image_documents.json gives ad_id and image doc_id which can be mapped to sha via image_doc_id_sha
clusters_ads = set()

# cluster_id,ad_id are given from the official DATA_FILE
with open(DATA_FILE, 'rb') as infile:
    for line in infile:
        ad = json.loads(line.strip())
        clusters_ads.add((ad['cluster_id'], ad['doc_id']))

In [16]:
print '%d ads across %d clusters.' % (len(set([x[1] for x in clusters_ads])),
                                      len(set([x[0] for x in clusters_ads]))) 

203172 ads across 450 clusters.


In [17]:
# Retrieve the sha1 through the CP1_url_shas.txt through the obj_stored_url in CP1_image_documents.json
# Going from image to ad will always yield an ad
with open('../data/CP1_image_documents.json', 'rb') as infile:
    ad_id_image_urls = defaultdict(set)
    
    for line in infile:
        image_doc = json.loads(line)
        
        if not isinstance(image_doc['obj_parent'], list):
            image_doc['obj_parent'] = [image_doc['obj_parent']]
                
        for ad_id in image_doc['obj_parent']:
            ad_id_image_urls[ad_id].add(image_doc['obj_stored_url'])

In [18]:
print '%d image URLs exist across %d ads.' % (sum([len(x) for x in ad_id_image_urls.values()]), len(ad_id_image_urls))

914576 image URLs exist across 121901 ads.


In [19]:
# Go from obj_stored_url to sha1
with open('../data/CP1_url_sha.txt', 'rb') as infile:
    url_sha = {}
    
    for line in infile:
        url, sha = line.strip().split()
        url_sha[url] = sha

In [20]:
print '%d unique URLs.' % len(url_sha)

232794 unique URLs.


In [21]:
import csv

with open('../data/CP1_clusters_ads_images.csv', 'wb') as outfile:
    writer = csv.writer(outfile)
    
    # Headers
    writer.writerow(['cluster_id', 'ad_id', 'sha1'])
    
    for (cluster_id, ad_id) in clusters_ads:
        # Finding a url should always work
        # Finding the sha from a url may not, if the URL failed to be retrieved (404, whatever)
        image_urls_from_ad = ad_id_image_urls[ad_id] 
        image_shas_from_ad = set([url_sha[x] for x in image_urls_from_ad if x in url_sha])
        
        for sha1 in image_shas_from_ad:
            writer.writerow((cluster_id, ad_id, sha1))

# Curating Negative Data