# Google Custom Search API

## Part I: Getting URLs

To use the API, we need an engine ID and an API key from the [Google Custom Search API](https://developers.google.com/custom-search/v1/overview). The code presented here queries keywords for recyclable and non-recyclable items and stores the results in a pandas DataFrame.

In [None]:
from apiclient.discovery import build
import cnfg
import pandas as pd
import time
from IPython import display

In [None]:
# load config file (engine ID and api key registered from Google API)
# The loaded config is used below as a mapping of the form:
# config = {'api_key': 'XXXX', 'engine_ID': 'XXXX'}
# NOTE(review): despite its name, `url` is a local filesystem path, and it is
# specific to the original author's machine; point it at your own config file.
url = '/Users/lkchemposer/.googleapi_config'
config = cnfg.load(url)

In [None]:
# build a collection
# Create the Custom Search (v1) API client using the API key from the config,
# then keep a handle on its `cse` resource; the query cells below call
# `collections.list(...)` on it to build each search request.
service = build('customsearch', 'v1', developerKey=config['api_key'])
collections = service.cse()

In [None]:
# keywords to query (from https://www1.nyc.gov/assets/dsny/site/services/recycling/what-to-recycle)
# Each keyword is sent as one image-search query and stored as the initial
# `class` of every image it returns. The per-material lists are reused later
# to map each keyword back to its material name.

# --- recyclables, grouped by material ---
metal = ['metal can', 'crushed metal can', 'pet food can', 'paint can', 'soup can', 'aluminum foil', 'aluminum tray',
         'metal lid', 'metal wire hanger', 'metal pot', 'metal tool', 'metal curtain rod', 'license plate']
glass = ['glass jar', 'glass soda bottles']
plastic = ['plastic soda bottles', 'plastic water bottle', 'hard plastic water bottle', 'plastic milk jug',
           'plastic jar', 'plastic lid', 'plastic tupperware', 'plastic food container', 'plastic cookie insert',
           'plastic yogurt container', 'plastic dairy tub', 'plastic clamshell container',
           'plastic blister pack container', 'acetate box', 'plastic flower pot', 'plastic mixing bowl',
           'plastic crate', 'plastic bucket', 'plastic pail', 'plastic chair', 'plastic toy']
paper = ['carton food box', 'carton box', 'milk carton packaging', 'beverage carton packaging', 'carton drink box',
         'carton aseptic package', 'carton juice box', 'carton soup', 'newspaper', 'magazine', 'yellow pages',
         'mixed paper', 'white scrunched paper', 'lined paper', 'crumpled paper', 'sheet music', 'envelope',
         'paper receipt', 'paper bag', 'wrapping paper', 'paperback book', 'comic book', 'cardboard egg carton',
         'cardboard tray', 'cardboard shoe box', 'cardboard tube', 'paper file folders', 'cardboard packaging',
         'pizza box', 'cardboard sleeve', 'paper cup', 'corrugated cardboard']
# every recyclable keyword, in the order it will be queried
recs = metal + glass + plastic + paper


# --- non-recyclables, grouped by category ---
nrplastic = ['candy wrapper', 'spiral binding', 'styrofoam container', 'styrofoam plate', 'styrofoam cup',
             'styrofoam tray', 'foam packing peanut', 'flexible plastic tube', 'lotion', 'toothpaste tubes',
             'cosmetics', 'basketball ball', 'bowling ball', 'soccer ball', 'american football ball', 'yoga ball',
             'plastic shopping bag']
nrglass = ['light bulb', 'mirror', 'glassware']
tanglers = ['cable', 'wire', 'cord', 'hose']
other = ['battery', 'printer cartridge', 'ceramic', 'cigarette lighter', 'gas lighter', 'cassette', 'VHS tape',
         'pen', 'marker']

# every non-recyclable keyword, in the order it will be queried
nonrecs = nrplastic + nrglass + tanglers + other

In [None]:
# query recyclables keywords to get URLs, image type, and caption
#
# For every keyword, page through the image results (10 per page, at most 100)
# and append that keyword's rows to Recs.csv as soon as its paging ends, so a
# keyword with fewer than 100 results is still saved under its own name.
# The CSV has no header and is read back with names=sorted([...]), so the
# columns are written in that same sorted order explicitly.
columns = sorted(['class', 'link', 'type', 'caption'])
failures = []  # (keyword, start index, error) for every request that failed

for query in recs:
    # checking progress at the start of each keyword
    display.clear_output()
    print(query)

    rows = []
    for start in range(1, 100, 10):  # maximum 100 results
        request = collections.list(q=query, start=start, filter='1',  # no duplicate results
                                   searchType='image', imgType='photo', imgColorType='color',
                                   cx=config['engine_ID'])
        time.sleep(1)  # throttle successive requests
        try:
            response = request.execute()
        except Exception as err:
            # Give up on further pages of this keyword, but remember why.
            # Catching Exception (not a bare except) keeps the cell interruptible.
            failures.append((query, start, err))
            break

        items = response.get('items', [])
        if not items:  # the search ran out of results for this keyword
            break
        for image in items:
            rows.append({'class': query,
                         'link': image['link'],
                         'type': image.get('mime'),
                         'caption': image.get('title', '').lower()})

    if rows:  # store results in csv
        images = pd.DataFrame(rows, columns=columns)
        images.to_csv('Recs.csv', mode='a', index=False, header=False)

# report the requests that failed instead of dropping them silently
for failed_query, failed_start, err in failures:
    print('{} (start={}): {}'.format(failed_query, failed_start, err))

In [None]:
# same for non-recyclables
#
# For every keyword, page through the image results (10 per page, at most 100)
# and append that keyword's rows to Nonrecs.csv as soon as its paging ends, so
# a keyword with fewer than 100 results is still saved under its own name.
# The CSV has no header and is read back with names=sorted([...]), so the
# columns are written in that same sorted order explicitly.
columns = sorted(['class', 'link', 'type', 'caption'])
failures = []  # (keyword, start index, error) for every request that failed

for query in nonrecs:
    # checking progress at the start of each keyword
    display.clear_output()
    print(query)

    rows = []
    for start in range(1, 100, 10):  # maximum 100 results
        request = collections.list(q=query, start=start, filter='1',  # no duplicate results
                                   searchType='image', imgType='photo', imgColorType='color',
                                   cx=config['engine_ID'])
        time.sleep(1)  # throttle successive requests
        try:
            response = request.execute()
        except Exception as err:
            # Give up on further pages of this keyword, but remember why.
            # Catching Exception (not a bare except) keeps the cell interruptible.
            failures.append((query, start, err))
            break

        items = response.get('items', [])
        if not items:  # the search ran out of results for this keyword
            break
        for image in items:
            rows.append({'class': query,
                         'link': image['link'],
                         'type': image.get('mime'),
                         'caption': image.get('title', '').lower()})

    if rows:  # store results in csv
        images = pd.DataFrame(rows, columns=columns)
        images.to_csv('Nonrecs.csv', mode='a', index=False, header=False)

# report the requests that failed instead of dropping them silently
for failed_query, failed_start, err in failures:
    print('{} (start={}): {}'.format(failed_query, failed_start, err))

## Part II: Image Downloading and Cleaning

Here, we:
1. Remove duplicate links and captions
2. Organize classes of material in the CSV files
3. Download images and store them in appropriately named folders

In [None]:
# Load the scraped rows and drop repeats, first by link and then by caption
# (same images sometimes have distinct links but same caption).
csv_columns = sorted(['class', 'link', 'type', 'caption'])

rdf = (pd.read_csv('Recs.csv', names=csv_columns)
       .drop_duplicates('link')
       .drop_duplicates('caption'))

nrdf = (pd.read_csv('Nonrecs.csv', names=csv_columns)
        .drop_duplicates('link')
        .drop_duplicates('caption'))

rdf.head()

In [None]:
# classify items into materials
# material name -> list of search keywords belonging to that material
recsd = dict(metal=metal, glass=glass, plastic=plastic, paper=paper)
nrecsd = dict(glass=nrglass, plastic=nrplastic, tanglers=tanglers, other=other)


def rclassify(row):
    """Return the recyclable material whose keyword list contains row['class'].

    Materials are tried in the insertion order of `recsd`; the result is None
    when no keyword list contains the value.
    """
    matches = (str(material) for material in recsd if row['class'] in recsd[material])
    return next(matches, None)


rdf['class'] = rdf.apply(rclassify, axis=1)
rdf.head()

In [None]:
def nrclassify(row):
    """Return the non-recyclable category whose keyword list contains row['class'].

    Categories are tried in the insertion order of `nrecsd`; the result is
    None when no keyword list contains the value.
    """
    matches = (str(category) for category in nrecsd if row['class'] in nrecsd[category])
    return next(matches, None)


nrdf['class'] = nrdf.apply(nrclassify, axis=1)
nrdf.head()

In [None]:
# make directories for image files
import os
import sys
from urllib.request import urlretrieve as download

# one folder per (recyclable / non-recyclable, material) pair
drs = ['recs_glass', 'recs_metal', 'recs_paper', 'recs_plastic',
       'nonrecs_glass', 'nonrecs_other', 'nonrecs_tanglers', 'nonrecs_plastic']

for i in drs:
    # makedirs also creates the parent ./data folder when it is missing, and
    # exist_ok lets this cell be re-run without raising FileExistsError
    os.makedirs(os.path.join('./data/', i), exist_ok=True)

In [None]:
# downloading images from urls
#
# Each image is saved as ./data/<prefix>_<class>/<class>-<row index>.jpg.
# Downloading is best-effort: a link that cannot be fetched is skipped and
# counted, and the loop carries on with the next row.
# Both frames are downloaded, so the nonrecs_* folders are filled as well.
failed = 0
for prefix, frame in (('recs', rdf), ('nonrecs', nrdf)):
    for i, link, material in zip(frame.index, frame['link'], frame['class']):
        target = './data/{}_{}/{}-{}.jpg'.format(prefix, material, material, i)
        try:
            download(link, target)
        except Exception:
            # Exception (not a bare except) so the loop can still be interrupted
            failed += 1
            continue
        time.sleep(1)  # pause between successful downloads

print('{} downloads failed'.format(failed))

## Part III: Deleting Erroneous Image Files

Lastly, some image files cannot be opened by PIL, so we remove them to prevent errors from occurring in the model.

In [None]:
from PIL import Image

# Walk every image folder and delete the files that PIL fails to open.
for j in drs:
    folder = os.path.join('./data/', j)
    for i in os.listdir(folder):
        p = os.path.join(folder, i)
        try:
            # the context manager closes the file handle once the check is done
            with Image.open(p):
                pass
        except Exception:
            # Exception (not a bare except): interrupting the kernel during the
            # check must not delete a perfectly good file
            os.remove(p)