## Scrape metadata and images for each category

In [40]:
import json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import urllib.request
import sys
import getopt
from multiprocessing.dummy import Pool
import csv
import os
import requests
from sqlalchemy.exc import IntegrityError
import argparse
import pickle

In [41]:
# define function to fetch an API URL and decode its JSON body
def get_api_json(api_url, timeout=30):
    """Fetch ``api_url`` with an HTTP GET and return the decoded JSON body.

    Args:
        api_url: Full URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang a whole scraping run.

    Returns:
        The object produced by ``json.loads`` on the response text.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a 4xx/5xx status code.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    response = requests.get(api_url, timeout=timeout)
    # Fail here on HTTP errors rather than later with an unrelated KeyError.
    response.raise_for_status()
    return json.loads(response.text)

In [42]:
# define function to parse product info
def parse_product(prod, parent_category):
    """Flatten one product record into the fields kept by the scraper.

    Args:
        prod: Product mapping. Must contain ``id``, ``name``, ``priceLabel``,
            ``clickUrl``, ``categories`` (iterable of mappings with an ``id``)
            and ``image -> sizes -> Original -> url``. ``salePriceLabel`` is
            optional.
        parent_category: Label stored unchanged under ``parent_category``.

    Returns:
        dict with the keys ``id``, ``product_name``, ``price``, ``saleprice``
        (``None`` when the product has no sale price), ``image_url``,
        ``categories`` (list of category ids), ``parent_category`` and ``url``.

    Raises:
        KeyError: If a required field is missing.
    """
    pro_id = prod['id']
    name = prod['name']
    image_url = prod['image']['sizes']['Original']['url']
    price = prod['priceLabel']
    shopping_link = prod['clickUrl']
    # Only the sale price is optional; .get avoids a catch-all except that
    # would also hide unrelated errors.
    saleprice = prod.get('salePriceLabel')
    categories = [cat['id'] for cat in prod['categories']]
    return {
        'id': pro_id,
        'product_name': name,
        'price': price,
        'saleprice': saleprice,
        'image_url': image_url,
        'categories': categories,
        'parent_category': parent_category,
        'url': shopping_link,
    }

In [43]:
# define function to get response from API
def get_responses(url, offset, product_type):
    """Return the decoded API response for the page of ``url`` at ``offset``.

    The offset is appended to ``url`` as an ``&offset=`` query parameter.
    ``product_type`` is accepted but not used by this function.
    """
    return get_api_json(url + '&offset=' + str(offset))

In [44]:
# define function to paginate products
def paginate_shopstyle_products(url, offset, product_type):
    """Fetch the result page at ``offset`` and return its parsed products.

    The offset is appended to ``url`` as an ``&offset=`` query parameter; each
    entry under the response's ``products`` key is passed through
    ``parse_product`` with ``product_type`` as the parent category.
    """
    page = get_api_json(url + '&offset=' + str(offset))
    parsed = []
    for raw_product in page['products']:
        parsed.append(parse_product(raw_product, product_type))
    return parsed

In [61]:
# define function to store scraped product
# Search URL per product type for the training scrape (each query caps the
# price with max-price and asks for pages of 50 via limit=50).
_TRAIN_API_URLS = {
    'dress': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=dress&day-dresses&max-price=100&r=amazon.com-us,and-other-stories-us,banana-republic-us,bergdorf-goodman-us,bloomingdales-us,boohoo-us,calvin-klein-us,century-21-us,handm-us,hautelook-us,j.crew-us,jcpenney-us,last-call-by-neiman-marcus-us,loft-us,lord-and-taylor-us,lucky-brand-us,macys-us,mango-us,missguided-us,nasty-gal-us,need-supply-co.-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us,off-5th-us,revolve-us,rue-la-la-us,saks-fifth-avenue-us,shopbop.com-us,the-outnet.com-us,zappos-us&sort=Favorite&limit=50',
    'jeans': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+jeans&max-price=80&r=bloomingdales-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us&sort=Popular&limit=50',
    'jacket': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+jacket&max-price=250&r=bloomingdales-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us&sort=Popular&limit=50',
    'outerwear': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+outerwear&max-price=500&r=bloomingdales-us,handm-us,jcpenney-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,mango-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us,off-5th-us,rue-la-la-us,therealreal-us&sort=Popular&limit=50',
    'pants': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+pants&max-price=80&r=bloomingdales-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us&sort=Popular&limit=50',
    'shorts': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+shorts&max-price=80&r=bloomingdales-us,jcpenney-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,mango-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us,off-5th-us,rue-la-la-us&sort=Popular&limit=50',
    'skirts': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=skirts&max-price=80&r=bloomingdales-us,jcpenney-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,mango-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us,off-5th-us,rue-la-la-us&sort=Popular&limit=50',
    'sweaters': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+sweater&max-price=80&r=bloomingdales-us,jcpenney-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,mango-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us,off-5th-us,rue-la-la-us&sort=Popular&limit=50',
    'sweatshirt': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+sweatshirts&max-price=80&r=bloomingdales-us,jcpenney-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,mango-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us,off-5th-us,rue-la-la-us&sort=Popular&limit=50',
    'tops': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+tops&max-price=50&r=bloomingdales-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,nordstrom-rack-us,nordstrom-us&sort=Popular&limit=50',
    'handbags': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=handbags&max-price=80&r=6pm.com-us,amazon.com-us,bloomingdales-us,last-call-by-neiman-marcus-us,macys-us,nordstrom-rack-us,nordstrom-us&sort=Favorite&limit=50',
}


def store_shopstyle_products(product_type, start_offset=5050, stop_offset=10000):
    """Scrape result pages for ``product_type`` and pickle what was fetched.

    Pages are requested at offsets ``start_offset, start_offset + 50, ...``
    strictly below ``stop_offset``. Two files are written to the current
    directory: ``<product_type>.pickle`` (list of parsed product dicts) and
    ``<product_type>_response.pickle`` (list of raw page responses, one entry
    per page).

    Args:
        product_type: One of the keys of ``_TRAIN_API_URLS``.
        start_offset: Offset of the first page to request.
        stop_offset: Exclusive upper bound for the page offset.

    Returns:
        The list of parsed product dicts.

    Raises:
        ValueError: If ``product_type`` has no configured search URL.
    """
    try:
        api_url = _TRAIN_API_URLS[product_type]
    except KeyError:
        raise ValueError(
            'unknown product_type: {!r}'.format(product_type)) from None

    product_list = []
    response_list = []
    # The step matches the limit=50 page size baked into the query URLs.
    for offset in range(start_offset, stop_offset, 50):
        print('Getting {} batch {}'.format(product_type, offset))
        # One request per page: the parsed products and the archived raw
        # response both come from the same payload.
        response = get_responses(api_url, offset, product_type)
        product_list.extend(
            parse_product(prod, product_type) for prod in response['products'])
        response_list.append(response)

    pickle_name = str(product_type) + '.pickle'
    response_name = str(product_type) + '_response.pickle'
    with open(pickle_name, 'wb') as handle:
        pickle.dump(product_list, handle)
    with open(response_name, 'wb') as handle:
        pickle.dump(response_list, handle)
    return product_list

In [None]:
# Scrape the 'dress' category; the returned parsed products are kept so their
# images can be downloaded in a later cell.
dress_new_2=store_shopstyle_products('dress')

In [56]:
# define functions to save images
def get_img(product, dest_dir='./data/train/dress/'):
    """Download a product's image to ``<dest_dir>/<id>.jpg``.

    Args:
        product: Mapping with an ``id`` and an ``image_url`` entry.
        dest_dir: Directory the image is written to; created if missing. The
            default keeps the dress training directory used so far.

    Returns:
        The path the image was written to.

    Raises:
        KeyError: If ``product`` lacks ``id`` or ``image_url``.
        urllib.error.URLError: If the image cannot be retrieved.
    """
    os.makedirs(dest_dir, exist_ok=True)
    path = os.path.join(dest_dir, str(product['id']) + '.jpg')
    url = product['image_url']
    print('Getting ', str(product['id']), ': ', path, ', ', url)
    urllib.request.urlretrieve(url, path)
    return path
    
def save_shopstyle_products(product_type, existing_ids=None):
    """Download the image of every product that is not already known.

    Args:
        product_type: Iterable of parsed product dicts (each with an ``id``).
            Despite the name this is the product list, not a category label.
        existing_ids: Ids to skip. When omitted, the module-level
            ``dress_list`` is used.

    A failed download is reported and skipped so one bad image does not stop
    the rest of the batch.
    """
    if existing_ids is None:
        existing_ids = dress_list
    # Build the lookup once; list membership would rescan it for every product.
    known_ids = set(existing_ids)
    for product in product_type:
        if product['id'] in known_ids:
            print('already exist')
            continue
        try:
            get_img(product)
        except Exception as exc:
            # Best-effort download, but say which product failed and why.
            # KeyboardInterrupt is deliberately not caught so a long run can
            # still be stopped.
            print('error', product['id'], exc)

In [None]:
# Download images for the dress products scraped into dress_new_2.
save_shopstyle_products(dress_new_2)

In [2]:
# define function to store test images
# Search URL per product type for the test scrape (each query sets a price
# floor with min-price and asks for pages of 50 via limit=50).
_TEST_API_URLS = {
    'dress': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=dress&min-price=80&r=bloomingdales-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us&sort=Popular&limit=50',
    'jeans': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+jeans&min-price=80&r=bloomingdales-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us&sort=Popular&limit=50',
    'jacket': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+jacket&min-price=250&r=bloomingdales-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us&sort=Popular&limit=50',
    'outerwear': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+outerwear&min-price=500&r=bloomingdales-us,handm-us,jcpenney-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,mango-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us,off-5th-us,rue-la-la-us,therealreal-us&sort=Popular&limit=50',
    'pants': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+pants&min-price=80&r=bloomingdales-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us&sort=Popular&limit=50',
    'shorts': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+shorts&min-price=80&r=bloomingdales-us,jcpenney-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,mango-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us,off-5th-us,rue-la-la-us&sort=Popular&limit=50',
    'skirts': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=skirts&min-price=80&r=bloomingdales-us,jcpenney-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,mango-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us,off-5th-us,rue-la-la-us&sort=Popular&limit=50',
    'sweaters': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+sweater&min-price=80&r=bloomingdales-us,jcpenney-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,mango-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us,off-5th-us,rue-la-la-us&sort=Popular&limit=50',
    'sweatshirt': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+sweatshirts&min-price=80&r=bloomingdales-us,jcpenney-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,mango-us,neiman-marcus-us,nordstrom-rack-us,nordstrom-us,off-5th-us,rue-la-la-us&sort=Popular&limit=50',
    'tops': 'http://api.shopstyle.com/api/v2/products?pid=shopstyle&fts=women+tops&min-price=50&r=bloomingdales-us,last-call-by-neiman-marcus-us,lord-and-taylor-us,macys-us,nordstrom-rack-us,nordstrom-us&sort=Popular&limit=50',
}


def store_shopstyle_products_test(product_type, start_offset=0, stop_offset=1000):
    """Scrape test-set result pages for ``product_type`` and pickle them.

    Pages are requested at offsets ``start_offset, start_offset + 50, ...``
    strictly below ``stop_offset`` (by default 0 through 950). Two files are
    written to the current directory: ``<product_type>_test.pickle`` (list of
    parsed product dicts) and ``<product_type>_response_test.pickle`` (list of
    raw page responses, one entry per page).

    Args:
        product_type: One of the keys of ``_TEST_API_URLS``.
        start_offset: Offset of the first page to request.
        stop_offset: Exclusive upper bound for the page offset.

    Returns:
        The list of parsed product dicts.

    Raises:
        ValueError: If ``product_type`` has no configured search URL.
    """
    try:
        api_url = _TEST_API_URLS[product_type]
    except KeyError:
        raise ValueError(
            'unknown product_type: {!r}'.format(product_type)) from None

    product_list = []
    response_list = []
    # The step matches the limit=50 page size baked into the query URLs.
    for offset in range(start_offset, stop_offset, 50):
        print('Getting {} batch {}'.format(product_type, offset))
        # One request per page: the parsed products and the archived raw
        # response both come from the same payload.
        response = get_responses(api_url, offset, product_type)
        product_list.extend(
            parse_product(prod, product_type) for prod in response['products'])
        response_list.append(response)

    pickle_name = str(product_type) + '_test.pickle'
    response_name = str(product_type) + '_response_test.pickle'
    with open(pickle_name, 'wb') as handle:
        pickle.dump(product_list, handle)
    with open(response_name, 'wb') as handle:
        pickle.dump(response_list, handle)
    return product_list

In [None]:
# Scrape the test-set pages for the 'shorts' category.
shorts_test=store_shopstyle_products_test('shorts')

In [19]:
# IPython shell escape: capture the file names listed under data/train/dress.
train_dress_list=!ls data/train/dress
# Keep the part of each file name before the first '.', converted to int.
train_dress_list=[int(i.split('.')[0]) for i in train_dress_list]

In [158]:
# Load the pickled training metadata.
# NOTE(review): the variable is named train_dress but the file read is
# sweatshirt.pickle — confirm which category is intended.
train_dress=pd.read_pickle('metadata/train_meta/sweatshirt.pickle')

In [159]:
# Reduce the loaded metadata to the list of values in its 'id' column.
train_dress=list(pd.DataFrame(train_dress)['id'])

In [160]:
# Ids listed on disk (train_dress_list) that have no entry in the loaded
# metadata ids (train_dress). The set makes each membership test O(1) instead
# of rescanning the metadata list for every id.
_known_meta_ids = set(train_dress)
need_scrape = [i for i in train_dress_list if i not in _known_meta_ids]

In [81]:
def find_parse_list(parent_category):
    """Return the integer ids of the files in ``data/train/<parent_category>``.

    Each file name is expected to look like ``<integer id>.<extension>``; the
    part before the first ``.`` is converted to ``int``.

    Args:
        parent_category: Name of the sub-directory of ``data/train`` to list.

    Returns:
        List of ids, ordered by file name.

    Raises:
        FileNotFoundError: If the category directory does not exist.
        ValueError: If a visible file name does not start with an integer.
    """
    directory = os.path.join('data', 'train', parent_category)
    # Same view a plain `ls` gives: sorted by name, hidden dotfiles left out.
    names = sorted(
        name for name in os.listdir(directory) if not name.startswith('.'))
    return [int(name.split('.')[0]) for name in names]

In [None]:
# Ids reported by find_parse_list when called for 'tops'.
train_tops_list=find_parse_list('tops')

In [45]:
def parse_product_new(parent_category, need_scrape):
    """Fetch each product id individually and pickle the parsed results.

    For every id in ``need_scrape`` the single-product endpoint is queried and
    the response is flattened with ``parse_product``. Two files are written to
    the current directory: ``<parent_category>.pickle`` (parsed info) and
    ``<parent_category>_response_train.pickle`` (raw responses).

    Args:
        parent_category: Label stored on each parsed product and used as the
            file-name prefix.
        need_scrape: Iterable of product ids to fetch.

    Returns:
        The list of parsed product dicts, in the order of ``need_scrape``.
    """
    response_list = []
    need_scrape_info = []
    for product_id in need_scrape:
        print(product_id)
        url = ('http://api.shopstyle.com/api/v2/products/'
               + str(product_id) + '?pid=shopstyle')
        prod = get_api_json(url)
        # Same field extraction as the paginated scrape, so both paths stay
        # in sync.
        need_scrape_info.append(parse_product(prod, parent_category))
        response_list.append(prod)

    # NOTE(review): '<category>.pickle' is also the name written by
    # store_shopstyle_products, so running both for one category overwrites
    # the earlier file — confirm that is intended.
    pickle_name = parent_category + '.pickle'
    with open(pickle_name, 'wb') as handle:
        pickle.dump(need_scrape_info, handle)
    response_name = parent_category + '_response_train.pickle'
    with open(response_name, 'wb') as handle:
        pickle.dump(response_list, handle)
    return need_scrape_info

In [None]:
# Fetch and pickle metadata for the sweatshirt ids.
# NOTE(review): train_sweatshirt_list is not defined in the cells shown here —
# confirm where it is created before running this cell.
parse_product_new('sweatshirt',train_sweatshirt_list)

In [None]:
def parse_product(prod, parent_category):
    """Flatten one product record into the fields kept by the scraper.

    Args:
        prod: Product mapping. Must contain ``id``, ``name``, ``priceLabel``,
            ``clickUrl``, ``categories`` (iterable of mappings with an ``id``)
            and ``image -> sizes -> Original -> url``. ``salePriceLabel`` is
            optional.
        parent_category: Label stored unchanged under ``parent_category``.

    Returns:
        dict with the keys ``id``, ``product_name``, ``price``, ``saleprice``
        (``None`` when the product has no sale price), ``image_url``,
        ``categories`` (list of category ids), ``parent_category`` and ``url``.

    Raises:
        KeyError: If a required field is missing.
    """
    pro_id = prod['id']
    name = prod['name']
    image_url = prod['image']['sizes']['Original']['url']
    price = prod['priceLabel']
    shopping_link = prod['clickUrl']
    # Only the sale price is optional; .get avoids a catch-all except that
    # would also hide unrelated errors.
    saleprice = prod.get('salePriceLabel')
    categories = [cat['id'] for cat in prod['categories']]
    return {
        'id': pro_id,
        'product_name': name,
        'price': price,
        'saleprice': saleprice,
        'image_url': image_url,
        'categories': categories,
        'parent_category': parent_category,
        # The 'url' entry had no value (a syntax error); the click-through
        # link read above is the value it was missing.
        'url': shopping_link,
    }

In [2]:
## Find sold-out products: start by loading the pickled product metadata.
df_meta=pd.read_pickle('/Users/Minmin/ds/styleflask/models/df_meta.pickle')

In [10]:
# Rows with parent_category 'dress' whose final_price is the literal string
# 'Sold Out'.
oos=df_meta.loc[(df_meta['parent_category']=='dress')&(df_meta['final_price']=='Sold Out')]

In [14]:
# Ids of the sold-out dress rows.
oos_list=list(oos['id'])

In [23]:
# Number of sold-out dress ids.
len(oos_list)

1275

In [18]:
# Load the pickled dress feature table.
# presumably indexed by product id (the index is filtered against oos_list
# in a later cell) — verify.
df_dress=pd.read_pickle('/Users/Minmin/ds/styleflask/models/dress_features.pickle')

In [22]:
# Inspect the table's (rows, columns) dimensions.
df_dress.shape

(5049, 4096)

In [25]:
# Keep only the rows whose index value is not a sold-out id.
df_dress=df_dress.loc[~df_dress.index.isin(oos_list)]

In [29]:
# Index values of the remaining rows; save_shopstyle_products reads this
# global to decide which products to skip.
dress_list=list(df_dress.index)

In [39]:
# Delete the image file of every sold-out dress id; report ids whose file is
# not on disk.
for i in oos_list:
    filename = '/Users/Minmin/ds/shopstyle/data/train/dress/' + str(i) + '.jpg'
    if not os.path.isfile(filename):
        # Nothing to delete for this id.
        print("Error: %s file not found" % filename)
        continue
    os.remove(filename)