# iMaterialist_Kaggle_Competition

https://www.kaggle.com/c/imaterialist-challenge-fashion-2018

In [None]:
import json
import sys, os, multiprocessing, urllib3, csv
from PIL import Image
from io import BytesIO
import pandas as pd

The dataset is provided as a list of URLs, so the first step is to download all the images.

#### Downloading validation images

In [None]:
# Directory holding the competition's JSON files; validation.json and test.json are read from here.
PATH = "/media/james/HDD/Data_sets/iMaterialist/jsonfiles/"

In [None]:
df = pd.read_json(f"{PATH}validation.json")

In [None]:
def anno_checker(anno):
    """Flag an annotation dict that carries keys other than the expected ones.

    Args:
        anno: annotation mapping; only 'labelId' and 'imageId' are expected.

    Returns:
        1 if any unexpected key is present, otherwise 0, so the results can
        be summed to count the offending annotations.
    """
    known = {'labelId', 'imageId'}
    # Always return an int: an implicit None would make the column's sum
    # depend on how pandas treats missing values.
    return int(any(key not in known for key in anno))

In [None]:
df.annotations.apply(anno_checker).sum() #annotations only contain ids and labels

In [None]:
def anno_puller(anno):
    """Return the annotation's 'labelId' entries converted to a list of ints."""
    label_ids = anno['labelId']
    return list(map(int, label_ids))

In [None]:
# Flatten the nested annotation/image dicts into plain columns.
df['annos'] = df['annotations'].apply(anno_puller)
df['annos_id'] = df['annotations'].apply(lambda anno: anno['imageId'])
df['image_id'] = df['images'].apply(lambda img: img['imageId'])
df['image_url'] = df['images'].apply(lambda img: img['url'])

In [None]:
df.head()

In [None]:
(df.annos_id == df.image_id).all() #all image urls and labels match

In [None]:
# The nested source columns and the duplicate id are no longer needed.
df.drop(columns=['annotations', 'images', 'annos_id'], inplace=True)

In [None]:
df.head()

In [None]:
# Build the label table for fastai: one row per image, with the label ids
# written as a single space-separated string.
labelcsv = df[['image_id', 'annos']].copy()
# Join the ids directly rather than stringifying the whole list and then
# stripping its brackets and commas.
labelcsv['annos'] = labelcsv['annos'].apply(
    lambda labels: " ".join(str(label) for label in labels)
)

In [None]:
labelcsv.head()

In [None]:
# Write the label table without an index column or header row.
# NOTE(review): the file is named train_labels.csv although these rows were built from validation.json - confirm this is intended.
labelcsv.to_csv("/media/james/HDD/Data_sets/iMaterialist/train_labels.csv",index=False, header=False)

In [None]:
# Output directory for the downloaded images; DownloadImage joins each file name onto this global.
save_dir = "/media/james/HDD/Data_sets/iMaterialist/"

In [None]:
# Shared urllib3 connection manager that DownloadImage uses for every request.
# The positional argument is num_pools: how many per-host connection pools are kept cached.
client = urllib3.PoolManager(500)
# Silence urllib3's InsecureRequestWarning messages.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
def DownloadImage(img_id, url):
    """Download one image and store it as an RGB JPEG inside ``save_dir``.

    The file is written to ``<save_dir>/<img_id>.jpg``; nothing is done when
    that file already exists. Every failure is reported with a printed
    message and the function returns without raising, so a single bad image
    cannot abort a whole batch.

    Args:
        img_id: identifier used as the output file name.
        url: address the image is fetched from.

    Returns:
        None.
    """
    fname = os.path.join(save_dir, f'{img_id}.jpg')

    if os.path.exists(fname):
        return

    # Each stage returns on failure: carrying on would only fail again on the
    # missing intermediate value and print misleading follow-up messages.
    # `except Exception` (not a bare except) lets KeyboardInterrupt and
    # SystemExit still stop a worker.
    try:
        response = client.request('GET', url)
        img = response.data
    except Exception:
        print(f'download failed: {img_id}')
        return

    # An error page is not an image; report it as a download problem.
    if not 200 <= response.status < 300:
        print(f'download failed: {img_id}')
        return

    try:
        pil_img = Image.open(BytesIO(img))
    except Exception:
        print(f'PIL failed: {img_id}')
        return

    try:
        img_rgb = pil_img.convert('RGB')
    except Exception:
        print(f'RGB failed: {img_id}')
        return

    try:
        img_rgb.save(fname, format='JPEG', quality=90)
    except Exception:
        print(f'save failed: {img_id}')
        # Remove a partially written file so that a rerun retries this image
        # instead of skipping it because the path already exists.
        try:
            if os.path.exists(fname):
                os.remove(fname)
        except OSError:
            pass

In [None]:
# Pair every image id with its URL: the (img_id, url) argument tuples for DownloadImage.
iterlist = [(row.image_id, row.image_url) for row in df.itertuples()]

In [None]:
# Fetch all validation images with 12 worker processes. The context manager
# shuts the pool down afterwards instead of leaving the workers alive.
# starmap blocks until every download has finished, so the counts below are
# printed only once the whole batch is done.
i = 0

with multiprocessing.Pool(processes=12) as pool:
    for i, _ in enumerate(pool.starmap(DownloadImage, iterlist), start=1):
        if i % 1000 == 0:
            print(i)

####  Download the test image set

In [None]:
df = pd.read_json(f"{PATH}test.json")

In [None]:
df.head()

In [None]:
# Keep only the id (as a string) and the URL of each test image.
df = pd.DataFrame({
    'image_id': df['images'].apply(lambda img: str(img['imageId'])),
    'image_url': df['images'].apply(lambda img: img['url']),
})

In [None]:
df.head()

In [None]:
# (img_id, url) argument tuples for DownloadImage, one per test image.
iterlist = [(row.image_id, row.image_url) for row in df.itertuples()]

In [None]:
# Point the global read by DownloadImage at the test-image directory before starting the downloads.
save_dir = "/media/james/HDD/Data_sets/iMaterialist/test"

In [None]:
len(iterlist)

In [None]:
# Fetch all test images with 12 worker processes; the context manager shuts
# the pool down when the batch is finished instead of leaking the workers.
# starmap returns only after every download is done, so the counts are
# printed afterwards.
i = 0

with multiprocessing.Pool(processes=12) as pool:
    for i, _ in enumerate(pool.starmap(DownloadImage, iterlist), start=1):
        if i % 1000 == 0:
            print(i)

##### Downloading training images ~1 million (will take a long time)

The train file has metadata at its beginning that prevents pandas from reading it directly, so it is loaded with the `json` module instead.

In [None]:
# Load train.json with the json module and build the frame from its
# 'annotations' and 'images' lists.
file = '/media/james/HDD/Data_sets/iMaterialist/train.json'

# Use a context manager so the file handle is closed after parsing.
with open(file, encoding='utf-8') as f:
    js = json.load(f)

labeldf = pd.DataFrame(js['annotations'])
imagedf = pd.DataFrame(js['images'])
# Join on the shared id explicitly and rename by column name: assigning
# df.columns positionally mislabels the data whenever the key order in the
# JSON (and therefore the column order) is not the assumed one.
# NOTE(review): assumes the train records use the same 'imageId' / 'url' /
# 'labelId' keys as validation.json - confirm.
df = imagedf.merge(labeldf, on='imageId')
df = df.rename(columns={'url': 'image_url', 'labelId': 'labels'})
df = df[['imageId', 'image_url', 'labels']]
df.head()

In [None]:
df.shape

In [None]:
# Point the global read by DownloadImage at the training-image directory before starting the downloads.
save_dir = "/media/james/HDD/Data_sets/iMaterialist/train"

In [None]:
# (img_id, url) argument tuples for DownloadImage, one per training image.
iterlist = [(row.imageId, row.image_url) for row in df.itertuples()]

In [None]:
# Fetch all training images with 12 worker processes; the context manager
# shuts the pool down when the batch is finished instead of leaking the
# workers. starmap returns only after every download is done, so the counts
# are printed afterwards.
i = 0

with multiprocessing.Pool(processes=12) as pool:
    for i, _ in enumerate(pool.starmap(DownloadImage, iterlist), start=1):
        if i % 100000 == 0:
            print(i)