In [1]:
import os
import urllib
import urllib.error
import urllib.request

import numpy as np
import pandas as pd

In [2]:
# Load the processed crowd data pickle (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
crowd_data = pd.read_pickle('../data/processed_data_crowd.pkl')

In [3]:
crowd_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 393022 entries, 489 to 222446005
Data columns (total 4 columns):
project_id    393022 non-null int32
src           393022 non-null object
attribute     393022 non-null category
label         393022 non-null category
dtypes: category(2), int32(1), object(1)
memory usage: 8.2+ MB


In [4]:
crowd_data.head()

Unnamed: 0_level_0,project_id,src,attribute,label
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
489,40030,https://mir-s3-cdn-cf.behance.net/project_modu...,emotion_peaceful,positive
1053,40041,https://mir-s3-cdn-cf.behance.net/project_modu...,emotion_peaceful,unsure
1065,40041,https://mir-s3-cdn-cf.behance.net/project_modu...,emotion_gloomy,unsure
1067,40043,https://mir-s3-cdn-cf.behance.net/project_modu...,emotion_peaceful,unsure
1247,40053,https://mir-s3-cdn-cf.behance.net/project_modu...,media_oilpaint,negative


In [5]:
crowd_data['attribute'].unique()

[emotion_peaceful, emotion_gloomy, media_oilpaint, emotion_scary, media_pen_ink, ..., content_people, content_cars, content_tree, content_cat, content_flower]
Length: 20
Categories (20, object): [emotion_peaceful, emotion_gloomy, media_oilpaint, emotion_scary, ..., content_cars, content_tree, content_cat, content_flower]

In [6]:
# Derive each row's attribute class: the text before the first underscore
# of the attribute name (e.g. "emotion_peaceful" -> "emotion").
classes = crowd_data['attribute'].apply(lambda x: x.split('_')[0])
classes.head(10)

mid
489     emotion
1053    emotion
1065    emotion
1067    emotion
1247      media
1248    emotion
1559    emotion
1593      media
2240    emotion
2240    emotion
Name: attribute, dtype: object

In [7]:
def make_class_dir(df):
    """
    Make the appropriate directories for storing images.

    For every distinct value of ``df['attribute']`` the tree
    ``../data/images/<attribute>/<set>/<label>`` is created, with ``<set>`` in
    ``train`` / ``valid`` / ``test`` and ``<label>`` in ``positive`` /
    ``unsure`` / ``negative``.  The call is idempotent: directories that
    already exist are left untouched and any missing ones are added, so a
    partially created tree is completed on the next run.

    :param df: dataframe with an ``attribute`` column
    :type  df: pandas.core.frame.DataFrame
    """
    base_dir = "../data/images"

    for attribute in df['attribute'].unique():
        for subdir in ['train', 'valid', 'test']:
            for subdir2 in ['positive', 'unsure', 'negative']:
                # makedirs builds the intermediate levels too; exist_ok makes
                # re-runs safe even when the attribute directory already exists.
                os.makedirs(
                    os.path.join(base_dir, attribute, subdir, subdir2),
                    exist_ok=True,
                )
# list (crowd_data['attribute'].unique())

#make_class_dir(crowd_data)

In [8]:
def one_dir_download(df, attribute, max_per_split=500):
    """
    For one directory, download images corresponding to train, validate, test.
    Download it in folders corresponding to its labels.

    The rows of ``df`` matching ``attribute`` are shuffled, split 60% / 20% /
    20% and each split is capped at ``max_per_split`` rows.  Each image is
    saved as ``../data/images/<attribute>/<set>/<label>/<index>.png`` where
    ``<set>`` is ``train``, ``valid`` or ``test``.  The ``.png`` suffix is
    used regardless of the extension in the source URL.  The target
    directories must already exist.

    :param df: dataframe with ``attribute``, ``label`` and ``src`` columns
    :type  df: pandas.core.frame.DataFrame
    :param attribute: the attribute
    :type  attribute: str
    :param max_per_split: maximum number of rows kept per split;
                          ``None`` keeps every row
    :type  max_per_split: int or None
    :return: the (capped) train, validate and test dataframes
    :rtype:  tuple
    """
    rows = df[df['attribute'] == attribute]

    # 60%, 20%, 20% split -- positional slicing of the shuffled rows avoids
    # np.split, which relies on the deprecated DataFrame.swapaxes.
    shuffled = rows.sample(frac=1)
    train_end = int(.6 * len(shuffled))
    valid_end = int(.8 * len(shuffled))

    # small amount for now
    train = shuffled.iloc[:train_end].iloc[:max_per_split]
    validate = shuffled.iloc[train_end:valid_end].iloc[:max_per_split]
    test = shuffled.iloc[valid_end:].iloc[:max_per_split]

    # Pair each folder name with its own split so 'valid' can no longer fall
    # through to the test data.
    named_sets = [('train', train), ('valid', validate), ('test', test)]
    known_labels = ("positive", "negative", "unsure")

    for set_name, set_df in named_sets:
        directory = os.path.join("../data/images", attribute, set_name)

        for index, row in set_df.iterrows():
            label = row['label']
            if label not in known_labels:
                continue
            target = os.path.join(directory, label, str(index) + '.png')
            try:
                urllib.request.urlretrieve(row['src'], target)
            except urllib.error.URLError as err:
                # Best effort: HTTPError is a URLError subclass, so broken
                # links and unreachable hosts are both skipped, but reported.
                print("skipping {}: {}".format(row['src'], err))

    return train, validate, test
        
    
# train, validate, test = one_dir_download(crowd_data, "media_watercolor")

In [100]:
# NOTE(review): `train` is not assigned by any active cell above -- it only
# exists if the commented-out one_dir_download(...) call has been run.
train.head()

Unnamed: 0_level_0,project_id,src,attribute,label
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
155171893,23374817,https://mir-s3-cdn-cf.behance.net/project_modu...,media_vectorart,unsure
86585233,12056105,https://mir-s3-cdn-cf.behance.net/project_modu...,media_vectorart,unsure
105322677,15349193,https://mir-s3-cdn-cf.behance.net/project_modu...,media_vectorart,negative
140901477,20991759,https://mir-s3-cdn-cf.behance.net/project_modu...,media_vectorart,negative
59318129,7805767,https://mir-s3-cdn-cf.behance.net/project_modu...,media_vectorart,positive


In [102]:
def download_images(crowd_data):
    """
    Download all wanted images.

    Only rows whose attribute name starts with the ``media`` class prefix
    (the text before the first underscore) are kept.  The directory tree for
    those attributes is created, then the images of each attribute are
    downloaded in turn.

    :param crowd_data: dataframe with an ``attribute`` column
    :type  crowd_data: pandas.core.frame.DataFrame
    """
    # Class prefix of every attribute name, e.g. "media_oilpaint" -> "media".
    prefixes = crowd_data['attribute'].apply(lambda name: name.split('_')[0])

    # Keep the media attributes only.
    media_rows = crowd_data.loc[prefixes == "media"]
    media_attributes = list(media_rows['attribute'].unique())

    make_class_dir(media_rows)

    for media_attribute in media_attributes:
        one_dir_download(media_rows, media_attribute)
        
#     return train

In [103]:
# download_images(crowd_data)

https://mir-s3-cdn-cf.behance.net/project_modules/disp/a103c43528306.5601b1ca6531c.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/8b06a920603667.562ee1189d197.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/64b8f2564657.56009eae22ab1.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/63c7c724874275.56343ba4da9fc.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/c05c4e12265881.56265ab74a98d.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/1d22bd16959159.562b3add0ed02.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/fd39a49887437.560dbe5ca835e.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/474cc51917009.56011038008da.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/0e43322528404.56017620a5559.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/e1267937628107.56066fe0efc38.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/1d22bd16959159.562b3add0ed02.jpg
https://mir-s3-cdn-cf.behance.net/proj

https://mir-s3-cdn-cf.behance.net/project_modules/disp/7ed4f124570831.563379b4175d0.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/399d5619965997.562e3464ac849.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/8e8bae21285161.562fe9d6cfbca.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/8b902912626686.560338e9227a0.png
https://mir-s3-cdn-cf.behance.net/project_modules/disp/81fd4419113747.562d526bce5b9.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/3ae69822255311.5630f3dc53060.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/399d5619965997.562e3464ac849.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/8e8bae21285161.562fe9d6cfbca.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/8b902912626686.560338e9227a0.png
https://mir-s3-cdn-cf.behance.net/project_modules/disp/81fd4419113747.562d526bce5b9.jpg
https://mir-s3-cdn-cf.behance.net/project_modules/disp/3ae69822255311.5630f3dc53060.jpg


Unnamed: 0_level_0,project_id,src,attribute,label
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
49622573,6398337,https://mir-s3-cdn-cf.behance.net/project_modu...,media_vectorart,negative
62269521,8260897,https://mir-s3-cdn-cf.behance.net/project_modu...,media_vectorart,negative
11903308,1677700,https://mir-s3-cdn-cf.behance.net/project_modu...,media_vectorart,unsure
125231709,18494525,https://mir-s3-cdn-cf.behance.net/project_modu...,media_vectorart,negative
162703363,24570831,https://mir-s3-cdn-cf.behance.net/project_modu...,media_vectorart,negative


In [38]:
# rm_content = crowd_data.loc[classes != "content"]
# rm_content.loc[rm_content["label"] == "positive"]