In [1]:
import pandas as pd
import glob
import requests
import shutil
from pandarallel import pandarallel

# Initialise pandarallel before any pandas work.
# NOTE(review): none of the cells visible in this notebook call a
# parallel_* method — confirm whether this initialisation is still needed.
pandarallel.initialize()

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Read the first photos file as a tab-separated table for a quick look at its schema.
df = pd.read_csv('../DATA/unsplash/photos.tsv000', sep='\t')

In [3]:
# List the column names available in the photos table.
df.columns

Index(['photo_id', 'photo_url', 'photo_image_url', 'photo_submitted_at',
       'photo_featured', 'photo_width', 'photo_height', 'photo_aspect_ratio',
       'photo_description', 'photographer_username', 'photographer_first_name',
       'photographer_last_name', 'exif_camera_make', 'exif_camera_model',
       'exif_iso', 'exif_aperture_value', 'exif_focal_length',
       'exif_exposure_time', 'photo_location_name', 'photo_location_latitude',
       'photo_location_longitude', 'photo_location_country',
       'photo_location_city', 'stats_views', 'stats_downloads',
       'ai_description', 'ai_primary_landmark_name',
       'ai_primary_landmark_latitude', 'ai_primary_landmark_longitude',
       'ai_primary_landmark_confidence', 'blur_hash'],
      dtype='object')

In [4]:
# Keep only the id, image URL, dimensions and the two description columns
# (human-written `photo_description` and `ai_description`).
df_imp = df[['photo_id', 'photo_image_url', 'photo_width', 'photo_height', 'photo_description', 'ai_description']]

In [5]:
# Display the reduced photos frame (pandas truncates the rendered rows).
df_imp

Unnamed: 0,photo_id,photo_image_url,photo_width,photo_height,photo_description,ai_description
0,XMyPniM9LF0,https://images.unsplash.com/uploads/1411949294...,4272,2848,Woman exploring a forest,woman walking in the middle of forest
1,rDLBArZUl1c,https://images.unsplash.com/photo-141633941111...,3000,4000,Succulents in a terrarium,succulent plants in clear glass terrarium
2,cNDGZ2sQ3Bo,https://images.unsplash.com/photo-142014251503...,2564,1710,Rural winter mountainside,rocky mountain under gray sky at daytime
3,iuZ_D1eoq9k,https://images.unsplash.com/photo-141487280988...,2912,4368,Poppy seeds and flowers,red common poppy flower selective focus phography
4,BeD3vjQ8SI0,https://images.unsplash.com/photo-141700759404...,4896,3264,Silhouette near dark trees,trees during night time
...,...,...,...,...,...,...
24995,c7OrOMxrurA,https://images.unsplash.com/photo-159300793778...,4160,6240,,black metal fence during daytime
24996,15IuQ5a0Qwg,https://images.unsplash.com/photo-159296761254...,6000,4000,Pearl earrings and seashells,white and brown seashell on white surface
24997,w8nrcXz8pwk,https://images.unsplash.com/photo-159299937329...,2584,4592,,leopard on brown tree trunk during daytime
24998,n1jHrRhehUI,https://images.unsplash.com/photo-159192792878...,3533,4824,Floral truck in the streets of Rome,woman in beige coat and white hat standing on ...


In [9]:
# Load the four dataset tables into `datasets`, keyed by table name.
# Every file matching "<name>.tsv*" under `path` is read as a tab-separated
# file with a header row, and the parts are stacked into one DataFrame with a
# fresh 0..n-1 index.
path = '../DATA/unsplash/'
documents = ['photos', 'keywords', 'collections', 'conversions']
datasets = {}

for doc in documents:
    pattern = path + doc + ".tsv*"
    # glob.glob returns matches in arbitrary order; sorting makes the row
    # order of the concatenated table identical on every run.
    files = sorted(glob.glob(pattern))
    if not files:
        # pd.concat on an empty list raises an opaque
        # "No objects to concatenate"; fail with the offending pattern instead.
        raise ValueError(f"No files found matching {pattern!r}")

    # Read the parts inside a comprehension so the notebook-level name `df`
    # (used by other cells for the photos table) is not rebound here.
    subsets = [pd.read_csv(filename, sep='\t', header=0) for filename in files]

    datasets[doc] = pd.concat(subsets, axis=0, ignore_index=True)

In [10]:
# Inspect the concatenated 'collections' table.
datasets['collections']

Unnamed: 0,photo_id,collection_id,collection_title,photo_collected_at
0,--2IBUMom1I,9832457,business,2020-04-04 14:26:10.506402
1,--2IBUMom1I,162470,Majestical Sunsets,2016-03-15 17:04:25.089589
2,--2IBUMom1I,4916417,PESSOAS,2019-06-01 04:20:40.076819
3,--2IBUMom1I,88441555,ROCANCOURT,2020-10-26 10:53:03.931209
4,--2IBUMom1I,2143051,Travel / Places,2018-05-22 23:20:05.898545
...,...,...,...,...
2076634,zzux2cH-F-A,136096,Asthetic,2016-02-25 04:59:54.933131
2076635,zzux2cH-F-A,10747101,Flora,2020-06-17 23:11:36.884176
2076636,zzux2cH-F-A,9835422,Holy Wood,2020-04-05 00:28:55.015276
2076637,zzux2cH-F-A,10747029,Landscapes,2020-06-17 22:59:06.831638


In [11]:
# Inspect the concatenated 'photos' table.
datasets['photos']

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,photo_location_country,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash
0,XMyPniM9LF0,https://unsplash.com/photos/XMyPniM9LF0,https://images.unsplash.com/uploads/1411949294...,2014-09-29 00:08:38.594364,t,4272,2848,1.50,Woman exploring a forest,michellespencer77,...,,,2375421,6967,woman walking in the middle of forest,,,,,L56bVcRRIWMh.gVunlS4SMbsRRxr
1,rDLBArZUl1c,https://unsplash.com/photos/rDLBArZUl1c,https://images.unsplash.com/photo-141633941111...,2014-11-18 19:36:57.08945,t,3000,4000,0.75,Succulents in a terrarium,ugmonk,...,,,13784815,82141,succulent plants in clear glass terrarium,,,,,LvI$4txu%2s:_4t6WUj]xat7RPoe
2,cNDGZ2sQ3Bo,https://unsplash.com/photos/cNDGZ2sQ3Bo,https://images.unsplash.com/photo-142014251503...,2015-01-01 20:02:02.097036,t,2564,1710,1.50,Rural winter mountainside,johnprice,...,,,1302461,3428,rocky mountain under gray sky at daytime,,,,,LhMj%NxvM{t7_4t7aeoM%2M{ozj[
3,iuZ_D1eoq9k,https://unsplash.com/photos/iuZ_D1eoq9k,https://images.unsplash.com/photo-141487280988...,2014-11-01 20:15:13.410073,t,2912,4368,0.67,Poppy seeds and flowers,krisatomic,...,,,2890238,33704,red common poppy flower selective focus phography,,,,,LSC7DirZAsX7}Br@GEWWmnoLWCnj
4,BeD3vjQ8SI0,https://unsplash.com/photos/BeD3vjQ8SI0,https://images.unsplash.com/photo-141700759404...,2014-11-26 13:13:50.134383,t,4896,3264,1.50,Silhouette near dark trees,jonaseriksson,...,,,8704860,49662,trees during night time,,,,,L25|_:V@0hxtI=W;odae0ht6=^NG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,c7OrOMxrurA,https://unsplash.com/photos/c7OrOMxrurA,https://images.unsplash.com/photo-159300793778...,2020-06-24 14:12:32.397564,t,4160,6240,0.67,,andyadcon,...,,,1603469,4757,black metal fence during daytime,,,,,L34d_aJ-I:R*tlxGWUjY1y$i$hsm
24996,15IuQ5a0Qwg,https://unsplash.com/photos/15IuQ5a0Qwg,https://images.unsplash.com/photo-159296761254...,2020-06-24 03:00:42.603563,t,6000,4000,1.50,Pearl earrings and seashells,contentpixie,...,Australia,Melbourne,550016,2544,white and brown seashell on white surface,,,,,LAM%_?_NNIH?xvRPx]kBajRPWAxv
24997,w8nrcXz8pwk,https://unsplash.com/photos/w8nrcXz8pwk,https://images.unsplash.com/photo-159299937329...,2020-06-24 11:53:00.668613,t,2584,4592,0.56,,maur1ts,...,Tansania,,500831,3923,leopard on brown tree trunk during daytime,,,,,LlK1wK00M{%MxvV@x[tRM|oyt8t7
24998,n1jHrRhehUI,https://unsplash.com/photos/n1jHrRhehUI,https://images.unsplash.com/photo-159192792878...,2020-06-12 02:13:04.409162,t,3533,4824,0.73,Floral truck in the streets of Rome,keithalva,...,Italy,Rome,335692,1734,woman in beige coat and white hat standing on ...,,,,,LOIhKfV@0J%N~WM{sT-=g4M{Mxx]


In [12]:
# Inspect the concatenated 'conversions' table.
datasets['conversions']

Unnamed: 0,converted_at,conversion_type,keyword,photo_id,anonymous_user_id,conversion_country
0,2020-07-29 00:08:04.221,download,clouds,ABmygVJcYgY,dd01ebdd-7691-4518-ab19-b2105782ae8b,VE
1,2020-07-29 00:25:23.426,download,shark,fB2jl6Rb3l4,c48ba6e0-c6a7-4a92-b569-fe57808a8a2c,QA
2,2020-07-29 00:26:13.122,download,dogs,k1hbfag2na0,62c4f043-579c-438f-8815-eb8ba3c54d34,KR
3,2020-07-29 00:37:03.308,download,astronaut,-SyUjRlHauQ,7ad6dc18-a02e-4ba2-b93c-fd7ea2e551d8,JP
4,2020-07-29 00:54:28.942,download,red roses,A0iTJUhK4es,f03a5708-32e4-4fae-8210-3c5d2632cbfb,NZ
...,...,...,...,...,...,...
12166083,2021-07-28 23:22:57.299,download,tree cut,S59dyPi00ow,a03d67e5-cc90-49e1-b5b3-f9532e44d5ff,CL
12166084,2021-07-28 23:32:24.952,download,lake,pJJUHnEOR3s,0789667b-3ed2-4c2e-9bd1-1a5cc7ecebe3,IN
12166085,2021-07-28 23:50:06.84,download,oil,217U8oxGoQ4,b8e7b3c7-fcc8-4454-abc1-7a477bdab957,US
12166086,2021-07-28 23:57:37.071,download,cat,uhnbTZC7N9k,d4f909ea-306d-43e8-b6ed-e6b5c262bf10,MX


In [13]:
# Inspect the concatenated 'keywords' table.
datasets['keywords']

Unnamed: 0,photo_id,keyword,ai_service_1_confidence,ai_service_2_confidence,suggested_by_user
0,zzux2cH-F-A,grove,99.077385,,f
1,zzux2cH-F-A,compass,26.864105,,f
2,zzux2cH-F-A,field,94.293869,,f
3,zzux2cH-F-A,jar,43.128902,,f
4,zzux2cH-F-A,flower,81.635406,,f
...,...,...,...,...,...
2634884,--2IBUMom1I,hotel,33.518658,,f
2634885,--2IBUMom1I,resort,28.783237,,f
2634886,--2IBUMom1I,light,52.600540,,f
2634887,--2IBUMom1I,light fixture,33.199734,,f


In [14]:
# Show the rows of the photos table whose photo_id is "EWDvHNNfUmQ".
datasets['photos'].loc[lambda photos: photos['photo_id'] == "EWDvHNNfUmQ"]

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,photo_location_country,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash
987,EWDvHNNfUmQ,https://unsplash.com/photos/EWDvHNNfUmQ,https://images.unsplash.com/photo-149519512935...,2017-05-19 14:10:40.496725,t,6144,4069,1.51,Fresh wood garlic on wood,goumbik,...,,,29670246,419286,green leafed vegetable on brown wooden surface,,,,,LGNm.w^Tu3?I0c?IE0xvE0%3Inof


In [15]:
# Distinct search keywords recorded in the conversions table for photo "ABmygVJcYgY".
datasets['conversions'].loc[lambda conv: conv['photo_id'] == "ABmygVJcYgY", 'keyword'].unique()

array(['clouds', 'blue sky', 'graffiti artist', 'flower cloud',
       'blue sky desert', 'flowers', 'desert flower', 'dreamy landscape',
       'graffiti desert', 'dreamy sky', 'dreamy', 'sky desert',
       'shiney sky', 'sky clouds blue', 'desert sky', 'dreamy floral',
       'painted tongue', 'graffiti', 'cloud', 'blue sky and clouds',
       'green sky', 'painted clouds', 'sky with clouds', 'sky',
       'clouds and sky', 'sky blue clouds', 'blue graffiti',
       'blue flower', 'desert art', 'blue floral', 'painted sky',
       'nature artist', 'floral rock in desert', 'dreamy clouds',
       'graffiti rocks', 'blue art', 'sky clouds', 'desert graffiti',
       'desert flowers', 'painted mountains', 'desert rocks',
       'blue florals', 'artist', 'desert blue sky', 'desert and sky',
       'painted rocks', 'rocks sky', 'mountains and blue sky',
       'sky with mountain and hills', 'graffiti landscape',
       'painted landscape', 'sky and mountain', 'cloud art',
       'desert 

In [19]:
def _unique_values_by_photo(photo_ids, lookup, column):
    """Return, for each id in ``photo_ids``, the unique ``column`` values of its rows in ``lookup``.

    Parameters
    ----------
    photo_ids : pandas.Series
        Photo ids to look up; the result shares this Series' index.
    lookup : pandas.DataFrame
        Table with a ``photo_id`` column and the column named by ``column``.
    column : str
        Name of the column in ``lookup`` whose distinct values are collected.

    Returns
    -------
    pandas.Series
        One array per photo id, holding the distinct values in order of first
        appearance; ids with no rows in ``lookup`` get an empty array.
    """
    # Group the lookup table once instead of scanning it for every photo id.
    # sort=False skips sorting the group keys, which is not needed for a lookup.
    values_by_id = lookup.groupby('photo_id', sort=False)[column].unique().to_dict()
    # Same empty result that filtering to zero rows and calling unique() gives.
    empty = lookup[column].iloc[:0].unique()

    def values_for(photo_id):
        found = values_by_id.get(photo_id)
        # Hand out a fresh empty array so rows never share one mutable object.
        return empty.copy() if found is None else found

    return photo_ids.map(values_for)


def combine(dataframe):
    """Attach per-photo keyword and collection-title arrays to the photos table.

    Parameters
    ----------
    dataframe : dict
        Mapping that holds DataFrames under the keys ``'photos'``,
        ``'collections'`` and ``'keywords'``. Each of them must have a
        ``photo_id`` column; ``'keywords'`` needs a ``keyword`` column and
        ``'collections'`` a ``collection_title`` column. A ``'conversions'``
        entry, if present, is not used.

    Returns
    -------
    pandas.DataFrame
        The same object as ``dataframe['photos']``, modified in place with two
        added columns: ``keywords`` and ``collection_title``. Each cell holds
        an array of the distinct values found for that photo (empty when the
        photo has none).
    """
    df_photos = dataframe['photos']
    df_collections = dataframe['collections']
    df_keywords = dataframe['keywords']

    # The columns are written onto the caller's photos frame, as before.
    df_photos['keywords'] = _unique_values_by_photo(df_photos.photo_id, df_keywords, 'keyword')
    print("finished keywords !!!")
    df_photos['collection_title'] = _unique_values_by_photo(df_photos.photo_id, df_collections, 'collection_title')
    print("finished collection_title !!!")

    return df_photos

In [20]:
# Build the combined photos table. combine() adds its columns to
# datasets['photos'] itself and returns that same frame, so `combinbed_df`
# and datasets['photos'] refer to one object after this cell.
combinbed_df = combine(datasets)

finished keywords !!!
finished collection_title !!!


In [21]:
# Persist the combined table without the index column.
# NOTE(review): `keywords` and `collection_title` hold arrays, which CSV can
# only store as their string form — confirm downstream readers expect that.
combinbed_df.to_csv('../DATA/unsplash/combined.csv', index=False)

In [22]:
# Reload the combined table from disk. This rebinds `df`, which earlier cells
# used for the raw photos table.
df = pd.read_csv('../DATA/unsplash/combined.csv')

In [24]:
# Keep the identifying columns, both descriptions, and the two columns added by combine().
df_cleaned = df[['photo_id', 'photo_url', 'photo_image_url', 'photo_description', 'ai_description', 'keywords', 'collection_title']]

In [27]:
# Write the selected feature columns to CSV without the index column.
df_cleaned.to_csv("../DATA/unsplash/selected_features.csv", index=False)