# Dataset creation

The dataset contains the pictures from my personal Unsplash account, together with their metadata. There are currently 21 pictures; re-running this notebook updates the dataset, so it can be refreshed periodically.

In [17]:
import requests
import pandas as pd

In [5]:
# Account whose photos are collected, and the number of photos requested per call
USERNAME = "yogne"
NO_IMAGES = 30  # my account has exactly 21 pictures as of April 22, 2023

# The API key is read from a local file instead of being hardcoded in the notebook;
# surrounding whitespace (e.g. a trailing newline) is stripped
with open("./access_token.txt") as token_file:
    ACCESS_TOKEN = token_file.read().strip()

In [7]:
# Endpoint listing the photos of a given user.
# Placeholders: {username}, {access_key}, {no_images}.
# stats=true asks for per-photo statistics (views, downloads, likes).
URL_TEMPLATE_LIST = (
    "https://api.unsplash.com/users/{username}/photos/"
    "?client_id={access_key}"
    "&stats=true"
    "&per_page={no_images}"
)

In [9]:
# Endpoint returning the details of a single picture, looked up by id.
# Used here to read the tags of a picture.
# Placeholders: {id}, {access_key}.
URL_TEMPLATE_PHOTO = (
    "https://api.unsplash.com/photos/{id}/"
    "?client_id={access_key}"
)

In [11]:
def get_unsplash_photo_list(username: str, no_images: int, access_token: str,
                            timeout: float = 10.0) -> list:
    """Fetch the photos of one Unsplash user, including their statistics.

    Args:
        username: Unsplash username substituted into the list URL.
        no_images: Number of photos requested (sent as ``per_page``).
        access_token: API key sent as the ``client_id`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response. Callers in this notebook
        iterate it as a list with one dict per photo.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status
            (e.g. invalid key or rate limit).
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = URL_TEMPLATE_LIST.format(username=username,
                                   access_key=access_token,
                                   no_images=no_images)
    # Without a timeout, requests may block forever on a stalled connection.
    get_response = requests.get(url, timeout=timeout)
    # Fail here with the HTTP status instead of returning an error payload
    # that would only blow up later when callers index into it.
    get_response.raise_for_status()
    return get_response.json()


def get_unsplash_photo_by_id(photo_id: str, access_token: str,
                             timeout: float = 10.0) -> dict:
    """Fetch the details of a single Unsplash photo.

    Args:
        photo_id: Id of the photo, substituted into the photo URL.
        access_token: API key sent as the ``client_id`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response. This notebook reads its
        ``"tags"`` entry.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status
            (e.g. invalid key or rate limit).
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = URL_TEMPLATE_PHOTO.format(id=photo_id,
                                    access_key=access_token)
    # Without a timeout, requests may block forever on a stalled connection.
    get_response = requests.get(url, timeout=timeout)
    # Surface HTTP errors immediately rather than returning an error payload
    # that has no "tags" key.
    get_response.raise_for_status()
    return get_response.json()

In [19]:
# TODO: 1. Unsplash API: get horizons longer than 30 days (30 is the maximum)
#       2. find an elegant way to store the time horizons
#       3. push the dataframe to BigQuery

def get_list_data_from_unsplash_response(response: list) -> pd.DataFrame:
    """Flatten a photo-list response into one DataFrame row per photo.

    Args:
        response: Iterable of photo dicts. Each one must provide ``"id"``,
            ``"urls"["full"]`` and, under ``"statistics"``, the
            ``"total"`` of ``"views"``, ``"downloads"`` and ``"likes"``.

    Returns:
        DataFrame with the columns ``id``, ``url``, ``views``,
        ``downloads`` and ``likes``, in the order of ``response``.
        An empty ``response`` gives an empty frame with those columns.

    Raises:
        KeyError: If a photo lacks one of the fields listed above.
    """
    columns = ["id", "url", "views", "downloads", "likes"]
    rows = []
    for pic in response:
        stats = pic["statistics"]
        # One tuple per photo keeps all fields of a picture together,
        # in the same order as `columns`.
        rows.append((
            pic["id"],
            pic["urls"]["full"],
            stats["views"]["total"],
            stats["downloads"]["total"],
            stats["likes"]["total"],
        ))
    return pd.DataFrame(rows, columns=columns)

def get_tags_from_unsplash_response(response: dict):
    """Return the titles of all tags of a single-photo response.

    Reads ``response["tags"]`` and collects the ``"title"`` of each entry,
    preserving their order.
    """
    return [entry["title"] for entry in response["tags"]]

def get_time_horizons(response: dict):
    """Placeholder, not implemented yet: ignores ``response`` and returns ``None``."""
    return None

In [20]:
# Fetch the user's photo list and flatten it into one row per photo
list_response = get_unsplash_photo_list(
    username=USERNAME,
    no_images=NO_IMAGES,
    access_token=ACCESS_TOKEN,
)
photos_df = get_list_data_from_unsplash_response(response=list_response)
photos_df

Unnamed: 0,id,url,views,downloads,likes
0,d5HkawVDFWs,https://images.unsplash.com/photo-167801460841...,186,1,0
1,qOJagwP9AH4,https://images.unsplash.com/photo-167718248759...,1396,2,0
2,ayNtUoUd_Mg,https://images.unsplash.com/photo-167718248015...,1661,32,0
3,QvW1NO_cgJc,https://images.unsplash.com/photo-167718230296...,2849,61,0
4,TaIHXYbBBXM,https://images.unsplash.com/photo-167718230256...,2970,60,0
5,xTfmtxO9f_w,https://images.unsplash.com/photo-167718216220...,2923,54,0
6,w8NjrSOPjY8,https://images.unsplash.com/photo-167718225145...,2873,24,0
7,85UinCrJc3Q,https://images.unsplash.com/photo-166245062081...,1122,18,0
8,YieB0alU9p4,https://images.unsplash.com/photo-165942315997...,9240,18,0
9,HNZb5jhftN0,https://images.unsplash.com/photo-165765050525...,10898,106,0


In [43]:
def get_tags_per_picture(picture_ids, access_token=None):
    """Collect the tags of every picture into a DataFrame.

    One photo-detail request is made per id via ``get_unsplash_photo_by_id``.

    Args:
        picture_ids: Iterable of photo ids. It is consumed only once, so
            generators and other one-shot iterators work as well as lists.
        access_token: API key used for the requests. When omitted, the
            notebook-level ``ACCESS_TOKEN`` is used.

    Returns:
        DataFrame with the columns ``id``, ``tags`` (list of tag titles)
        and ``tags_count`` (length of that list), one row per id.
    """
    if access_token is None:
        access_token = ACCESS_TOKEN

    rows = []
    # `picture_id` instead of `id` so the builtin is not shadowed.
    for picture_id in picture_ids:
        response = get_unsplash_photo_by_id(picture_id, access_token)
        tags_per_image = get_tags_from_unsplash_response(response)
        # Build the row while iterating: zipping against `picture_ids`
        # afterwards would yield nothing for an already-exhausted iterator.
        rows.append((picture_id, tags_per_image, len(tags_per_image)))
    return pd.DataFrame(rows, columns=["id", "tags", "tags_count"])

In [44]:
# One photo-detail request is made per id in photos_df
tags_df = get_tags_per_picture(photos_df["id"].tolist())

In [45]:
# Show the tags table: one row per photo id with its tag list and tag count
tags_df

Unnamed: 0,id,tags,tags_count
0,d5HkawVDFWs,"[france, annecy, city at night, religious, dar...",20
1,qOJagwP9AH4,"[france, cannes, sea, cote d'azure, port, shor...",17
2,ayNtUoUd_Mg,"[france, cannes, port, boat, sea, beach backgr...",20
3,QvW1NO_cgJc,"[france, cannes, boat, shore, cote d'azure, ya...",20
4,TaIHXYbBBXM,"[france, cannes, boat, shore, sea ​​shore, arc...",20
5,xTfmtxO9f_w,"[cannes, france, sea, palm tree, sunset, sunse...",20
6,w8NjrSOPjY8,"[cannes, france, boat, sea, beach background, ...",19
7,85UinCrJc3Q,"[spain, zaragoza, city, day, people, church, c...",20
8,YieB0alU9p4,"[barcelona, spain, balcony, dog, plant, green,...",19
9,HNZb5jhftN0,"[grey, paris, france, pollution, smoke, landsc...",19
