# Database with Location
- As we saw, scraping by tag allows us to obtain recent posts
- But we need to scrape location information and also download the images for each media post.
- That's a lot of pages to scrape!


- As we established, scraping too fast will get us blocked. The recommended speed limit is also very slow.
- Allowing 15-30s of rest per request, we can scrape ~5000 images a day if we scrape all day. That is good enough for our MVP.

In [None]:
# Notebook convenience only: load IPython's autoreload extension and switch it
# to mode 2, so edits to imported modules are picked up before each cell runs
# without having to restart the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# shell escape: show the current working directory, since the relative paths
# used in this notebook ('..' and '../..') are resolved against it
!pwd

In [None]:
import os, sys, time
sys.path.append('..')
import pathlib
from time import sleep

from igramscraper.instagram import Instagram
from igramscraper.exception.instagram_not_found_exception import InstagramNotFoundException
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
%matplotlib inline  

# import previously defined functions
from core.utils import get_thumbnail, show_thumbnail, imresize
from core.instagram import get_media_by_url
from core.envs import DATA_DIR, IMAGE_DIR, THUMBNAIL_DIR
from core.db.persistence import media_to_row, DATA_ATTRIBUTES

# Prefix each imported directory constant with the project root, which is
# located two levels above the working directory.
PROJECT_TT_ROOT = pathlib.Path('../..')
DATA_DIR, IMAGE_DIR, THUMBNAIL_DIR = (
    PROJECT_TT_ROOT / subdir
    for subdir in (DATA_DIR, IMAGE_DIR, THUMBNAIL_DIR)
)

In [None]:
# specify target directory where we will save data
search_tag = 'london'  # the tag we use for scraping
timestamp = time.strftime("%Y%m%d-%H%M")  # log the current time (minute resolution)
dest_dir = DATA_DIR / search_tag / timestamp  # one dataset per search tag and timestamp
dest_img_dir = dest_dir / 'images'
dest_thumbnail_dir = dest_dir / 'thumbnails'

# Create each directory on its own and tolerate ones that already exist.
# Checking only `dest_dir` would skip the sub-directories whenever `dest_dir`
# is already there (e.g. the cell is re-run within the same minute after a
# partial failure), and saving images would then fail later on.
for directory in (dest_dir, dest_img_dir, dest_thumbnail_dir):
    directory.mkdir(parents=True, exist_ok=True)

Let's first get a bunch of posts using our usual function. For this example, let's scrape 10 media posts.

In [None]:
# throttled scraper client: rests between consecutive requests
# (sleep_between_requests=15, presumably seconds) so that we do not get blocked
instagram = Instagram(sleep_between_requests=15)
# fetch 10 medias posted under the search tag
medias = instagram.get_medias_by_tag(search_tag, count=10)  # this will take some time!

In [None]:
# flatten every scraped media into one row, then assemble the rows into a
# pandas.DataFrame whose columns follow DATA_ATTRIBUTES
rows = list(map(media_to_row, medias))
df = pd.DataFrame(rows, columns=DATA_ATTRIBUTES)

In [None]:
# persist the dataframe as a temporary dataset; the file is named after the
# search tag and the timestamp and lives in the destination directory
csv_name = dest_dir / f'{search_tag}_{timestamp}.csv'
df.to_csv(csv_name, index=False, quotechar="'")
print(f'saved to {csv_name}')

Now that we have created a temporary dataset, let's scrape the location information for each media, together with all the attributes that we want, using the `get_media_by_url` function we defined in "[2] Instagram Data Attributes.ipynb".

In [None]:
# reload the temporary dataset from disk, using the same quote character it was
# written with; from here on `medias` is a DataFrame (one row per media)
medias = pd.read_csv(csv_name, quotechar="'")  # read csv is easy!

In [None]:
# bookkeeping for the scraping loop
progress_idx = 0                # resume marker, so the loop can be restarted after a failure
full_medias, bad_list = [], []  # rows scraped so far / medias that could not be found

# the dataset with all attributes goes into its own csv next to the temporary one
fname = dest_dir / f'{search_tag}_{timestamp}_location.csv'

# start the file with the header line only; the loop appends one row at a time
df_full_medias = pd.DataFrame([], columns=DATA_ATTRIBUTES)
df_full_medias.to_csv(fname, index=False, quotechar="'")

In [None]:
# Scrape the full attributes and the thumbnail of every media in `medias`,
# saving the images to disk and appending one row per media to the csv `fname`.
#
# `progress_idx` is the index of the NEXT media to process. Starting the range
# at `progress_idx` (instead of `progress_idx + 1`) makes sure the very first
# media (index 0) is scraped too, while re-running this cell after a crash
# still resumes at the media that failed.
for curr_idx in range(progress_idx, len(medias)):
    media = medias.iloc[curr_idx]
    print(f'[{curr_idx}/{len(medias)}]: {media.media_id}')

    # sometimes the request can fail so surround in try-except block
    try:
        # scrape media (after a random rest of up to 14s)
        sleep(np.random.randint(15))
        print(f'.. scrape media {media.media_link}')
        media_obj = get_media_by_url(instagram, media.media_link)

        # scrape media image (after another random rest)
        sleep(np.random.randint(15))
        print(f'.. scrape image {media.img_thumbnail_url}')
        thumbnail = get_thumbnail(media.img_thumbnail_url)

        # save the image and a 64x64 version of it, both named after the media id
        imname = dest_img_dir / f'{media.media_id}.jpeg'
        thumbnail_name = dest_thumbnail_dir / f'{media.media_id}.jpeg'
        Image.fromarray(thumbnail).save(imname)
        Image.fromarray(imresize(thumbnail, (64, 64))).save(thumbnail_name)

        # convert the scraped media into a row and keep it in memory
        datarow = media_to_row(media_obj)
        full_medias.append(datarow)
        # `DataFrame.append` no longer exists in pandas >= 2.0 (and it would
        # not treat a flat row as a single record), so rebuild the in-memory
        # frame from the accumulated rows instead
        df_full_medias = pd.DataFrame(full_medias, columns=DATA_ATTRIBUTES)

        # append the row to csv, with the same column layout as the header
        df_row = pd.DataFrame([datarow], columns=DATA_ATTRIBUTES)
        df_row.to_csv(fname, quotechar="'", mode='a', header=False, index=False)

    except InstagramNotFoundException as e:
        # the media is not available; remember it and move on
        print(e)
        print('.. adding to bad list')
        bad_list.append(media)

    # this media is done (scraped or marked bad): the next run starts after it.
    # Any other exception skips this line, so the same media is retried.
    progress_idx = curr_idx + 1

    # random long sleep
    sleep_time = np.random.randint(60)
    print(f'.. sleeping for {sleep_time}s')
    sleep(sleep_time)