# Scraping Location data and images
- Scraping by tag allows us to obtain recent posts.
- However, we also need to scrape the location information for these posts.
- As we established, scraping too fast will get us blocked. The recommended rate is also very slow: allowing 30-60s of rest per request, we can scrape only ~500 images a day.

In [1]:
# Notebook setup: load IPython's autoreload extension and use mode 2, so that
# imported modules are reloaded automatically before each cell is executed
# (edits to the project code are picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
# Show the notebook's working directory; the relative paths used below
# ('..' and '../../..') are resolved against it.
!pwd

/Users/jschlemper/projects/project-TT/backend/notebooks


In [None]:
import os, sys, time
sys.path.append('..')
from time import sleep

from igramscraper.instagram import Instagram
from igramscraper.exception.instagram_not_found_exception import InstagramNotFoundException
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
%matplotlib inline  

# import previously defined functions
from core.utils import get_thumbnail, show_thumbnail, imresize
from core.instagram import get_media_by_url
from core.envs import DATA_DIR, IMAGE_DIR, THUMBNAIL_DIR
from core.db.persistence import media_to_row, DATA_ATTRIBUTES

# Re-root the directory constants imported from core.envs by prefixing each
# with '../../..' (presumably so they resolve from this notebook's working
# directory -- confirm against the project layout).
# NOTE: re-running this cell prefixes the paths a second time.
DATA_DIR, IMAGE_DIR, THUMBNAIL_DIR = [
    os.path.join('../../..', project_dir)
    for project_dir in (DATA_DIR, IMAGE_DIR, THUMBNAIL_DIR)
]

In [None]:
# Log in to Instagram with a client that is configured with
# sleep_between_requests=15.
# The values below are placeholders: account name, password and a third
# argument that is presumably the session cache folder -- confirm against
# the igramscraper documentation before use.
login_args = ('username', 'password', 'cache')

instagram = Instagram(sleep_between_requests=15)
instagram.with_credentials(*login_args)
instagram.login()

In [None]:
# Scrape recent posts for one hashtag.
# The delay between requests is raised to 30 for the duration of the tag
# listing and set back to 15 afterwards.
search_tag = 'usa'
instagram.sleep_between_requests = 30
try:
    medias = instagram.get_medias_by_tag(search_tag, count=10)  # this will take some time!
finally:
    # Always restore the default delay, even when the request fails or the
    # cell is interrupted; otherwise every later request keeps the slower rate.
    instagram.sleep_between_requests = 15

In [None]:
# Create the output directories for this scrape and save the tag results to CSV.
#
# `search_tag` is deliberately NOT reassigned here: the directory and CSV names
# must match the tag that was actually scraped in the cell above.
timestamp = time.strftime("%Y%m%d")
dest_dir = os.path.join(DATA_DIR, search_tag, timestamp)
dest_img_dir = os.path.join(dest_dir, 'images')
dest_thumbnail_dir = os.path.join(dest_dir, 'thumbnails')

# exist_ok=True makes the cell re-runnable and also completes a partially
# created directory tree (e.g. dest_dir exists but 'images' does not).
for directory in (dest_dir, dest_img_dir, dest_thumbnail_dir):
    os.makedirs(directory, exist_ok=True)

# Build the table that is written to disk. `medias` is a DataFrame when this
# cell is re-run after the CSV was already loaded back; otherwise it is the
# result of the tag scrape and is converted row by row.
# NOTE(review): assumes media_to_row accepts the objects returned by
# get_medias_by_tag and yields rows matching DATA_ATTRIBUTES -- confirm.
if isinstance(medias, pd.DataFrame):
    df = medias
else:
    df = pd.DataFrame([media_to_row(m) for m in medias], columns=DATA_ATTRIBUTES)

csv_name = os.path.join(dest_dir, f'{search_tag}_{timestamp}.csv')
df.to_csv(csv_name, quotechar="'", index=False)
print(f'saved to {csv_name}')

In [None]:
# Load the tag results for this search tag / date back from disk.
# From here on `medias` is a pandas DataFrame (one row per post).
tag_csv_path = os.path.join(dest_dir, f'{search_tag}_{timestamp}.csv')
medias = pd.read_csv(tag_csv_path, quotechar="'")

In [None]:
# Bookkeeping for the location-scraping loop below.
# NOTE: running this cell (re)writes the output CSV with only its header row.

# output file that the loop appends one row per scraped post to
fname = os.path.join(dest_dir, f'{search_tag}_{timestamp}_location.csv')

bad_list, full_medias = [], []  # posts that were not found / rows scraped so far
progress_idx = 0                # position marker the loop uses to resume

# start the CSV with the header only
df_full_medias = pd.DataFrame([], columns=DATA_ATTRIBUTES)
df_full_medias.to_csv(fname, quotechar="'", index=False)

In [None]:
# Scrape the full media record and the thumbnail image for every row of
# `medias`, saving the images to disk and appending one row per post to the
# CSV at `fname`.
#
# The loop is resumable: `progress_idx` holds the index of the NEXT row to
# process, so re-running this cell after an interruption continues where it
# stopped. Starting from `progress_idx` itself (not `progress_idx + 1`) makes
# sure the very first row (index 0) is not skipped on a fresh run.
for curr_idx in range(progress_idx, len(medias)):
    media = medias.iloc[curr_idx]
    print(f'[{curr_idx}/{len(medias)}]: {media.media_id}')

    try:
        # scrape media (random pause first, to avoid a regular request pattern)
        sleep(np.random.randint(15))
        print(f'.. scrape media {media.media_link}')
        media_obj = get_media_by_url(instagram, media.media_link)

        # scrape media image
        sleep(np.random.randint(15))
        print(f'.. scrape image {media.img_thumbnail_url}')
        thumbnail = get_thumbnail(media.img_thumbnail_url)

        # save the image and a 64x64 resized copy
        imname = f'{dest_img_dir}/{media.media_id}.jpeg'
        thumbnail_name = f'{dest_thumbnail_dir}/{media.media_id}.jpeg'
        Image.fromarray(thumbnail).save(imname)
        Image.fromarray(imresize(thumbnail, (64,64))).save(thumbnail_name)

        # Append the row to the csv first, so the file on disk never lags
        # behind the in-memory state. The columns are pinned to
        # DATA_ATTRIBUTES so the row lines up with the header written earlier.
        # NOTE(review): assumes media_to_row yields one value per entry of
        # DATA_ATTRIBUTES -- confirm against core.db.persistence.
        datarow = media_to_row(media_obj)
        df_row = pd.DataFrame([datarow], columns=DATA_ATTRIBUTES)
        df_row.to_csv(fname, quotechar="'", mode='a', header=False, index=False)

        # Keep the in-memory copies in sync. DataFrame.append no longer exists
        # in pandas >= 2.0, so the frame is rebuilt from the list of rows.
        full_medias.append(datarow)
        df_full_medias = pd.DataFrame(full_medias, columns=DATA_ATTRIBUTES)

    except InstagramNotFoundException as e:
        print(e)
        print('.. adding to bad list')
        bad_list.append(media)

    # this row is done (scraped or marked bad): resume from the next one
    progress_idx = curr_idx + 1

    # random long sleep
    sleep_time = np.random.randint(60)
    print(f'.. sleeping for {sleep_time}s')
    sleep(sleep_time)