In [None]:
# Notebook setup: enable IPython's autoreload so edited modules are re-imported before each cell runs
%load_ext autoreload
%autoreload 2

In [None]:
import os, sys
sys.path.append('..')
from time import sleep

from igramscraper.instagram import Instagram
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
%matplotlib inline  

# import previously defined functions
from core.utils import get_thumbnail, show_thumbnail, imresize
from core.instagram import get_media_by_url
from core.envs import DATA_DIR, IMAGE_DIR, THUMBNAIL_DIR
from core.db.persistence import media_to_row

# Scraping Location data
- We are not done yet! We need to scrape location information
- As we established, scraping too fast will get us blocked. The recommended rate limit is also very slow -- about 30s per request, with a 10min break after every 10 requests. That's too slow! At roughly 60 posts/hr, a dataset of 10,000 posts would take about 166hr, i.e. roughly 1 week!

### Scraping with ProxyPool 

- We will speed this up by routing each request through a proxy. This is essentially like a VPN -- we "fake" the origin of the request. This way, there is no way Instagram can block us!
- *Is it safe? Not at all. A proxy can intercept your information and can even return altered responses. Worst case, it could send us a virus (is that even possible, @terence? I don't know :D). But the websites I'm getting the proxy lists from also offer paid services, so I imagine they need trustworthy proxy servers; otherwise they would be sued AF*. Secondly, we are only accessing Instagram, so it's quite clear what we are trying to get. DO NOT SEND YOUR SENSITIVE INFO!!

Source:
- https://blog.scrapinghub.com/python-requests-proxy
- Free Proxy List: 
  - https://hidemy.name/en/proxy-list/
  - https://www.sslproxies.org/
- Paid Proxy List: https://scrapinghub.com/crawlera


In [None]:
# Load the media table from DATA_DIR. Fields in this CSV are quoted with single
# quotes (quotechar="'"). Later cells read its media_id, media_link and
# img_thumbnail_url columns.
medias = pd.read_csv(os.path.join(DATA_DIR, 'newyork_20191124.csv'), quotechar="'")

In [None]:
from igramscraper.instagram import Instagram 

class ProxyPoolExecutor:
    """Run a callable through randomly chosen proxies, discarding ones that fail.

    The executor owns a list of proxy address strings. ``run`` picks one at
    random, hands it to the callable as the ``proxy`` keyword argument, and
    removes it from the pool if the call raises, retrying with another proxy
    until a call succeeds or the pool is exhausted.
    """

    def __init__(self, proxy_pool):
        """Store a private copy of ``proxy_pool``.

        Args:
            proxy_pool: iterable of proxy address strings; each one is later
                interpolated into ``http://{proxy}`` by ``_proxy_dict``.
        """
        # list() rather than .copy(): accepts any iterable (tuple, set,
        # generator) and still leaves the caller's collection untouched when
        # proxies are removed from the pool.
        self.proxy_pool = list(proxy_pool)

    @staticmethod
    def _proxy_dict(proxy):
        """Build the proxy mapping for a single proxy address.

        Args:
            proxy: proxy address string (e.g. ``'1.2.3.4:8080'``).

        Returns:
            dict: ``'http'`` and ``'https'`` keys, both pointing at
            ``http://{proxy}``.
        """
        return {
            'http': f'http://{proxy}',
            'https': f'http://{proxy}',
        }

    def select_proxy(self):
        """Pick a random proxy from the pool.

        The pool must be non-empty when this is called.

        Returns:
            tuple: ``(idx, proxy_dict)`` where ``idx`` is the position of the
            chosen proxy in ``self.proxy_pool`` and ``proxy_dict`` is the
            mapping built by ``_proxy_dict``.
        """
        # Convert numpy's integer to a plain int so it can be used directly
        # with list.pop() and compared with ordinary indices.
        idx = int(np.random.choice(len(self.proxy_pool)))
        proxy = self.proxy_pool[idx]
        print(f'Selected {proxy}')
        return idx, self._proxy_dict(proxy)

    def run(self, func, *args, **kwargs):
        """Call ``func`` through a random proxy, retrying until one works.

        ``func`` is invoked as ``func(*args, **kwargs, proxy=proxy_dict)``.
        Any ``Exception`` it raises is treated as a proxy failure: the proxy
        is removed from the pool and another one is tried.

        Args:
            func: callable accepting a ``proxy`` keyword argument.
            *args: positional arguments forwarded to ``func``.
            **kwargs: keyword arguments forwarded to ``func``.

        Returns:
            Whatever ``func`` returns on its first successful call.

        Raises:
            IndexError: if the pool is empty, either initially or because
                every proxy has failed. The last failure, if any, is chained
                as the cause.
        """
        last_error = None
        while not self.is_empty():
            idx, proxy = self.select_proxy()
            try:
                return func(*args, **kwargs, proxy=proxy)
            except Exception as error:
                # Exception only, not a bare except: KeyboardInterrupt and
                # SystemExit must propagate so a long scrape can be stopped
                # without evicting a working proxy.
                last_error = error
                print("Unexpected error:", type(error))

                deleted_proxy = self.proxy_pool.pop(idx)
                print(f'{deleted_proxy} no longer works :( removing from the pool')

        raise IndexError('Ran out of valid proxy servers') from last_error

    def is_empty(self):
        """Return True when no proxies are left in the pool."""
        return len(self.proxy_pool) == 0

    def update_proxy_pool(self, proxy_pool):
        """Append the proxies in ``proxy_pool`` to the current pool.

        Args:
            proxy_pool: iterable of proxy address strings to add.
        """
        self.proxy_pool.extend(proxy_pool)

In [None]:
# Create the Instagram client that later cells pass to get_media_by_url.
# NOTE(review): sleep_between_requests=3 is presumably a delay in seconds
# between requests -- confirm against the igramscraper documentation.
instagram = Instagram(sleep_between_requests=3)

In [None]:
# Create the proxy pool executor from the proxy list file (one proxy per line).
with open('../data/proxy_pool.txt', 'r') as proxy_file:
    # Strip surrounding whitespace and skip blank lines: an empty entry would
    # otherwise end up in the pool as the unusable proxy URL 'http://'.
    proxy_pool = [line.strip() for line in proxy_file if line.strip()]

executor = ProxyPoolExecutor(proxy_pool)

In [None]:
# Try one: fetch a single media item through one hard-coded proxy.
proxy = '182.53.197.202:45661'
# Build the {'http': ..., 'https': ...} proxy mapping for that address.
proxy_dict = ProxyPoolExecutor._proxy_dict(proxy)
# NOTE: this rebinds `instagram`, replacing the client created in the earlier cell.
instagram = Instagram(sleep_between_requests=3)
# Row 100 of `medias` serves as the sample; the result is kept in `media_obj`.
media_obj = get_media_by_url(instagram, medias.iloc[100].media_link, proxy=proxy_dict)

In [None]:
# State for the scraping loop, kept in its own cell so that re-running the
# loop cell resumes from where it stopped instead of starting over.
full_medias = []  # rows produced by media_to_row, one per scraped media
# Index of the last fully processed row of `medias`. The loop starts at
# progress_idx + 1, so -1 ("nothing processed yet") makes it begin at row 0;
# starting from 0 silently skipped the first row.
progress_idx = -1

In [None]:
# Reload the proxy list before each run so edits to the file are picked up.
with open('../data/proxy_pool.txt', 'r') as proxy_file:
    # Strip surrounding whitespace and skip blank lines: an empty entry would
    # otherwise end up in the pool as the unusable proxy URL 'http://'.
    proxy_pool = [line.strip() for line in proxy_file if line.strip()]

executor = ProxyPoolExecutor(proxy_pool)

# progress_idx is set to curr_idx only after a row has been fully processed,
# so re-running this cell after a failure resumes at the first unfinished row.
for curr_idx in range(progress_idx+1, len(medias)):
    media = medias.iloc[curr_idx]
    print(media.media_id)

    # scrape media (executor.run supplies the `proxy` keyword argument)
    print('.. scrape media')
    media_obj = executor.run(get_media_by_url, instagram, media.media_link)

    # scrape media image
    # NOTE(review): get_thumbnail_adapter is neither defined nor imported in
    # the visible cells (only get_thumbnail is imported) -- confirm it exists,
    # otherwise this line raises NameError.
    print('.. scrape image')
    thumbnail = executor.run(get_thumbnail_adapter, media.img_thumbnail_url)

    # save the images: the fetched thumbnail as-is, plus a 64x64 resized copy
    imname = f'{IMAGE_DIR}/{media.media_id}.jpeg'
    thumbnail_name = f'{THUMBNAIL_DIR}/{media.media_id}.jpeg'
    Image.fromarray(thumbnail).save(imname)
    Image.fromarray(imresize(thumbnail, (64,64))).save(thumbnail_name)

    full_medias.append(media_to_row(media_obj))
    # Record progress last, so a failure anywhere above leaves this row to be retried.
    progress_idx = curr_idx