# Web Scraping Instagram with Selenium

In [1]:
#imports here
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import time

# Download ChromeDriver

Now we need to download latest stable release of ChromeDriver from:
https://chromedriver.chromium.org/

In [2]:
#specify the path to chromedriver.exe (download and save on your computer)
driver = webdriver.Chrome('/Users/user/Desktop/Web_Scraping_Instagram_with_Selenium/chromedriver')

#open the webpage
driver.get("http://www.instagram.com")

# Log into your Instagram account

Open Developer Tools in your browser and look for the name attribute

In [3]:
#target username
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='password']")))

#enter username and password
username.clear()
username.send_keys("your_id")
password.clear()
password.send_keys("your_password")

#target the login button and click it
log_in = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click()

#We are logged in!

# Handle Alerts

You might only get a single alert, or you might get 2 of them
please adjust the cell below accordingly. <br>
As my browser is in Spanish, my target button will say "Ahora no" but if yours is in English look for "Not now"

In [4]:
time.sleep(5)
alert = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Ahora no")]'))).click()
alert2 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Ahora no")]'))).click()

# Search for a certain hashtag

In [5]:
#target the search input field
searchbox = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//input[@placeholder='Buscar']")))
searchbox.clear()

#search for the hashtag cat
keyword = "#cat"
searchbox.send_keys(keyword)
 
#DOUBLE ENTER
time.sleep(5) # Wait for 5 seconds
my_link = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//a[contains(@href, '/" + keyword[1:] + "/')]")))
my_link.click()

# Scroll Down

Increase n_scrolls to select more photos (depending on screen resolution)<br>
<strong>Example:</strong>

- 2 scrolls cover approx. 35 photos
- 3 scrolls cover approx. 45 photos

In [6]:
#scroll down 2 times
#increase the range to sroll more
n_scrolls = 2
for j in range(0, n_scrolls):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)

In [7]:
#target all the link elements on the page
anchors = driver.find_elements_by_tag_name('a')
anchors = [a.get_attribute('href') for a in anchors]
#narrow down all links to image links only
anchors = [a for a in anchors if str(a).startswith("https://www.instagram.com/p/")]

print('Found ' + str(len(anchors)) + ' links to images')
anchors[:5]

Found 51 links to images


['https://www.instagram.com/p/CUNRb6YA43r/',
 'https://www.instagram.com/p/CUNTDyOD8Ma/',
 'https://www.instagram.com/p/CUNG87vJjBt/',
 'https://www.instagram.com/p/CUNcOdApyfi/',
 'https://www.instagram.com/p/CUNWVEXAK0A/']

In [8]:
images = []

#follow each image link and extract only image at index=1
for a in anchors:
    img = driver.find_elements_by_tag_name('img')
    img = [i.get_attribute('src') for i in img]
    images.append(img[1])
    
images[:5]

['https://instagram.fmex23-1.fna.fbcdn.net/v/t51.2885-15/e35/242692731_1368790030184455_2833736039745165744_n.jpg?_nc_ht=instagram.fmex23-1.fna.fbcdn.net&_nc_cat=1&_nc_ohc=yc8n6i_-c6YAX-Af0nE&tn=DqvdhehCtTpRNFKd&edm=ABZsPhsBAAAA&ccb=7-4&oh=579cb31711f83dd69fdcc7c833d28cd3&oe=6155ABA0&_nc_sid=4efc9f&ig_cache_key=MjY2OTg2NjgzODk5NzA0NDcxNQ%3D%3D.2-ccb7-4',
 'https://instagram.fmex23-1.fna.fbcdn.net/v/t51.2885-15/e35/242692731_1368790030184455_2833736039745165744_n.jpg?_nc_ht=instagram.fmex23-1.fna.fbcdn.net&_nc_cat=1&_nc_ohc=yc8n6i_-c6YAX-Af0nE&tn=DqvdhehCtTpRNFKd&edm=ABZsPhsBAAAA&ccb=7-4&oh=579cb31711f83dd69fdcc7c833d28cd3&oe=6155ABA0&_nc_sid=4efc9f&ig_cache_key=MjY2OTg2NjgzODk5NzA0NDcxNQ%3D%3D.2-ccb7-4',
 'https://instagram.fmex23-1.fna.fbcdn.net/v/t51.2885-15/e35/242692731_1368790030184455_2833736039745165744_n.jpg?_nc_ht=instagram.fmex23-1.fna.fbcdn.net&_nc_cat=1&_nc_ohc=yc8n6i_-c6YAX-Af0nE&tn=DqvdhehCtTpRNFKd&edm=ABZsPhsBAAAA&ccb=7-4&oh=579cb31711f83dd69fdcc7c833d28cd3&oe=6155ABA0&_

# Save images to computer

First we'll create a new folder for our images somewhere on our computer. <br>
Then, we'll save all the images there.

In [9]:
import os

path = os.getcwd()
path = os.path.join(path, keyword[1:] + "s")

#create the directory
os.mkdir(path)

path

'/Users/user/Desktop/Web_Scraping_Instagram_with_Selenium/cats'

In [11]:
#download images
import wget

counter = 0
for image in images:
    save_as = os.path.join(path, keyword[1:] + str(counter) + '.jpg')
    wget.download(image, save_as)
    counter += 1