# A jupyter notebook to scrape data on Instagram

This notebook was written for scraping data from Instagram posts based on a hashtag or location

## Import packages needed for the scraping

In [27]:
import getpass
import json
import random
import time
import traceback

import pandas as pd
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

## Download ChromeDriver

To scrape Instagram, the code has to drive a real browser and open Instagram in it. To do this, you first need to download a driver that the code will use to control the browser.

To download the ChromeDriver click on this link:
<br>
https://chromedriver.chromium.org/

Once the driver has been downloaded, save it on your computer and put the path to it in the code below.

## Open Instagram with ChromeDriver

In [28]:
# Location of the downloaded ChromeDriver executable (change it to the path on your computer)
chromedriver_path = r"/Users/magio94/Dropbox/Master of Science in Geospatial Technologies/Corsi Muenster/Transferring Data to Knowledge/Project/chromedriver"
driver = Chrome(chromedriver_path)

# Load the Instagram start page in the automated browser
driver.get("http://www.instagram.com")

In [29]:
# Dismiss the first pop-up by clicking the button labelled "Accetta tutti"
# (the label is matched by its text, so it depends on the page language).
accept_all_locator = (By.XPATH, '//button[contains(text(), "Accetta tutti")]')
accept_all_button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable(accept_all_locator))
# button0 receives the return value of click(), not the element itself
button0 = accept_all_button.click()

In [30]:
# Locate the two login form fields once they are clickable
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='password']")))

# Ask for the credentials at run time instead of writing them into the
# notebook, so they are never saved or shared together with the file.
# getpass hides the password while it is being typed.
username.clear()
username.send_keys(input("Instagram username: "))

password.clear()
password.send_keys(getpass.getpass("Instagram password: "))

In [31]:
# Fixed 5 second pause before submitting the form
time.sleep(5)

# Wait for the submit button to become clickable, then press it
submit_locator = (By.CSS_SELECTOR, "button[type='submit']")
submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(submit_locator))
button = submit_button.click()

# From here on the browser session is logged in

In [32]:
# Two pop-ups with a "Non ora" ("not now") button are dismissed one after the other.
# The button is matched by its text, so it depends on the page language.
not_now_locator = (By.XPATH, '//button[contains(text(), "Non ora")]')

time.sleep(5)
first_not_now = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(not_now_locator))
not_now = first_not_now.click()

time.sleep(2)
second_not_now = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(not_now_locator))
not_now2 = second_not_now.click()

## Search for the hashtag (the same search can also be used to find posts by location, user, etc.)

In [33]:
# Find the search input (matched by its placeholder text "Cerca") and empty it
search_locator = (By.XPATH, "//input[@placeholder='Cerca']")
searchbox = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(search_locator))
searchbox.clear()

# Type the hashtag to search for
keyword = "#valborbera"
searchbox.send_keys(keyword)

# Pause 5 seconds, then press ENTER twice with a 5 second pause after each press
time.sleep(5)
for enter_press in range(2):
    searchbox.send_keys(Keys.ENTER)
    time.sleep(5)

## Scrape posts

The following code:
- selects the first post returned by the search;
- creates the lists in which to save the information to collect for each post (in this case: 1) the link of the post, 2) the publication date of the post, 3) the hashtags contained in the post, 4) the link of the location attached to the post, 5) the link of the user who published the post);
- finally, reads the current post and then moves on to the next one until it encounters an error; at that point the program stops and waits for the scraping to be started again.

In [34]:
# Open the first post of the search results.
# The XPath is an exact position in the page structure, so it depends on the current page layout.
first_post_xpath = "//*[@id='react-root']/section/main/article/div[1]/div/div/div[1]/div[1]/a/div/div[2]"
first_post = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, first_post_xpath)))
click_first_post = first_post.click()

In [36]:
# Empty accumulators for the scraped fields: post links, timestamps,
# hashtags per post, location links and user links
links, date_time, hashtags_by_posts, lst_locations, users = [], [], [], [], []

In [None]:
%%time

# Scrape the data of every post, starting from the post that is currently open.
#
# For each post the loop reads the user link, the post URL, the timestamp text,
# the hashtag texts and the location link (0 when no location is found), then
# clicks the "next" arrow. The loop ends on the first unhandled exception and
# prints the iteration number and the traceback.

# Element locators of the open post (absolute paths, tied to the page layout)
USER_XPATH = '/html/body/div[5]/div[2]/div/article/header/div[2]/div[1]/div/span/a'
TIME_XPATH = '/html/body/div[5]/div[2]/div/article/div[3]/div[2]/a/time'
HASHTAGS_XPATH = '/html/body/div[5]/div[2]/div/article/div[3]/div[1]/ul/div/li/div/div/div[2]/span/a'
LOCATION_XPATH = '/html/body/div[5]/div[2]/div/article/header/div[2]/div[2]/div[2]/a'
NEXT_ARROW_CSS = 'a._65Bje.coreSpriteRightPaginationArrow'
PREVIOUS_ARROW_CSS = 'a.ITLxV.coreSpriteLeftPaginationArrow'

i = 0
time.sleep(2)

while True:
    i += 1
    try:
        # Read every field of the post BEFORE storing anything: if one lookup
        # fails, no list is touched and all five lists keep the same length.
        post_user = driver.find_element(By.XPATH, USER_XPATH).get_attribute('href')
        post_link = driver.current_url
        post_time = driver.find_element(By.XPATH, TIME_XPATH).text
        post_hashtags = [tag.text for tag in driver.find_elements(By.XPATH, HASHTAGS_XPATH)]
        try:
            post_location = driver.find_element(By.XPATH, LOCATION_XPATH).get_attribute('href')
        except Exception:
            # No location could be read for this post: store the placeholder 0
            post_location = 0

        users.append(post_user)
        links.append(post_link)
        date_time.append(post_time)
        hashtags_by_posts.append(post_hashtags)
        lst_locations.append(post_location)

        # Move to the next post. find_element(By.CSS_SELECTOR, ...) is used
        # because the find_element_by_* helpers were removed in Selenium 4.3.
        driver.find_element(By.CSS_SELECTOR, NEXT_ARROW_CSS).click()

        # Wait until the next post shows its timestamp. If it does not appear,
        # step back one post, wait, and step forward again.
        # `except Exception` (not a bare `except`) keeps these retry loops
        # interruptible with "Interrupt kernel".
        while True:
            try:
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.XPATH, TIME_XPATH)))
                break
            except Exception:
                driver.find_element(By.CSS_SELECTOR, PREVIOUS_ARROW_CSS).click()
                time.sleep(10)
                while True:
                    try:
                        WebDriverWait(driver, 20).until(
                            EC.presence_of_element_located((By.XPATH, TIME_XPATH)))
                        driver.find_element(By.CSS_SELECTOR, NEXT_ARROW_CSS).click()
                        time.sleep(3)
                        break
                    except Exception:
                        continue

        # Random pause between 4.5 and 5.5 seconds before reading the next post
        time.sleep(random.randrange(450, 550) / 100)
    except Exception:
        # Report at which iteration the scraping stopped and why
        print(i)
        traceback.print_exc()
        break

## Check that the lists have the same number of items so they can be merged afterwards

In [None]:
# Number of post URLs collected
len(links)

In [None]:
# Number of post timestamps collected
len(date_time)

In [None]:
# Number of hashtag lists collected (one list per post)
len(hashtags_by_posts)

In [None]:
# Number of location entries collected (a link, or 0 when no location was found)
len(lst_locations)

In [None]:
# Number of user links collected
len(users)

In [None]:
# Assemble the five lists into one table with one column per list.
# pandas requires all lists to have the same length.
post_columns = {
    'Links': links,
    'Date_time': date_time,
    'Hashtags_by_posts': hashtags_by_posts,
    'Geotag': lst_locations,
    'users': users,
}
df = pd.DataFrame(post_columns)

In [None]:
# Display the table of scraped posts
df

## Save each list and the final table

In [None]:
# Write the collected post URLs to links4.txt, one per line
with open('links4.txt', 'w') as out_file:
    out_file.writelines("%s\n" % link for link in links)

In [None]:
# Write the collected timestamps to date_time4.txt, one per line
with open('date_time4.txt', 'w') as out_file:
    out_file.writelines("%s\n" % stamp for stamp in date_time)

In [None]:
# Save the hashtag list of each post to hashtags_by_posts4.txt, one post per line.
# The encoding is set to UTF-8 explicitly (the same as the CSV export): hashtags
# can contain non-ASCII characters, and the platform default encoding may not be
# able to write them.
with open('hashtags_by_posts4.txt', 'w', encoding='utf-8') as f:
    for item in hashtags_by_posts:
        f.write("%s\n" % item)

In [None]:
# Write the collected location entries to lst_locations4.txt, one per line
with open('lst_locations4.txt', 'w') as out_file:
    out_file.writelines("%s\n" % place for place in lst_locations)

In [None]:
# Save the user links to users4.txt, one per line.
# The loop iterates over `users`; iterating over `lst_locations` here would
# fill users4.txt with the location links instead.
with open('users4.txt', 'w') as f:
    for item in users:
        f.write("%s\n" % item)

In [None]:
# Export the table of posts to merged_lists.csv (UTF-8, without the index column)
df.to_csv('merged_lists.csv', index=False, encoding='utf-8')

## Scrape the coordinates by location from the links got in the post

Once you have the links of the places where the posts were published (when available), you obtain the coordinates by opening the link of each location in the browser. Many locations are likely to have been collected more than once, so to reduce the scraping time, the duplicates are removed first (the code below uses a dictionary for this).

In [None]:
# Reload the location entries saved earlier (or skip this cell and keep using the list in memory).
# The `with` block closes the file even if reading fails.
# Every entry read back is a string, so the "no location" placeholder is '0' here rather than 0.
with open("lst_locations4.txt", "r") as a_file:
    lst_locations = [line.strip() for line in a_file]

In [None]:
# Remove repeated locations while keeping the order of first appearance
# (dictionary keys are unique and keep insertion order), then show how many remain
unique_locations = {place: None for place in lst_locations}
geotag = list(unique_locations)
len(geotag)

In [None]:
# Accumulator for the scraped coordinates; the scraping loops append [location, lat, lng] entries.
# It lives in its own cell so that re-running a scraping cell does not reset it.
coordinates = list()

In [None]:
%%time

# Scrape the coordinates of every unique location.
#
# Each location link is opened in the browser and latitude/longitude are read
# from the page's `window._sharedData` object. The placeholder '0' (no
# location) gets the coordinates 0, 0 without loading any page. A failed
# attempt is retried after a random pause of 15-25 seconds until it succeeds.

SHARED_DATA_JS = "return JSON.stringify(window._sharedData)"

for i in geotag:
    while True:
        try:
            if str(i) == '0':
                lat = lng = 0
            else:
                driver.get(str(i))
                shared_data = json.loads(driver.execute_script(SHARED_DATA_JS))
                location_info = shared_data['entry_data']['LocationsPage'][0]['native_location_data']['location_info']
                lat = location_info['lat']
                lng = location_info['lng']
            break
        except Exception:
            # `except Exception` (not a bare `except`) lets "Interrupt kernel"
            # stop the retries; a bare `except` would swallow KeyboardInterrupt
            # and keep retrying.
            time.sleep(random.randrange(1500, 2500) / 100)

    coordinates.append([i, lat, lng])
    # Random pause of 8-15 seconds between two locations
    time.sleep(random.randrange(800, 1500) / 100)

### Resume the location scraping from a given link

Instagram may block the location scraping: after several requests, the location page stays blank. In that case you can use the code below to resume from where the scraping stopped. This is also why the list of coordinates is created in a separate cell: the information already obtained is not overwritten, and the new coordinates are simply added to it.

In [None]:
# Index in `geotag` of the first location that still has to be scraped.
# `coordinates` holds one entry per location already processed, in `geotag`
# order, so its length is the index of the next location to fetch.
# Adding 1 to it would silently skip one location.

start_from = len(coordinates)

In [None]:
%%time

# Resume the coordinate scraping at position `start_from` of `geotag`.
#
# Each location link is opened in the browser and latitude/longitude are read
# from the page's `window._sharedData` object. The placeholder '0' (no
# location) gets the coordinates 0, 0 without loading any page. A failed
# attempt is retried after a random pause of 15-25 seconds until it succeeds.

SHARED_DATA_JS = "return JSON.stringify(window._sharedData)"

for i in geotag[start_from:]:
    while True:
        try:
            if str(i) == '0':
                lat = lng = 0
            else:
                driver.get(str(i))
                shared_data = json.loads(driver.execute_script(SHARED_DATA_JS))
                location_info = shared_data['entry_data']['LocationsPage'][0]['native_location_data']['location_info']
                lat = location_info['lat']
                lng = location_info['lng']
            break
        except Exception:
            # `except Exception` (not a bare `except`) lets "Interrupt kernel"
            # stop the retries; a bare `except` would swallow KeyboardInterrupt
            # and keep retrying.
            time.sleep(random.randrange(1500, 2500) / 100)

    coordinates.append([i, lat, lng])
    # Random pause of 8-15 seconds between two locations
    time.sleep(random.randrange(800, 1500) / 100)

## Save the coordinates as a table

In [None]:
# Build a pandas table from the scraped coordinates:
# one row per [location, lat, lng] entry of `coordinates`

export_table = pd.DataFrame(coordinates)

In [None]:
# Name the three columns; "Geotag" matches the column of the same name in the
# posts table `df`, which the later merge joins on

export_table.columns = ["Geotag", "lat", "lng"]

## Join the location table with the posts table

In [None]:
# Join the coordinates to the posts on the location link.
# The "no location" placeholder is the integer 0 when it comes straight from
# the scraping loop, but the string '0' once the locations have been reloaded
# from lst_locations4.txt. Both Geotag columns are converted to str so the
# placeholder matches on either path; otherwise posts without a location
# would be dropped from the merge without any warning.
merged_table = pd.merge(
    export_table.assign(Geotag=export_table['Geotag'].astype(str)),
    df.assign(Geotag=df['Geotag'].astype(str)),
    on='Geotag',
)
merged_table

## Count the number of posts by location

In [None]:
# Count the posts of every distinct (Geotag, lat, lng) combination
location_keys = ["Geotag", "lat", "lng"]
posts_per_location = merged_table.groupby(location_keys).size()
count_table = posts_per_location.reset_index(name='counts')
count_table