### SOURCES: 
When the Entire Page Has Infinite Scroll
- https://michaeljsanders.com/2017/05/12/scrapin-and-scrollin.html

General Web Scraping Tutorials
- https://github.com/pwikstrom/build-a-bot
- https://medium.com/@srujana.rao2/scraping-instagram-with-python-using-selenium-and-beautiful-soup-8b72c186a058

Remove duplicates from list
- https://thispointer.com/python-how-to-remove-duplicates-from-a-list/

Get the JSON from the HTML
- https://python-forum.io/Thread-ReGex-With-Python?page=3

### Basic tips about Jupyter Notebook

SOURCE: https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/

- **Esc** will take you into command mode where you can navigate around your notebook with arrow keys.

- In command mode:
    - **A** to insert a new cell above the current cell, **B** to insert a new cell below.
    - **M** to change the current cell to Markdown, **Y** to change it back to code
    - **D + D** (press the key twice) to delete the current cell
    
- **Enter** will take you from command mode back into edit mode for the given cell.
- You can also use **Shift + M** to merge multiple cells.



### Import packages

In [98]:
# Import python modules
import bs4        # BeautifulSoup4 is a Python package for parsing HTML and XML documents
import time       # We need to wait 3 seconds every time that we scroll in instagram
import requests   # It allows you to send HTTP requests in Python
import re
import json 
from datetime import datetime
import pandas as pd

from selenium import webdriver   # A collection of language specific bindings to drive a browser 
from webdriver_manager.chrome import ChromeDriverManager   # allows to automate the management of the binary drivers 
                                                           # (e.g. chromedriver, geckodriver...) required by Selenium WebDriver.

### Initialise global variables

In [62]:
# Hashtag whose posts will be scraped (written without the "#")
hashtag = "laveganesa"

# HTTP headers sent with every request: the bot presents itself as Chrome
hdrs = {
    "User-Agent": "Chrome/78.0",
}

# Column labels of the DataFrame built from the scraped posts
colnames = [
    "likes_post",
    "datetime",
]

### Function definitions

#### Removing duplicates from a list

In [3]:
# Let's remove duplicates (keeping the unique elements in the same order as in the original list)
def removeDuplicates(list_elements):
    """Return a new list holding each distinct element of *list_elements* once.

    The first occurrence of every element is kept and the relative order of
    those first occurrences is preserved. Membership is checked with ``in``
    (equality), so unhashable elements are accepted. The input is not modified.
    """
    deduped = []

    for candidate in list_elements:
        # Anything already collected is a repeat: ignore it
        if candidate in deduped:
            continue
        deduped += [candidate]

    return deduped

#### Scroll and store href of every post

In [4]:
# Selenium script to scroll to the bottom. We need to wait 3 seconds for the next batch of data to load, then continue scrolling
# It will continue to do this until the page stops loading new data.
# Meanwhile, we'll store in a list all the href from every post

def getHrefInstagram(hashtag):
    """Collect the URL of every post shown on an Instagram hashtag page.

    The page is opened in Chrome and scrolled to the bottom repeatedly,
    waiting 3 seconds after each scroll, until the page height stops
    changing. After every scroll the post links currently present in the
    page source are gathered.

    Parameters:
        hashtag: tag name appended to ``/explore/tags/`` in the URL.

    Returns:
        A list of absolute post URLs without duplicates, in order of first
        appearance.
    """
    # Scrolls to the bottom of the page and reports the resulting page height
    scroll_script = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"

    # Initialise browser
    browser = webdriver.Chrome(ChromeDriverManager().install())

    list_href = []

    # try/finally guarantees the browser (and its driver process) is shut
    # down even if loading or parsing fails half-way through
    try:
        browser.get('https://www.instagram.com/explore/tags/'+hashtag)

        # Scrolling and storing process
        length_page = browser.execute_script(scroll_script)

        match = False
        while not match:
            last_count = length_page
            time.sleep(3)
            length_page = browser.execute_script(scroll_script)

            # An unchanged height means no new data was loaded: this is the
            # last pass, but the links are still harvested once more below
            match = last_count == length_page

            # Transform the current source code to soup using html.parser
            soup = bs4.BeautifulSoup(browser.page_source, "html.parser")

            # Every post is wrapped in a div-tag of class "v1Nh3 kIKUG _bz0w"
            links_posts = soup.find_all("div", class_=["v1Nh3 kIKUG _bz0w"])

            # Extract the href from every post
            for post in links_posts:
                ind_link = post.find("a")
                if ind_link is None:
                    # Post container without a link: nothing to store
                    continue

                href = ind_link.get("href")
                if href is None:
                    # Anchor without an href attribute: nothing to store
                    continue

                list_href.append("https://www.instagram.com" + href)
    finally:
        browser.quit()

    # The same posts are seen again after every scroll, so remove the duplicates
    return removeDuplicates(list_href)

#### Getting useful information 

In [103]:
def infoEveryPost(final_list_href):
    """Download every post page and tabulate its likes and publication time.

    For each URL the page is requested with the global ``hdrs`` headers, the
    JSON assigned to ``window._sharedData`` is extracted from the HTML, and
    the like count and ``taken_at_timestamp`` of the post are read from it.

    Parameters:
        final_list_href: iterable of post URLs.

    Returns:
        A pandas DataFrame with one row per post and the columns given by the
        global ``colnames`` (likes, then the timestamp formatted as
        ``"%Y-%m-%d %H:%M:%S%z"``).

    Raises:
        IndexError: if a page does not contain the ``window._sharedData``
            script.
        KeyError: if the embedded JSON lacks one of the expected keys.
    """
    # Imported locally under an alias so the function does not depend on the
    # global name ``datetime``, which notebook cells may have rebound
    from datetime import datetime as _datetime

    # Compiled once: the same pattern is applied to every page
    shared_data_re = re.compile(r'<script type="text/javascript">window._sharedData = (.*);</script>')

    rows = []

    for post in final_list_href:

        # A. GETTING JSON

        # A.1. Call the url. The timeout stops one stalled connection from
        # blocking the whole run forever
        response_url = requests.get(post, headers=hdrs, timeout=30)

        # A.2. Get the JSON from the page. There we'll find the graphql with the data we want
        source = response_url.text
        data_json = json.loads(shared_data_re.findall(source)[0])   # python dictionary

        # A.3. Go to the section we're interested in. ['PostPage'] holds a list,
        # therefore [0] is used to get its first element and continue navigating.
        data = data_json['entry_data']['PostPage'][0]['graphql']['shortcode_media']

        # B. GETTING INTERESTING INFORMATION FROM THE JSON

        # B.1. Likes
        likes = data['edge_media_preview_like']['count']

        # B.2. Datetime. fromtimestamp() without a tz yields a naive local-time
        # value, so %z renders as an empty string
        taken_at = _datetime.fromtimestamp(data['taken_at_timestamp'])
        taken_at_text = taken_at.strftime("%Y-%m-%d %H:%M:%S%z")

        # C. ADD EVERYTHING TO THE LIST
        rows.append([likes, taken_at_text])

    return pd.DataFrame(rows, columns=colnames)

### The script

In [7]:
# Open the hashtag page in the browser, scroll through it and
# get the href of every post
href_everylink = getHrefInstagram(hashtag)


Looking for [chromedriver 78.0.3904.70 win32] driver in cache 
File found in cache by path [C:\Users\saram\.wdm\drivers\chromedriver\78.0.3904.70\win32\chromedriver.exe]


In [104]:
# Request every collected post and build the DataFrame with its likes and datetime
total_info = infoEveryPost(href_everylink)

In [105]:
# Display the resulting DataFrame
total_info

Unnamed: 0,likes_post,datetime
0,52,2019-10-18 21:29:50
1,199,2019-03-03 21:56:11
2,283,2019-03-09 13:00:10
3,261,2017-12-28 17:28:35
4,105,2016-07-26 22:55:25
5,192,2017-07-11 15:08:57
6,73,2019-09-15 12:04:37
7,88,2019-09-17 16:01:19
8,83,2019-10-29 11:44:56
9,40,2019-11-15 14:01:57


### Testing

In [67]:
# Take the first collected post URL; the cells below test the extraction steps on it
first_post = href_everylink[0]
first_post

'https://www.instagram.com/p/B3xXXFehcoV/'

In [68]:
# 1. Call the url
response_url = requests.get(first_post, headers=hdrs) 

# 2. Get the JSON from the page. There we'll find the graphql with the data we want
source = response_url.text
data_json = re.findall(r'<script type="text/javascript">window._sharedData = (.*);</script>', source)[0]
data_json = json.loads(data_json)     # it gives back a python dictionary.

# 3. Let's go to the section we're interested in. ['PostPage'] holds a list (it has []),
# therefore we use [0] to get its first element and continue navigating.
data = data_json['entry_data']['PostPage'][0]['graphql']['shortcode_media']   

# 4. Let's get interesting information!! Here: the timestamp of the post
dt =data['taken_at_timestamp']

In [93]:
# Convert the timestamp of the post into a readable string.
# Separate variable names are used so that the imported ``datetime`` class is
# not overwritten and stays usable in the cells that run afterwards.
from datetime import datetime
post_datetime = datetime.fromtimestamp(dt)
post_datetime_text = post_datetime.strftime("%Y-%m-%d %H:%M:%S%z")
post_datetime_text

'2019-10-18 21:29:50'

In [74]:
# Display the raw timestamp read from the post's JSON
dt


1571426990