### SOURCES: 
When the Entire Page Has Infinite Scroll
- https://michaeljsanders.com/2017/05/12/scrapin-and-scrollin.html

General Web Scraping Tutorials
- https://github.com/pwikstrom/build-a-bot
- https://medium.com/@srujana.rao2/scraping-instagram-with-python-using-selenium-and-beautiful-soup-8b72c186a058

Remove duplicates from list
- https://thispointer.com/python-how-to-remove-duplicates-from-a-list/

Get the JSON from the HTML
- https://python-forum.io/Thread-ReGex-With-Python?page=3

### Basic tips about jupyter notebook

SOURCE: https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/

- **Esc** will take you into command mode where you can navigate around your notebook with arrow keys.

- In command mode:
    - **A** to insert a new cell above the current cell, **B** to insert a new cell below.
    - **M** to change the current cell to Markdown, **Y** to change it back to code
    - **D + D** (press the key twice) to delete the current cell
    
- **Enter** will take you from command mode back into edit mode for the given cell.
- You can also use **Shift + M** to merge multiple cells.



### Import packages

In [1]:
# Import python modules
import bs4        # BeautifulSoup4 is a Python package for parsing HTML and XML documents
import time       # We need to wait 3 seconds every time that we scroll in instagram
import requests   # It allows you to send HTTP requests in Python
import re
import json 
from datetime import datetime
import pandas as pd
import mysql.connector

from selenium import webdriver   # A collection of language specific bindings to drive a browser 
from webdriver_manager.chrome import ChromeDriverManager   # allows to automate the management of the binary drivers 
                                                           # (e.g. chromedriver, geckodriver...) required by Selenium WebDriver.

### Initialise global variables

In [3]:
# Hashtag/s to scrape (passed to getHrefInstagram in the script section)
hashtag='laveganesa'

# Request headers: the bot pretends to be a Chrome browser
hdrs = {"User-Agent": "Chrome/78.0"}

# Columns labels
# NOTE(review): these orders differ from the INSERT statements in infoEveryPost,
# which use (username, name) and (likes, datetime) -- confirm which is intended.
db_user_names = ["name", "username"]
db_post_names = ["datetime", "likes"]

# Import credentials. infoEveryPost reads the globals host, dbname, user and
# password; presumably this notebook defines them -- verify.
%run credentials2.ipynb

### Function definitions

#### Removing duplicates from a list

In [4]:
# Let's remove duplicates (keeping the order of unique elements as it was in the original list)
def removeDuplicates(list_elements):
    """Return the distinct elements of ``list_elements`` in first-seen order.

    Args:
        list_elements: Iterable of elements to de-duplicate. It is not
            modified.

    Returns:
        A new list holding each distinct element once, ordered by its first
        occurrence in the input.
    """
    # Materialise once so that a one-shot iterator survives the fallback.
    elements = list(list_elements)

    try:
        # Dicts preserve insertion order, which gives an order-keeping
        # de-duplication in linear time instead of a scan per element.
        return list(dict.fromkeys(elements))
    except TypeError:
        # Unhashable elements (e.g. nested lists) cannot be dict keys:
        # fall back to the equality-based scan.
        unique_list = []
        for elem in elements:
            if elem not in unique_list:
                unique_list.append(elem)
        return unique_list

#### Remove emojis from a string

In [5]:
# Let's strip the emojis (and every other non-ASCII character) from a string
def deEmojify(inputString):
    """Return ``inputString`` with every non-ASCII character removed.

    Emojis are dropped (not translated), and so is any other character
    outside the ASCII range, such as accented letters or the (R) sign.
    """
    # Encoding with errors="ignore" silently discards whatever ASCII cannot
    # represent; decoding turns the remaining bytes back into a str.
    ascii_bytes = inputString.encode("ascii", errors="ignore")
    return ascii_bytes.decode("ascii")

#### List to tuple and remove duplicates

In [6]:
def listToTuple(data):
    """Convert every row of ``data`` to a tuple and drop duplicate rows.

    Args:
        data: Iterable of rows, each row being an iterable of hashable
            values. It is left untouched (rows are no longer converted in
            place).

    Returns:
        A new list of unique tuples, ordered by first occurrence.
    """
    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # result is reproducible; going through a set gave an arbitrary order.
    return list(dict.fromkeys(tuple(row) for row in data))

#### Scroll and store href of every post

In [7]:
# Selenium script to scroll to the bottom. We need to wait 3 seconds for the next batch of data to load, then continue scrolling.
# It will continue to do this until the page stops loading new data.
# Meanwhile, we'll store in a list the href of every post.

def getHrefInstagram(hashtag, scroll_pause=3):
    """Collect the URL of every post shown on an Instagram hashtag page.

    A Chrome window is opened on the hashtag page and scrolled to the bottom
    repeatedly. After each scroll the page source is parsed and the link of
    every post tile found is recorded. Scrolling stops once the page height
    no longer changes between two consecutive scrolls.

    Args:
        hashtag: Hashtag to explore, without the leading ``#``.
        scroll_pause: Seconds to wait after each scroll so the next batch of
            posts can load. Defaults to 3.

    Returns:
        List of unique post URLs, in the order they were first seen.
    """
    # Scrolls to the bottom and reports the resulting page height.
    scroll_script = ("window.scrollTo(0, document.body.scrollHeight);"
                     "var lenOfPage=document.body.scrollHeight;return lenOfPage;")

    # Initialise browser
    browser = webdriver.Chrome(ChromeDriverManager().install())

    list_href = []
    try:
        browser.get('https://www.instagram.com/explore/tags/'+hashtag)

        # Scrolling and storing process
        length_page = browser.execute_script(scroll_script)

        reached_bottom = False
        while not reached_bottom:
            last_count = length_page
            time.sleep(scroll_pause)
            length_page = browser.execute_script(scroll_script)

            # An unchanged height means no new data was loaded; the page is
            # still parsed one last time below before the loop ends.
            reached_bottom = (last_count == length_page)

            # Grab the source code and transform it to soup (html.parser)
            soup = bs4.BeautifulSoup(browser.page_source, "html.parser")

            # Find all div-tags of class "v1Nh3 kIKUG _bz0w"
            links_posts = soup.find_all("div", class_=["v1Nh3 kIKUG _bz0w"])

            # Extract the href from every post
            for post in links_posts:
                ind_link = post.find("a")
                href = ind_link.get("href") if ind_link is not None else None
                if not href:
                    # Tile without a usable link: skip it instead of crashing.
                    continue
                list_href.append("https://www.instagram.com" + href)
    finally:
        # Always shut the browser down, even if scraping fails midway.
        browser.quit()

    # The same posts are seen again after every scroll: remove the duplicates
    return removeDuplicates(list_href)

#### Get and store useful information 

In [8]:
def infoEveryPost(final_list_href):
    """Scrape owner and post details for each URL and store them in MySQL.

    For every post URL the page is downloaded, the ``window._sharedData``
    JSON embedded in the HTML is parsed, and the owner's username and full
    name plus the like count and creation timestamp are extracted. The rows
    are de-duplicated and inserted into the ``user`` and ``post`` tables,
    then committed together.

    Relies on the notebook globals ``hdrs`` (request headers) and ``host``,
    ``dbname``, ``user``, ``password`` (MySQL credentials).

    Args:
        final_list_href: Iterable of Instagram post URLs.

    Returns:
        None. A confirmation message is printed once the data is committed.

    Raises:
        IndexError: If a page does not contain the ``window._sharedData``
            script.
        KeyError: If the JSON lacks one of the expected keys.
        mysql.connector.Error: If an insert fails, e.g. an IntegrityError
            for a username already present in ``user``. The commit is then
            skipped and the connection is still closed.
    """
    # Compiled once, reused for every post.
    shared_data_re = re.compile(
        r'<script type="text/javascript">window._sharedData = (.*);</script>')

    db_user = []
    db_post = []

    for post in final_list_href:

        # A. GETTING JSON

        # A.1. Call the url
        response_url = requests.get(post, headers=hdrs)

        # A.2. Get the JSON from the url. There we'll find the graphql with
        # the data we want. json.loads gives back a python dictionary.
        data_json = json.loads(shared_data_re.findall(response_url.text)[0])

        # A.3. Go to the section we're interested in. ['PostPage'] holds a
        # list, hence the [0] before navigating further.
        data = data_json['entry_data']['PostPage'][0]['graphql']['shortcode_media']

        # B. GETTING INTERESTING INFORMATION FROM THE JSON

        # B.1. user database: (username, name), same order as sql_user
        owner = data['owner']
        username = owner['username']
        name = deEmojify(owner['full_name'])
        db_user.append([username, name])

        # B.2. post database: (likes, datetime), same order as sql_post
        likes = data['edge_media_preview_like']['count']
        # Stored exactly as delivered, without conversion.
        # NOTE(review): confirm the `datetime` column accepts this raw value.
        taken_at = data['taken_at_timestamp']
        db_post.append([likes, taken_at])

    # C. TIDY UP THE DATA (rows become tuples, duplicate rows are dropped)
    db_user = listToTuple(db_user)
    db_post = listToTuple(db_post)

    # D. STORE DATA IN MYSQL DATABASE
    sql_post = "INSERT INTO post(likes, datetime) VALUES (%s, %s)"
    sql_user = "INSERT INTO user(username, name) VALUES (%s, %s)"

    connection = mysql.connector.connect(host=host,
                                         database=dbname,
                                         user=user,
                                         password=password)
    try:
        mycursor = connection.cursor()

        mycursor.executemany(sql_user, db_user)
        mycursor.executemany(sql_post, db_post)

        connection.commit()
    finally:
        # The connection is local to this function, so it must be released
        # here; a failed insert would otherwise leave it open.
        connection.close()

    print("Success. Data stored!")

### The script

In [9]:
# Get the href of every post listed under the chosen hashtag
# (getHrefInstagram drives a Chrome window through Selenium)
href_everylink = getHrefInstagram(hashtag)


Looking for [chromedriver 78.0.3904.70 win32] driver in cache 
File found in cache by path [C:\Users\saram\.wdm\drivers\chromedriver\78.0.3904.70\win32\chromedriver.exe]


In [10]:
# Keep only the first 10 links and display them
href_everylink2= href_everylink[0:10]
href_everylink2

['https://www.instagram.com/p/B42m41iIX4h/',
 'https://www.instagram.com/p/BdQKw9Pnqxr/',
 'https://www.instagram.com/p/Buj3NJIFzTa/',
 'https://www.instagram.com/p/BWaEyDSgZuq/',
 'https://www.instagram.com/p/BIVr8TdDn_A/',
 'https://www.instagram.com/p/B2bYcbMCePz/',
 'https://www.instagram.com/p/B2g9HwhCULe/',
 'https://www.instagram.com/p/BuyWogMlAPR/',
 'https://www.instagram.com/p/B4MwCMfISRp/',
 'https://www.instagram.com/p/B47v4IZoBUl/']

In [12]:
# Get information from every post and store it in the database
# (infoEveryPost has no return statement, so `a` is None)
a=infoEveryPost(href_everylink2)
a

IntegrityError: 1062 (23000): Duplicate entry 'fran.godoy.r' for key 'username_UNIQUE'

In [15]:
# NOTE: the connection opened by infoEveryPost is a local variable of that
# function, so this only works if another cell created a module-level `connection`.
connection.close()

NameError: name 'connection' is not defined

### Testing

In [101]:
# Hand-written sample rows for manual testing (apparently (username, full name) pairs)
test = [('distritovegano', '[DISTRITO VEGANO]®'),
 ('laveganesa', '🌱 La Veganesa 🌱'),
 ('j.a.romero2', 'POP ART KITSCH RETRO VINTAGE'),
 ('yas_says_yes', 'Yasmeen Iman A. | Yas'),
 ('fran.godoy.r', 'Fran Godoy'),
 ('producciones_leont', 'Leont Torres'),
 ('scer.sarita', 'Sara Espejo'),
 ('freshpotatofactory', 'Fresh Potato Factory'),]

# NOTE(review): `total_info` is not defined in the visible part of this notebook -- confirm where it comes from
test2= total_info[0]

In [None]:
# Display total_info
total_info

In [15]:
# Insert the rows held in total_info[0] into the `user` table.
connection = mysql.connector.connect(host=host,
                                database=dbname,
                                user= user,
                                password=password)
try:
    mycursor = connection.cursor()

    sql_user = "INSERT INTO user(username, name) VALUES (%s, %s)"
    mycursor.executemany(sql_user, total_info[0])

    connection.commit()
finally:
    # Always release the connection, even when the insert raises.
    connection.close()

In [11]:
# Smoke test: insert two hard-coded rows into the `user` table.
connection = mysql.connector.connect(host=host,
                                database=dbname,
                                user= user,
                                password=password)
try:
    mycursor = connection.cursor()

    values= [('tomate', "prueba"), ('ajo', "test"),]
    sql = "INSERT INTO user(username, name) VALUES (%s, %s)"
    mycursor.executemany(sql, values)

    connection.commit()

    # Report while the cursor's connection is still open.
    print(mycursor.rowcount, "record inserted.")
finally:
    # Always release the connection, even when the insert raises.
    connection.close()

2 record inserted.


In [221]:
     # E. STORE DATA IN MYSQL DATABASE
    # NOTE(review): this cell looks like section E of infoEveryPost run on its own.
    # It expects a module-level `db_user` and neither commits nor closes the
    # connection itself.
    connection = mysql.connector.connect(host=host,
                                database=dbname,
                                user= user,
                                password=password)
    
    mycursor = connection.cursor()
    
    sql_user = "INSERT INTO user(username, name) VALUES (%s, %s)"
#   sql_post = "INSERT INTO post(datetime, likes) VALUES (%s, %s)"

    # Only the user rows are inserted here; the post insert is disabled.
    mycursor.executemany(sql_user, db_user)
#    mycursor.executemany(sql_post, db_post)

In [113]:
# Close the current module-level `connection`
connection.close()


In [27]:
# Pick a single post URL for manual inspection
# NOTE: despite the name, index 1 selects the second link of href_everylink
first_post = href_everylink[1]
first_post

'https://www.instagram.com/p/Buj3NJIFzTa/'

In [28]:
# 1. Call the url
response_url = requests.get(first_post, headers=hdrs) 

# 2. Get the JSON from the url. There we'll find the graphql with the data we want
source = response_url.text
data_json = re.findall(r'<script type="text/javascript">window._sharedData = (.*);</script>', source)[0]
data_json = json.loads(data_json)     # it gives back a python dictionary.

# 3. Let's go to the section we're interested in. ['PostPage'] holds a list (it has []),
# therefore we use [0] to get the content inside this list and continue navigating.
data = data_json['entry_data']['PostPage'][0]['graphql']['shortcode_media']   

# 4. Let's get interesting information!!
# (the timestamp is passed to datetime.fromtimestamp in the next cell)
dt =data['taken_at_timestamp']

In [12]:
# Convert the post timestamp into a formatted string.
# NOTE: the name `datetime` is rebound twice below (class -> datetime object -> str),
# so the class is no longer reachable under that name until it is imported again.
from datetime import datetime
datetime = datetime.fromtimestamp(dt)
# fromtimestamp without a tz argument returns a naive datetime, for which %z renders as an empty string
datetime = datetime.strftime("%Y-%m-%d %H:%M:%S%z")
datetime

'2019-03-03 21:56:11'

In [32]:
# Check the Python type of the like count
type(data['edge_media_preview_like']['count'])

int

In [29]:
# Display the whole shortcode_media dictionary of the inspected post
data

{'__typename': 'GraphImage',
 'id': '1991678255938614490',
 'shortcode': 'Buj3NJIFzTa',
 'dimensions': {'height': 1080, 'width': 1080},
 'gating_info': None,
 'fact_check_overall_rating': None,
 'fact_check_information': None,
 'media_preview': 'ACoqw7ZFcnfjAXPJ285Hf/PrVvyYCG+78pIHznnAzkeue3SqlsqM/wC9yUwc44x7/wCeK1ILNkjMioG77nA6ewPH44OaTdhIrpBC4B+Vc46v69vqAOfr+AYYYt4XgDaTy/GQenTOccdO+ccVd/tAsCrsY9uNpC8fjikkuAFxKUl54+U9x6nkfgQeanm8itO5RngiVCU25GP4sk884H+eKoVoXFltXzY8lepHXb759PryO9Z9aJ3E9DViSODMh6qPlB559fwqWC4d33NvZerEZ6e3tUl2FEe0YXkH/wDXVCOV4jlNw+h//Xms9zVO3RGo0ts6lACgb7zfeYj056ZqJbS3wQkmQecEYJ9B3/lVc6jv/wBciyD3+VvwYY/UVZgiW8P7pBGAO5Y9+5qXcr3HvzL5p/5EcUDMHCk8ggY9enfjGKp/2fJ6H8v/AK9dUsOyMRqAAOuM9e/vUGRTTsZ2T9OhXIST73NRmxiP3fl+hNUoiasqT60xj1sFU5DHj1wf6VdsYPJY8k7h+A+npVIMfWp1JpAaaJsyT+lUagdj61V3H1NIEf/Z',
 'display_url': 'https://scontent-mrs2-1.cdninstagram.com/vp/ef0053eee86bd2fa5c11484c78e9352a/5E877C55/t51.2885-15/e35/52947444_2071422939620367_306876959366657938_n.jpg?_nc_ht=scontent-m