# Web Scraping MIDI

In [83]:
import urllib
import shutil
import re
from pathlib import Path
from bs4 import BeautifulSoup
import requests
import glob, time

We define a function that returns all anchors on a website according to its attributes:

In [16]:
def get_elements( url, tag, attrs ):
    page = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page, 'html.parser')
    return soup.findAll(tag, attrs)

## Carlo's MIDI

The first midi website to try is: https://www.cprato.com/en/midi/all

In [11]:
target_page = 'https://www.cprato.com/en/midi/all'

On the website we have a table with a view of 50 song links. To get all the songs we have to go inside every link and go through the total of 8 views with 50 songs: 

<img src="notebooks_images\Carlos_MIDI_Table.png">

We get the links for the numbers:

In [12]:
get_elements(target_page, tag='a', attrs={'class': 'page-link'})[:3]

[<a class="page-link" href="https://www.cprato.com/en/midi/all?page=2">2</a>,
 <a class="page-link" href="https://www.cprato.com/en/midi/all?page=3">3</a>,
 <a class="page-link" href="https://www.cprato.com/en/midi/all?page=4">4</a>]

We get the link to each song:

In [13]:
get_elements(target_page, tag='a', attrs={'href': re.compile("^/en/midi/details")})[:3]

[<a href="/en/midi/details/267/3lau-feat-bright-lights-how-you-love-me" style="font-weight:700;">How You Love Me </a>,
 <a href="/en/midi/details/137/above-beyond-tri-state" style="font-weight:700;">Tri-State (Original Mix)</a>,
 <a href="/en/midi/details/247/above-beyond-were-all-we-need" style="font-weight:700;">We're All We Need (Original Mix)</a>]

Inside each song we find this web page: 

<img src="notebooks_images\Carlos_MIDI_Song_Detail.png">

In this webpage we have to get the element to make the download:

In [14]:
detail_page = 'https://www.cprato.com/en/midi/details/267/3lau-feat-bright-lights-how-you-love-me'

In [36]:
target_free_download = get_elements(detail_page, tag='a', attrs={'href': re.compile("^/en/midi/download")})[0]
target_free_download

<a class="btn btn-success px-3 py-3" href="/en/midi/download/267/3lau-feat-bright-lights-how-you-love-me/My40ODA1MDE4NDMzMDMxRSsxNg==" style="display:block;"><i class="fas fa-download pr-1"></i>
        Free Download
        </a>

Once we have the target free download we get the href value:

In [37]:
target_free_download['href']

'/en/midi/download/267/3lau-feat-bright-lights-how-you-love-me/My40ODA1MDE4NDMzMDMxRSsxNg=='

We get the download:

In [38]:
first_url_part = 'https://www.cprato.com'

In [39]:
download_url = first_url_part + target_free_download['href']
download_url

'https://www.cprato.com/en/midi/download/267/3lau-feat-bright-lights-how-you-love-me/My40ODA1MDE4NDMzMDMxRSsxNg=='

## Selenium

In [93]:
from selenium import webdriver
from selenium.webdriver import ActionChains

In [34]:
wd = webdriver.Chrome('chromedriver.exe')

In [40]:
wd.get(download_url)

# Automating the process

In [45]:
target_page = 'https://www.cprato.com/en/midi/all'

first_url_part = 'https://www.cprato.com'

First we get a list of all the songs download pages:

In [159]:
# We get all the views of the table
table_view_links = get_elements(target_page, tag='a', attrs={'class': 'page-link'})

# We get all the urls of the table views
table_view_urls = [target_page] + [link['href'] for link in table_view_links]
table_view_urls= table_view_urls[:8]

# We iterate trough the table views
songs_urls = []
for table_view_url in table_view_urls:
    songs_links = get_elements(table_view_url, tag='a', attrs={'href': re.compile("^/en/midi/details")})
    songs_urls = songs_urls + [first_url_part + link['href'] for link in songs_links]
        

In [160]:
len(songs_urls)

391

We use selenium to get all the downloads:

In [146]:
def download_file(wd, free_download_button, download_folder_path, max_seconds_download):
    number_files_download_folder = len(glob.glob(download_folder_path+r"\*.mid"))
    counter = 0
    #ActionChains(wd).click(free_download_button).perform()
    free_download_button.click()
    while True:
        current_number_files_download_folder = len(glob.glob(download_folder_path+r"\*.mid"))
        if current_number_files_download_folder > number_files_download_folder:
            number_files_download_folder = current_number_files_download_folder
            break
        time.sleep(1)
        counter+= 1
        if counter >= max_seconds_download:
            return False
    return True

In [148]:
download_folder_path = r"C:\Users\jvela\Downloads"
max_seconds_download = 30
max_retries = 3

# We initialize chrome web driver
wd = webdriver.Chrome(executable_path = 'chromedriver.exe')
wd.maximize_window()


for song_url in songs_urls[288:]:   
    retries = 0
    wd.get(song_url)
    while retries<max_retries:    
        try:
            free_download_button = wd.find_element_by_link_text('FREE DOWNLOAD')
        except:
            success = False
            break
        success = download_file(wd, free_download_button, download_folder_path, max_seconds_download)
        if success:
            break
        else:
            wd.close()
            wd = webdriver.Chrome(executable_path = 'chromedriver.exe')
            wd.get(song_url)
        retries +=1
    
    if not success:
        print("Could not download file: " + download_url) 

wd.close()

Could not download file: https://www.cprato.com/en/midi/download/247/above-beyond-were-all-we-need/My4yMTk3OTk1Nzk2MDY1RSsxNg==
Could not download file: https://www.cprato.com/en/midi/download/247/above-beyond-were-all-we-need/My4yMTk3OTk1Nzk2MDY1RSsxNg==
Could not download file: https://www.cprato.com/en/midi/download/247/above-beyond-were-all-we-need/My4yMTk3OTk1Nzk2MDY1RSsxNg==
Could not download file: https://www.cprato.com/en/midi/download/247/above-beyond-were-all-we-need/My4yMTk3OTk1Nzk2MDY1RSsxNg==
Could not download file: https://www.cprato.com/en/midi/download/247/above-beyond-were-all-we-need/My4yMTk3OTk1Nzk2MDY1RSsxNg==
Could not download file: https://www.cprato.com/en/midi/download/247/above-beyond-were-all-we-need/My4yMTk3OTk1Nzk2MDY1RSsxNg==
Could not download file: https://www.cprato.com/en/midi/download/247/above-beyond-were-all-we-need/My4yMTk3OTk1Nzk2MDY1RSsxNg==
Could not download file: https://www.cprato.com/en/midi/download/247/above-beyond-were-all-we-need/My4yM

KeyboardInterrupt: 

In [None]:
https://www.cprato.com/en/midi/download/137/above-beyond-tri-state/MS43ODU4ODA1Nzc0Mjg5RSsxNg==

In [72]:
 glob.glob(r"C:\Users\jvela\OneDrive\Escritorio\*.pdf")

['C:\\Users\\jvela\\OneDrive\\Escritorio\\Assess-Pkt-1-Burns-Depression-Checklist.pdf',
 'C:\\Users\\jvela\\OneDrive\\Escritorio\\FoamRollerES.pdf',
 'C:\\Users\\jvela\\OneDrive\\Escritorio\\Pullup-Dip-Best-exercises-training-guide_EN.pdf']