In [1]:
import requests
import os
import requests
from bs4 import BeautifulSoup
import asyncio
import aiohttp

In [None]:
async def fetch_url(session, url):
    """Request ``url`` through ``session`` and return the response body as text."""
    async with session.get(url) as resp:
        body = await resp.text()
    return body

async def extract_hrefs(session, page):
    """Collect the song-page links listed on one page of the jazz search results.

    Args:
        session: Session object handed to ``fetch_url`` to perform the request.
        page: Results page number. Page 1 is the bare search URL; every other
            page uses the ``/pg-{page}`` suffix.

    Returns:
        A list with the ``href`` of the first anchor inside every table cell,
        skipping cells without an anchor, anchors without an ``href``
        attribute, and the ``javascript:;`` placeholder links.
    """
    base_url = "https://www.midis101.com/search/jazz"
    url = base_url if page == 1 else f"{base_url}/pg-{page}"

    html_content = await fetch_url(session, url)
    soup = BeautifulSoup(html_content, "html.parser")

    hrefs = []
    for td in soup.find_all("td"):
        # Look the anchor up once per cell instead of three times.
        anchor = td.find("a")
        if anchor is None:
            continue
        # .get() returns None instead of raising KeyError when the anchor
        # carries no href attribute.
        href = anchor.get("href")
        if href is not None and href != 'javascript:;':
            hrefs.append(href)

    return hrefs

async def download_hrefs(url, queue):
    """Scrape one song page and enqueue every download link found on it.

    Args:
        url: Site-relative path of the song page; the site root is prepended.
        queue: Queue that receives each extracted link via ``await queue.put``.
    """
    url = "https://www.midis101.com" + url
    # The session is only needed for the request itself, so release it before
    # parsing and queueing.
    async with aiohttp.ClientSession() as session:
        html_content = await fetch_url(session, url)

    soup = BeautifulSoup(html_content, "html.parser")

    # href=True restricts the match to elements that actually have an href,
    # so the subscript below cannot raise KeyError.
    elements = soup.find_all(class_="btn btn-primary btn-lg btn-block p-4", href=True)

    for element in elements:
        href = element["href"]
        print(f"Extracted {href}")
        await queue.put(href)

async def download_midi(url, session):
    """Download one MIDI file into ``midi_data/`` unless it is already there.

    Args:
        url: Download link, either site-relative (``/download/...``) or
            already absolute.
        session: Client session whose ``get`` performs the request.
    """
    # Callers may pass either form. Prepending the site root to an already
    # absolute URL would duplicate the host and produce a malformed URL.
    if not url.startswith(("http://", "https://")):
        url = "https://www.midis101.com" + url

    # The last path segment doubles as the local file name.
    filename = url.split("/")[-1]
    target = f"midi_data/{filename}"

    # Skip files that a previous run already saved.
    if os.path.exists(target):
        print(f"Skipping {filename} (already exists)")
        return

    async with session.get(url) as response:
        with open(target, "wb") as file:
            # Stream the body to disk chunk by chunk.
            async for chunk in response.content.iter_any():
                file.write(chunk)

        print(f"Downloaded {filename}")

async def main():
    """Scrape the jazz search results and download every linked MIDI file.

    Steps: collect song-page links from result pages 1 to 21, visit each song
    page to collect its download links, then fetch all files into
    ``midi_data/``.
    """
    # Fetch and extract hrefs asynchronously for pages 1 to 21
    async with aiohttp.ClientSession() as session:
        tasks = [extract_hrefs(session, page) for page in range(1, 22)]
        all_hrefs_per_page = await asyncio.gather(*tasks)
    all_hrefs = [href for hrefs in all_hrefs_per_page for href in hrefs]

    # Create the "midi_data" folder if it doesn't exist
    os.makedirs("midi_data", exist_ok=True)

    # Queue that the song-page scrapers fill with download links
    href_queue = asyncio.Queue()
    tasks = [download_hrefs(url, href_queue) for url in all_hrefs]
    await asyncio.gather(*tasks)

    async with aiohttp.ClientSession() as session:
        download_tasks = []
        while not href_queue.empty():
            # All producers have finished, so the queue can be drained
            # without waiting.
            href = href_queue.get_nowait()
            # download_midi() adds the site root itself; prefixing here as
            # well would put the host into the URL twice.
            download_tasks.append(download_midi(href, session))

        # The coroutines only start running here, so the gather has to stay
        # inside the ``async with``; otherwise every request would be issued
        # on an already closed session.
        await asyncio.gather(*download_tasks)

    print("MIDI files downloaded successfully.")

# Run the scrape-and-download pipeline (top-level await in a notebook cell).
await main()


In [2]:
import os
# Report how many directory entries the download step left in the output folder.
folder_path = "midi_data"
file_count = sum(1 for _ in os.listdir(folder_path))

print(f"Number of files in the '{folder_path}' folder: {file_count}")


Number of files in the 'data' folder: 603


## Experiments

In [7]:
async def fetch_url(session, url):
    """Return the text body of a GET request for ``url`` made via ``session``."""
    async with session.get(url) as reply:
        text = await reply.text()
    return text

async def extract_hrefs(session, page):
    """Collect the song-page links listed on one page of the jazz search results.

    Args:
        session: Session object handed to ``fetch_url`` to perform the request.
        page: Results page number. Page 1 is the bare search URL; every other
            page uses the ``/pg-{page}`` suffix.

    Returns:
        A list with the ``href`` of the first anchor inside every table cell,
        skipping cells without an anchor, anchors without an ``href``
        attribute, and the ``javascript:;`` placeholder links.
    """
    base_url = "https://www.midis101.com/search/jazz"
    url = base_url if page == 1 else f"{base_url}/pg-{page}"

    html_content = await fetch_url(session, url)
    soup = BeautifulSoup(html_content, "html.parser")

    hrefs = []
    for td in soup.find_all("td"):
        # One lookup per cell; the result is reused for every check below.
        anchor = td.find("a")
        if anchor is None:
            continue
        # .get() yields None rather than raising KeyError when href is absent.
        href = anchor.get("href")
        if href is not None and href != 'javascript:;':
            hrefs.append(href)

    return hrefs

async def main_sub():
    """Gather the song-page links from search-result pages 1 through 21.

    Returns:
        One flat list holding the hrefs of all pages, in page order.
    """
    # A single aiohttp session serves all page requests, which run concurrently.
    async with aiohttp.ClientSession() as session:
        per_page = await asyncio.gather(
            *(extract_hrefs(session, page_no) for page_no in range(1, 22))
        )

    # Flatten the per-page lists into one list.
    flattened = []
    for page_links in per_page:
        flattened.extend(page_links)
    return flattened

# Collect the song-page links once; later cells read `all_hrefs` as a global.
all_hrefs = await main_sub()


In [6]:
async def download_hrefs(url, queue):
    """Scrape one song page and enqueue every download link found on it.

    Args:
        url: Absolute URL of the song page.
        queue: Queue that receives each extracted link via ``await queue.put``.
    """
    # Only the request needs the session; parsing and queueing happen after
    # it has been released.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html, "html.parser")

    # Match the download buttons by their exact class string. href=True keeps
    # only elements that have an href, so the subscript below cannot raise
    # KeyError.
    elements = soup.find_all(class_="btn btn-primary btn-lg btn-block p-4", href=True)

    # Put the hrefs into the queue
    for element in elements:
        await queue.put(element["href"])

async def download_midi(url, session):
    """Download one MIDI file into ``midi_data/`` unless it is already there.

    Args:
        url: Absolute download URL; its last path segment is the file name.
        session: Client session whose ``get`` performs the request.
    """
    filename = url.split("/")[-1]
    # Build the path once so the existence check and the write always agree.
    # The check previously looked in "midi_midi_data/" and therefore never
    # found an existing file.
    target = f"midi_data/{filename}"

    if os.path.exists(target):
        print(f"Skipping {filename} (already exists)")
        return

    async with session.get(url) as response:
        with open(target, "wb") as file:
            # Stream the body to disk chunk by chunk.
            async for chunk in response.content.iter_any():
                file.write(chunk)


async def main():
    """Visit every song page in ``all_hrefs`` and download the linked MIDI files.

    Reads the notebook-level ``all_hrefs`` list, collects the download links of
    all song pages through a queue, then fetches the files into ``midi_data/``.
    """
    site_root = "https://www.midis101.com"

    # Make sure the output folder exists.
    if not os.path.exists("midi_data"):
        os.makedirs("midi_data")

    # Stage 1: scrape all song pages concurrently; links land in the queue.
    href_queue = asyncio.Queue()
    page_jobs = [download_hrefs(site_root + path, href_queue) for path in all_hrefs]
    await asyncio.gather(*page_jobs)

    # Stage 2: drain the queue and download all files over one shared session.
    file_jobs = []
    async with aiohttp.ClientSession() as session:
        while not href_queue.empty():
            href = await href_queue.get()
            file_jobs.append(download_midi(site_root + href, session))

        # Runs inside the session block so the session is still open.
        await asyncio.gather(*file_jobs)

    print("MIDI files downloaded successfully.")


# Run the queue-based download pipeline (top-level await in a notebook cell).
await main()


Extracted /download/50600-jazz-unknown-jp059
Extracted /download/50604-jazz-unknown-jp063
Extracted /download/91999-drjazz
Extracted /download/89630-jazztango
Extracted /download/59488-tonny-bennet-young-and-foolish-jazz
Extracted /download/50625-jazz-yebisu
Extracted /download/89659-rockinjazz


  containerClass = self.string_container(containerClass)


Extracted /download/50607-jazz-unknown-jp066
Extracted /download/51182-jonasz-michel-la-boite-de-jazz
Extracted /download/50617-jazz-unknown-summertime
Extracted /download/75401-italian-ladridibiciclette-dr-jazz-e-mr-funk
Extracted /download/50599-jazz-unknown-jp058
Extracted /download/94439-jazz-demo-song
Extracted /download/50628-jazz-zanzibar
Extracted /download/91737-jazzpiano
Extracted /download/94440-jazzslow
Extracted /download/50592-jazz-unknown-jp051
Extracted /download/91742-jimtownjazz
Extracted /download/88238-hihojazz
Extracted /download/54726-nougaro-claude-le-jazz-et-la-java-1
Extracted /download/50609-jazz-unknown-jp068
Extracted /download/50608-jazz-unknown-jp067
Extracted /download/91735-jazzinterlude50
Extracted /download/91708-eastbayjazz
Extracted /download/50616-jazz-unknown-summert
Extracted /download/59484-tonny-bennet-just-in-timejazz
Extracted /download/59485-tonny-bennet-on-a-clear-day-you-can-see-forever-jazz
Extracted /download/50626-jazz-youdbe
Extracted /

In [None]:
import aiohttp
async def download_hrefs(url):
    """Scrape one song page and return the download links found on it.

    Args:
        url: Absolute URL of the song page.

    Returns:
        A list with the ``href`` of every element whose class attribute is
        exactly ``btn btn-primary btn-lg btn-block p-4`` and that has an
        ``href`` attribute.
    """
    # Only the request needs the session; parsing happens after it is released.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html, "html.parser")

    # href=True keeps only elements that have an href, so the subscript below
    # cannot raise KeyError.
    elements = soup.find_all(class_="btn btn-primary btn-lg btn-block p-4", href=True)

    return [element["href"] for element in elements]
        
async def download_midi(url, session):
    """Download one MIDI file into ``midi_data/`` unless it is already there.

    Args:
        url: Absolute download URL; its last path segment is the file name.
        session: Client session whose ``get`` performs the request.
    """
    filename = url.split("/")[-1]
    target = f"midi_data/{filename}"

    # Skip files that a previous run already saved.
    if os.path.exists(target):
        print(f"Skipping {filename} (already exists)")
        return

    async with session.get(url) as response:
        # Read the body before creating the file. If the transfer fails, no
        # empty file is left behind for the existence check above to mistake
        # for a finished download.
        payload = await response.read()

    with open(target, "wb") as file:
        file.write(payload)

    print(f"Downloaded {filename}")

async def main():
    """Scrape the song pages one after another, then download all files at once.

    Reads the notebook-level ``all_hrefs`` list, prints the collected download
    links, and saves the files into ``midi_data/``.
    """
    site_root = "https://www.midis101.com"

    # Make sure the output folder exists.
    if not os.path.exists("midi_data"):
        os.makedirs("midi_data")

    # Song pages are visited sequentially, each awaited before the next.
    download_links = []
    for path in all_hrefs:
        page_links = await download_hrefs(site_root + path)
        download_links.extend(page_links)

    # Show what was found before downloading.
    print(download_links)

    # One task per file, all sharing a single session.
    async with aiohttp.ClientSession() as session:
        tasks = [
            asyncio.create_task(download_midi(site_root + link, session))
            for link in download_links
        ]
        await asyncio.gather(*tasks)

    print("MIDI files downloaded successfully.")
    

# Run the list-based download pipeline (top-level await in a notebook cell).
await main()


[]
MIDI files downloaded successfully.


## Synchronous implementation

In [None]:
# Create the "midi_data" folder if it doesn't exist
os.makedirs("midi_data", exist_ok=True)

# Download links collected from all song pages
download_links = []

# Visit every song page listed in all_hrefs
for url in all_hrefs:
    url = "https://www.midis101.com" + url
    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Match the download buttons by their exact class string. href=True keeps
    # only elements that have an href, so the subscript below cannot raise
    # KeyError.
    elements = soup.find_all(class_="btn btn-primary btn-lg btn-block p-4", href=True)

    # Extract the hrefs from the elements
    hrefs = [element["href"] for element in elements]

    # Add the extracted hrefs to the list
    download_links.extend(hrefs)

# Print the extracted hrefs
print(download_links)

In [None]:
# Download MIDI files from the URLs in download_links
for url in download_links:
    # The scraped links are site-relative ("/download/..."). requests needs an
    # absolute URL and raises MissingSchema otherwise.
    if not url.startswith(("http://", "https://")):
        url = "https://www.midis101.com" + url

    # Send a GET request to the URL
    response = requests.get(url)

    # Extract the filename from the URL
    filename = url.split("/")[-1]

    # Save the MIDI file to the "midi_data" folder
    with open(f"midi_data/{filename}", "wb") as file:
        file.write(response.content)

print("MIDI files downloaded successfully.")

In [13]:
import dbt

In [10]:
# yaml is used below but not imported by any visible cell, so import it here
# to keep this cell working on a fresh kernel.
import yaml

# Load the project configuration that sits one directory above the notebook.
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

sequence_length = config['sequence_length']

In [11]:
# Display the sequence length read from the config file.
sequence_length

100