In [55]:
import requests
from bs4 import BeautifulSoup as bs
import urllib.parse as ur
import os
import shutil

In [56]:
# Site root and the first listing page to scrape. The query string
# (instrument=7, page=1) is passed through to the site unchanged.
# NOTE(review): which instrument id 7 refers to is not visible here — confirm.
base_url = "https://www.presetshare.com"
first_page = ur.urljoin(base_url, "presets?query=&instrument=7&page=1")
# Output directory, relative to the kernel's current working directory.
save_path = "../../data/presets/presetshare/"

# Set to True to remove everything already under `save_path` before scraping.
reset = False # delete existing files
if reset:
    # ignore_errors=True: a missing directory is not treated as a failure.
    shutil.rmtree(save_path, ignore_errors=True)
# Always make sure the output directory exists (no-op if it already does).
os.makedirs(save_path, exist_ok=True)

In [57]:
# Fetch and parse the first listing page.
# A timeout keeps a stalled connection from hanging the kernel forever, and
# raise_for_status() fails fast on HTTP errors instead of silently parsing an
# error page and reporting zero presets.
response = requests.get(first_page, timeout=30)
response.raise_for_status()
html = response.text
soup = bs(html, "html.parser")
download_buttons = soup.find_all("a", class_="download-button")
print(f"Found {len(download_buttons)} presets on page 1.")

Found 24 presets on page 1.


In [58]:
# Download presets on the first page and save metadata to CSV
# Output: files saved into `save_path` with names like 001_<name>.<ext> and a `metadata.csv` file
import csv
import re
import unicodedata

def slugify(value):
    """Turn an arbitrary value into an ASCII, filesystem-friendly token.

    The value is converted to ``str``, decomposed with NFKD so accented
    letters lose their diacritics, and stripped of any remaining non-ASCII
    characters. Every run of characters other than letters, digits, ``.``,
    ``_`` and ``-`` is collapsed to a single underscore, and leading/trailing
    underscores are removed. If nothing is left, ``'preset'`` is returned.
    """
    ascii_text = (
        unicodedata.normalize('NFKD', str(value))
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    collapsed = re.sub(r'[^a-zA-Z0-9._-]+', '_', ascii_text)
    trimmed = collapsed.strip('_')
    if not trimmed:
        return 'preset'
    return trimmed

# A shared requests.Session carries cookies across requests, so authentication
# can be added later. No login is attempted by default; set
# `session_login = True` and fill in the steps below to have the scraper log in.
session_login = False
session = requests.Session()
# Placeholder login flow (site-specific; uncomment and adapt if needed):
# if session_login:
#     login_url = ur.urljoin(base_url, '/login')
#     payload = {'username': 'your', 'password': 'pass'}
#     session.post(login_url, data=payload)

# Look for a Netscape-format cookies.txt in a few likely places so that
# cookies exported from an authenticated browser can be reused. The first
# existing candidate wins.
# Note: abspath() resolves against the current working directory, so the
# notebook-directory candidate normally coincides with the second one.
notebook_path = os.path.abspath('scripts/scripting/presetshare.ipynb')
notebook_dir = os.path.dirname(notebook_path)
cookies_candidates = [
    os.path.join(*segments, 'cookies.txt')
    for segments in (
        (notebook_dir,),
        (os.getcwd(), 'scripts', 'scripting'),
        (os.getcwd(),),
    )
]
for p in cookies_candidates:
    if os.path.exists(p):
        cookies_path = p
        break
else:
    # No candidate exists on disk.
    cookies_path = None

def load_netscape_cookies(path, session):
    """Load a Netscape-format ``cookies.txt`` into ``session.cookies``.

    Each data line holds seven tab-separated fields:
    ``domain, include-subdomains flag, path, secure, expiry, name, value``.
    Only name, value, domain and path are forwarded to
    ``session.cookies.set``; the other fields are ignored.

    Parsing is deliberately tolerant:

    * Blank lines and ``#`` comment lines are skipped.
    * Lines prefixed with ``#HttpOnly_`` are real cookies, not comments (the
      prefix marks HttpOnly cookies in exported cookie files). The prefix is
      removed and the cookie is loaded; session cookies are commonly HttpOnly,
      so dropping them would leave the session unauthenticated.
    * Only the line terminator is trimmed, so a cookie with an empty value
      (a line ending in a tab) still yields seven fields.
    * If a line is not tab-separated, whitespace splitting is tried instead.
    * Lines that still have fewer than seven fields, or that the cookie jar
      rejects, are skipped.

    Args:
        path: Path of the cookies file to read (opened as UTF-8).
        session: Object exposing ``cookies.set(name, value, domain=, path=)``,
            e.g. a ``requests.Session``.

    Returns:
        True if the file could be read (even when it contained no usable
        cookie lines), False if reading it failed; the error is printed.
    """
    httponly_prefix = '#HttpOnly_'
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                # Keep trailing tabs: they delimit an empty cookie value.
                line = line.lstrip().rstrip('\r\n')
                if line.startswith(httponly_prefix):
                    line = line[len(httponly_prefix):]
                if not line.strip() or line.startswith('#'):
                    continue
                parts = line.split('\t')
                if len(parts) < 7:
                    # some cookie files may use spaces; try splitting on whitespace as fallback
                    parts = line.split()
                    if len(parts) < 7:
                        continue
                domain, _flag, cpath, _secure, _expiry, name, value = (
                    field.strip() for field in parts[:7]
                )
                # Add cookie to session; requests will use domain/path when sending
                try:
                    session.cookies.set(name, value, domain=domain, path=cpath)
                except Exception:
                    # best-effort: ignore invalid cookie lines
                    continue
        return True
    except Exception as e:
        print(f'Failed loading cookies from {path}: {e}')
        return False

# Inject browser cookies into the session when a cookies.txt was located;
# otherwise continue with an anonymous session.
if not cookies_path:
    print('No cookies.txt found in notebook directory or cwd; proceeding unauthenticated.')
else:
    ok = load_netscape_cookies(cookies_path, session)
    print(f'Loaded cookies from {cookies_path}: {ok}')

# Seconds to wait on a download request before giving up, so one stalled
# connection cannot hang the whole cell.
REQUEST_TIMEOUT = 30


def int_or_zero(t):
    """Return the integer text of a parsed tag, or 0 if the tag is missing
    (``None``) or its text is not a plain integer."""
    if t is None:
        return 0
    try:
        return int(t.get_text(strip=True))
    except (AttributeError, TypeError, ValueError):
        return 0


# Walk every preset card on the already-parsed listing page (`soup`), collect
# its metadata, try to download the preset file into `save_path`, and append
# one metadata row per card to `rows` (with an empty 'file' when the download
# did not succeed).
items = soup.find_all('div', class_='preset-item')
print(f'Found {len(items)} preset items on the page.')
rows = []
for idx, item in enumerate(items, start=1):
    uid = f'{idx:03d}'

    # Name (this anchor's href is also used as the Referer below)
    name_tag = item.find('a', class_='preset-item__name')
    original_name = name_tag.get_text(strip=True) if name_tag else ''

    # Author
    author_tag = item.find('a', class_='preset-item-username')
    author = author_tag.get_text(strip=True) if author_tag else ''

    # Genre - look for link with 'genre=' in href
    genre_tag = item.find('a', href=re.compile(r'genre='))
    genre = genre_tag.get_text(strip=True) if genre_tag else ''

    # Counters
    likes = int_or_zero(item.find('span', class_='like-counter'))
    downloads = int_or_zero(item.find('span', class_='download-counter'))
    comments = int_or_zero(item.find('span', class_='comment-counter'))

    # Date
    date_tag = item.find('div', class_='preset-item-date')
    date = date_tag.get_text(strip=True) if date_tag else ''

    # Download URL - prefer anchor with class 'download-button' inside this item
    dl_tag = item.find('a', class_='download-button')
    download_url = ur.urljoin(base_url, dl_tag['href']) if (dl_tag and dl_tag.get('href')) else None

    # Prepare a sensible Referer (the preset page) if available, else the site root
    if name_tag and name_tag.get('href'):
        preset_page_ref = ur.urljoin(base_url, name_tag.get('href'))
    else:
        preset_page_ref = base_url

    # Determine filename and download file if possible
    file_name_on_disk = ''
    if download_url:
        try:
            # Build browser-like headers; Referer is often required for direct downloads
            headers = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
                'Referer': preset_page_ref,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Connection': 'keep-alive',
                'X-Requested-With': 'XMLHttpRequest',
            }
            # The whole body is needed for the HTML sniffing below, so it is
            # read once and reused. The `with` block guarantees the connection
            # is released even if something fails while handling the response.
            with session.get(download_url, headers=headers, timeout=REQUEST_TIMEOUT) as resp:
                body = resp.content

                # If the response is HTML, it's probably a login/redirect page.
                content_type = resp.headers.get('content-type', '').lower()
                is_html = ('text/html' in content_type) or body.lstrip().startswith(b'<')
                if is_html:
                    # Save diagnostic HTML so you can inspect what's returned (login page, error, etc.)
                    err_path = os.path.join(save_path, f'{uid}_download_error.html')
                    with open(err_path, 'wb') as ef:
                        ef.write(body)
                    print(f'Preset {uid} appears to be an HTML page (likely login or blocking). Saved diagnostic to {err_path}.')
                    print('If you are logged in in a browser but not here, consider exporting cookies or enabling session login.')
                    file_name_on_disk = ''
                else:
                    resp.raise_for_status()
                    # Try to get filename from Content-Disposition header
                    cd = resp.headers.get('content-disposition', '')
                    m = re.search(r'filename=\\?\"?([^\\\";]+)\\?\"?', cd)
                    if m:
                        orig_filename = m.group(1)
                    else:
                        # fallback: try last part of URL path or use slugified original name
                        parsed = ur.urlparse(download_url)
                        last = os.path.basename(parsed.path) or ''
                        if last:
                            orig_filename = last
                        else:
                            orig_filename = slugify(original_name) + '.fxp'
                    # sanitize and prefix with uid
                    base, ext = os.path.splitext(orig_filename)
                    if not ext:
                        ext = '.fxp'
                    safe = slugify(base)
                    file_name_on_disk = f'{uid}_{safe}{ext}'
                    preset_path = os.path.join(save_path, file_name_on_disk)
                    with open(preset_path, 'wb') as f:
                        f.write(body)
                    print(f'Downloaded: {file_name_on_disk}')
        except Exception as e:
            # Best-effort scraping: report the failure and carry on with the next preset.
            print(f'Failed downloading preset {uid} ({original_name}): {e}')
            file_name_on_disk = ''
    else:
        print(f'No download URL for preset {uid} ({original_name})')

    rows.append({
        'id': uid,
        'original_name': original_name,
        'author': author,
        'genre': genre,
        'likes': likes,
        'comments': comments,
        'downloads': downloads,
        'date': date,
        'file': file_name_on_disk,
    })

# Persist the collected rows as metadata.csv inside `save_path`.
# The file is opened in 'w' mode, so each run of this cell replaces it.
fieldnames = [
    'id', 'original_name', 'author', 'genre',
    'likes', 'comments', 'downloads', 'date', 'file',
]
csv_path = os.path.join(save_path, 'metadata.csv')
with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print(f'Wrote metadata for {len(rows)} presets to {csv_path}')

Loaded cookies from /home/ben/Synthetizers-parameters-estimation/scripts/scripting/cookies.txt: True
Found 24 preset items on the page.
Preset 001 appears to be an HTML page (likely login or blocking). Saved diagnostic to ../../data/presets/presetshare/001_download_error.html.
If you are logged in in a browser but not here, consider exporting cookies or enabling session login.
Preset 001 appears to be an HTML page (likely login or blocking). Saved diagnostic to ../../data/presets/presetshare/001_download_error.html.
If you are logged in in a browser but not here, consider exporting cookies or enabling session login.
Preset 002 appears to be an HTML page (likely login or blocking). Saved diagnostic to ../../data/presets/presetshare/002_download_error.html.
If you are logged in in a browser but not here, consider exporting cookies or enabling session login.
Preset 002 appears to be an HTML page (likely login or blocking). Saved diagnostic to ../../data/presets/presetshare/002_download_er