In [10]:
import os
from ossapi import Ossapi
import sqlite3
from time import strftime, localtime
import time
from concurrent.futures import ThreadPoolExecutor
import sys
from numpy import array_split
import pickle
from datetime import datetime
sys.path.insert(0, "../") # Required for database in different directory. 

OSU_CLIENT_ID = os.environ.get('OSU_CLIENT_ID')
OSU_CLIENT_SECRET = os.environ.get('OSU_CLIENT_SECRET')

api = Ossapi(OSU_CLIENT_ID, OSU_CLIENT_SECRET)

top_or_recent = "top"
mode = "std"

num_done = 0
last_time = time.time()

beatmapset = api.beatmapset(2165225)
beatmapset.language

{'id': 3, 'name': 'Japanese'}

In [12]:
api.score('osu', 1799705339)

ValueError: api returned an error of `Specified Solo\Score couldn't be found.` for a request to https://osu.ppy.sh/api/v2/scores/osu/1799705339

In [2]:
def scrape_beatmapsets(beatmapset_ids):
    global num_done
    global last_time
     
    conn = sqlite3.connect("../data/UserScores.db")
    cursor = conn.cursor()
    
    for beatmapset_id in beatmapset_ids:
        beatmapset_id = beatmapset_id[0]
        try:
            beatmapset = api.beatmapset(beatmapset_id)
        except Exception as e:
            print(e)
            print("failed to get beatmapset", beatmapset_id)
            continue

        try:
            artist = getattr(beatmapset, 'artist', None)
            creator = getattr(beatmapset, 'creator', None)
            genre = getattr(beatmapset, 'genre', None)
            genre = getattr(genre, 'name', None)
            
            language = getattr(beatmapset, 'language', None)
            language = getattr(language, 'name', None)
            
            covers = getattr(beatmapset, 'covers', None)
            list_2x_url = getattr(covers, 'list_2x', None) if covers else None
            
            preview_url = getattr(beatmapset, 'preview_url', None)
            title = getattr(beatmapset, 'title', None)
            
            beatmapset = (int(beatmapset_id), artist, creator, genre, language, list_2x_url, preview_url, title)
            
            query = """
            INSERT INTO beatmapsets_std 
            (beatmapset_id, artist, creator, genre, language, list_2x_url, preview_url, title) 
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """
            
            cursor.execute(query, beatmapset)
            conn.commit()
        except Exception as e:
            print(e)
            print("failed to insert beatmapset", beatmapset_id)
        
        
        num_done += 1
        if num_done % 100 == 0:
            print(
                str(num_done) + ": " + str(time.time() - last_time),
                strftime("%H:%M:%S", localtime(time.time())),
            )
            last_time = time.time()
        time.sleep(0.075)
        
    conn.close()
        

In [3]:
conn = sqlite3.connect('../data/UserScores.db')
cursor = conn.cursor()

mode = 'std'

top_beatmapset_ids = cursor.execute(f"SELECT DISTINCT beatmapset_id FROM beatmaps_{mode}").fetchall()
recent_beatmapset_ids = set()
# recent_beatmapset_ids = cursor.execute(f'SELECT DISTINCT beatmapset_id FROM recent_scores_{mode}').fetchall()

completed_beatmapset_ids = cursor.execute(f'SELECT DISTINCT beatmapset_id from beatmapsets_{mode}').fetchall()

remaining_beatmapset_ids = list((set(top_beatmapset_ids) | set(recent_beatmapset_ids)) - set(completed_beatmapset_ids))

print(f"Remaining beatmapset ids to scrape: {len(remaining_beatmapset_ids)}")

num_partitions = 4
partitioned_beatmapset_ids = array_split(remaining_beatmapset_ids, num_partitions)

conn.close()

Remaining beatmapset ids to scrape: 28593


In [4]:
for beatmapset_ids in partitioned_beatmapset_ids:
    print(len(beatmapset_ids))

with ThreadPoolExecutor(max_workers=num_partitions) as executor:
    for beatmapset_ids in partitioned_beatmapset_ids:
        executor.submit(scrape_beatmapsets, beatmapset_ids)

7149
7148
7148
7148
100: 7.755260944366455 20:56:19
200: 6.8113179206848145 20:56:26
300: 6.827624797821045 20:56:33
400: 6.965630054473877 20:56:40
500: 6.802145004272461 20:56:47
600: 6.918905019760132 20:56:53
700: 6.953130483627319 20:57:00
800: 6.884090423583984 20:57:07
900: 6.70130729675293 20:57:14
1000: 6.872616767883301 20:57:21
1100: 6.7019336223602295 20:57:28
1200: 7.036317586898804 20:57:35
1300: 6.901072263717651 20:57:42
1400: 6.854767322540283 20:57:48
1500: 6.779651641845703 20:57:55
1600: 6.963581562042236 20:58:02
1700: 7.5181872844696045 20:58:10
exactly one of beatmap_id and beatmapset_id must be passed.
failed to get beatmapset None
1800: 7.056440114974976 20:58:17
1900: 6.843167781829834 20:58:24
2000: 7.015375852584839 20:58:31
2100: 6.867074728012085 20:58:37
2200: 6.885905742645264 20:58:44
2300: 7.223113059997559 20:58:52
2400: 7.150284051895142 20:58:59
2500: 6.977209568023682 20:59:06
2600: 7.0583765506744385 20:59:13
2700: 8.084341287612915 20:59:21
2800: