In [1]:
import os
from ossapi import Ossapi
import sqlite3
from time import strftime, localtime
import time
from concurrent.futures import ThreadPoolExecutor
import sys
from numpy import array_split
import pickle
from datetime import datetime
# Put the parent directory first on the module search path so modules living
# one level up can be imported. (Original note: "Required for database in
# different directory".) This only affects imports; the SQLite files used in
# this notebook are opened through explicit relative paths.
sys.path.insert(0, "../")

# osu! API OAuth credentials are read from the environment so they never end
# up in the notebook itself.
OSU_CLIENT_ID = os.environ.get('OSU_CLIENT_ID')
OSU_CLIENT_SECRET = os.environ.get('OSU_CLIENT_SECRET')

# Fail early with an actionable message instead of handing None to the client.
if not OSU_CLIENT_ID or not OSU_CLIENT_SECRET:
    raise RuntimeError(
        "Set the OSU_CLIENT_ID and OSU_CLIENT_SECRET environment variables "
        "before running this notebook."
    )

# Shared API client used by scrape_beatmaps.
api = Ossapi(OSU_CLIENT_ID, OSU_CLIENT_SECRET)

top_or_recent = "top"
mode = "std"

# Progress-reporting state shared by every scrape_beatmaps call.
num_done = 0  # beatmaps processed so far (successful or not)
last_time = time.time()  # timestamp of the previous progress line


In [2]:
def scrape_beatmaps(beatmap_ids):
    """Fetch beatmaps from the osu! API and insert them into ``beatmaps_std``.

    For every entry in ``beatmap_ids`` the module-level ``api`` client is asked
    for the beatmap, a fixed set of attributes is read from the result (missing
    attributes become ``None``) and one row is inserted into the
    ``beatmaps_std`` table of ``../data/UserScores.db``. Each insert is
    committed individually.

    Failures are reported with ``print`` and do not abort the run. A beatmap
    whose fetch failed is skipped entirely, so no placeholder row is written
    for it and a later run can retry it.

    Args:
        beatmap_ids: Iterable of id entries. Each entry must support ``[0]``
            indexing that yields the integer beatmap id (for example a 1-tuple
            row from ``cursor.fetchall()`` or a row of the 2-D array produced
            by ``array_split``). The entry itself is what gets passed to
            ``api.beatmap``.

    Side effects:
        Updates the module-level ``num_done`` / ``last_time`` progress state and
        prints a timing line every 100 processed ids. These globals are updated
        without a lock, so when several threads run this function the counter
        and timings are only approximate.
    """
    global num_done
    global last_time

    query = """INSERT INTO beatmaps_std (accuracy, ar, beatmap_id, beatmapset_id, bpm, cs, difficulty_rating, drain, max_combo, owner_user_id, total_length, url, version)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""

    conn = sqlite3.connect("../data/UserScores.db")
    try:
        cursor = conn.cursor()

        for beatmap_id in beatmap_ids:
            # Reset on every iteration so a failed fetch can never reuse the
            # result of the previous iteration.
            beatmap = None
            try:
                beatmap = api.beatmap(beatmap_id)
            except Exception as e:
                print(e)
                print("failed to get beatmap", beatmap_id)

            # Only insert when the fetch produced something; otherwise an
            # all-NULL row would be stored and the id would look "completed".
            if beatmap is not None:
                try:
                    row = (
                        getattr(beatmap, "accuracy", None),
                        getattr(beatmap, "ar", None),
                        int(beatmap_id[0]),
                        getattr(beatmap, "beatmapset_id", None),
                        getattr(beatmap, "bpm", None),
                        getattr(beatmap, "cs", None),
                        getattr(beatmap, "difficulty_rating", None),
                        getattr(beatmap, "drain", None),
                        getattr(beatmap, "max_combo", None),
                        getattr(beatmap, "user_id", None),  # stored as owner_user_id
                        getattr(beatmap, "total_length", None),
                        getattr(beatmap, "url", None),
                        getattr(beatmap, "version", None),
                    )
                    cursor.execute(query, row)
                    conn.commit()
                except Exception as e:
                    print(e)
                    print("failed to insert beatmap", beatmap_id)

            # Progress and throttling apply to failed ids too, so the request
            # rate stays the same regardless of outcome.
            num_done += 1
            if num_done % 100 == 0:
                print(
                    str(num_done) + ": " + str(time.time() - last_time),
                    strftime("%H:%M:%S", localtime(time.time())),
                )
                last_time = time.time()
            time.sleep(0.075)
    finally:
        # Always release the connection, even if something unexpected escapes.
        conn.close()
        

In [3]:
mode = 'std'

# Work out which beatmaps still need scraping: every beatmap referenced by a
# stored score, minus the ones already present in the beatmaps table.
conn = sqlite3.connect('../data/UserScores.db')
try:
    cursor = conn.cursor()

    top_beatmap_ids = cursor.execute(f"SELECT DISTINCT beatmap_id FROM top_scores_{mode}").fetchall()

    # Recent scores are currently excluded; swap in the query below to include them.
    recent_beatmap_ids = set()
    # recent_beatmap_ids = cursor.execute(f'SELECT DISTINCT beatmap_id FROM recent_scores_{mode}').fetchall()

    completed_beatmap_ids = cursor.execute(f'SELECT DISTINCT beatmap_id from beatmaps_{mode}').fetchall()
finally:
    # Release the database handle even if one of the queries fails.
    conn.close()

# fetchall() returns 1-tuples, so the set arithmetic works on (beatmap_id,) rows.
remaining_beatmap_ids = list((set(top_beatmap_ids) | set(recent_beatmap_ids)) - set(completed_beatmap_ids))

print(f"Remaining beatmap ids to scrape: {len(remaining_beatmap_ids)}")

# One partition per worker thread. array_split converts the list of 1-tuples to
# an array, so each element handed to scrape_beatmaps is a one-element row
# (which is why the scraper reads the id with beatmap_id[0]).
num_partitions = 4
partitioned_beatmap_ids = array_split(remaining_beatmap_ids, num_partitions)

Remaining beatmap ids to scrape: 95388


In [4]:
# Show how much work each worker thread will receive.
for beatmap_ids in partitioned_beatmap_ids:
    print(len(beatmap_ids))

# Run one scrape_beatmaps call per partition; leaving the `with` block waits
# for every worker to finish.
with ThreadPoolExecutor(max_workers=num_partitions) as executor:
    futures = [
        executor.submit(scrape_beatmaps, beatmap_ids)
        for beatmap_ids in partitioned_beatmap_ids
    ]

# An exception that escapes a worker is stored on its future and would
# otherwise vanish silently, so surface it here.
for partition_index, future in enumerate(futures):
    error = future.exception()
    if error is not None:
        print(f"partition {partition_index} failed: {error!r}")

23847
23847
23847
23847
100: 18.697840929031372 18:21:02
200: 5.3563878536224365 18:21:08
300: 5.240906000137329 18:21:13
400: 5.234847784042358 18:21:18
500: 5.335598945617676 18:21:23
600: 5.3646016120910645 18:21:29
700: 5.328839540481567 18:21:34
800: 5.150911331176758 18:21:39
900: 5.218613624572754 18:21:44
1000: 5.207609176635742 18:21:50
1100: 5.3963775634765625 18:21:55
1200: 5.215417861938477 18:22:00
1300: 5.347398281097412 18:22:06
1400: 5.13692307472229 18:22:11
1500: 5.589474439620972 18:22:16
1600: 6.078691482543945 18:22:22
1700: 5.109249591827393 18:22:28
1800: 5.3461103439331055 18:22:33
1900: 5.221375226974487 18:22:38
2000: 5.326294422149658 18:22:43
2100: 5.348423004150391 18:22:49
2200: 5.46890926361084 18:22:54
2300: 5.183611154556274 18:22:59
2400: 5.4964728355407715 18:23:05
2500: 5.451440811157227 18:23:10
2600: 5.357426643371582 18:23:16
2700: 5.2444093227386475 18:23:21
2800: 5.567412614822388 18:23:27
2900: 5.403875827789307 18:23:32
3000: 5.205388069152832