In [2]:
import os
import pickle
import sqlite3
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from time import strftime, localtime

from numpy import array_split
from ossapi import Ossapi
sys.path.insert(0, "../") # Required for database in different directory. 

# Client credentials for the osu! API are read from the environment; either
# value is None when its variable is not set.
OSU_CLIENT_ID = os.getenv('OSU_CLIENT_ID')
OSU_CLIENT_SECRET = os.getenv('OSU_CLIENT_SECRET')

# Single API client shared by the cells below.
api = Ossapi(OSU_CLIENT_ID, OSU_CLIENT_SECRET)

# Load the pickled collection of leaderboard user ids and de-duplicate it.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open("leaderboard_ids.pickle", "rb") as handle:
    leaderboard_ids = set(pickle.load(handle))

# These two values select the target table: "{top_or_recent}_scores_{mode}".
top_or_recent, mode = "top", "std"


In [3]:
num_done = 0
last_time = time.time()
# scrape_users runs in several worker threads at once; this lock serializes
# updates to the two progress globals above.
_progress_lock = threading.Lock()


def _score_to_row(user_id, score):
    """Flatten one API score object into a row tuple for the scores table."""
    beatmap = getattr(score, "beatmap", None)
    mods = getattr(score, "mods", None)
    created_at = getattr(score, "created_at", None)
    return (
        int(user_id),  # ids may arrive as numpy integers; sqlite3 needs a plain int
        getattr(beatmap, "id", None),
        getattr(beatmap, "beatmapset_id", None),
        getattr(score, "pp", None),
        getattr(mods, "value", None),
        created_at.strftime("%Y-%m-%d %H:%M:%S") if created_at else None,
    )


def scrape_users(ids, db_path="../data/UserScores.db", client=None, table=None):
    """
    Fetch each user's best scores from the API and insert them into a scores table.

    ids: iterable of user ids to scrape
    db_path: SQLite database file to write to (defaults to ../data/UserScores.db)
    client: object exposing user_scores(user_id, type=, mode=, limit=);
        defaults to the module-level `api`
    table: name of the destination table; defaults to
        f"{top_or_recent}_scores_{mode}" built from the module-level settings.
        It is interpolated into the SQL text, so it must be a trusted identifier.

    Returns None. Every failure (API call, row conversion, insert) is printed
    and the affected user/score is skipped, so one bad user never stops the
    run. Each successfully stored user bumps the global `num_done`; every
    100th user prints the elapsed time since the previous report.
    """
    global num_done
    global last_time
    if client is None:
        client = api
    if table is None:
        table = f"{top_or_recent}_scores_{mode}"

    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    try:
        for user_id in ids:
            try:
                top_scores = client.user_scores(user_id, type="best", mode="osu", limit=100)
            except Exception as e:
                print(e)
                continue

            rows = []
            for score in top_scores:
                try:
                    rows.append(_score_to_row(user_id, score))
                except Exception as e:
                    print(e)
                    continue

            try:
                cursor.executemany(
                    f"""
                    INSERT INTO {table} (user_id, beatmap_id, beatmapset_id, pp, mods, created_at)
                    VALUES (?, ?, ?, ?, ?, ?)
                    """,
                    rows,
                )
                conn.commit()
            except Exception as e:
                print(e)
                # Drop any partially inserted batch so it is not committed
                # together with the next user's rows.
                conn.rollback()
                continue

            # The counter and timestamp are shared by all worker threads.
            with _progress_lock:
                num_done += 1
                if num_done % 100 == 0:
                    now = time.time()
                    print(
                        str(num_done) + ": " + str(now - last_time),
                        strftime("%H:%M:%S", localtime(now)),
                    )
                    last_time = now

    except Exception as e:
        print(e)
    finally:
        conn.close()

In [4]:
# Resume support: any user id that already has at least one row in the target
# table is treated as done and skipped on this run.
conn = sqlite3.connect('../data/UserScores.db')
try:
    cursor = conn.cursor()
    completed_user_ids = {
        row[0]
        for row in cursor.execute(
            f"SELECT DISTINCT user_id FROM {top_or_recent}_scores_{mode}"
        )
    }
finally:
    # Close the connection even if the query fails (e.g. the table is missing).
    conn.close()

remaining_ids = list(leaderboard_ids - completed_user_ids)
print(f"Remaining user ids to scrape: {len(remaining_ids)}")

# One partition per worker thread; array_split tolerates uneven division.
num_partitions = 5
partitioned_user_ids = array_split(remaining_ids, num_partitions)

for user_ids in partitioned_user_ids:
    print(len(user_ids))

with ThreadPoolExecutor(max_workers=num_partitions) as executor:
    futures = [
        executor.submit(scrape_users, partition)
        for partition in partitioned_user_ids
    ]

# Leaving the `with` block waits for all workers. Report anything that escaped
# a worker instead of letting the exception vanish with its discarded future.
for future in futures:
    error = future.exception()
    if error is not None:
        print(f"Worker failed: {error!r}")
        


Remaining user ids to scrape: 471449
94290
94290
94290
94290
94289
api returned an error of `None` for a request to https://osu.ppy.sh/api/v2/users/33556414/scores/best?mode=osu&limit=100
100: 29.106772899627686 19:13:19
api returned an error of `None` for a request to https://osu.ppy.sh/api/v2/users/33774777/scores/best?mode=osu&limit=100
200: 25.173134565353394 19:13:44
300: 31.5966215133667 19:14:16
400: 28.21389126777649 19:14:44
500: 28.982627630233765 19:15:13
600: 26.91258478164673 19:15:40
700: 34.15570259094238 19:16:14
800: 39.062509536743164 19:16:53
900: 39.93741583824158 19:17:33
1000: 39.567368268966675 19:18:13
1100: 37.212759494781494 19:18:50
1200: 37.45939326286316 19:19:27
1300: 35.763702154159546 19:20:03
1400: 38.494345903396606 19:20:42
1500: 36.37955141067505 19:21:18
1600: 36.18187880516052 19:21:54
1700: 36.670026540756226 19:22:31
1800: 33.92823314666748 19:23:05
1900: 36.476410150527954 19:23:41
2000: 36.65852403640747 19:24:18
2100: 27.447745084762573 19:24:

In [5]:
# `conn` was already closed in the previous cell, right after the completed-id
# query, so this is a repeat close of that same connection.
conn.close()

-- Destination table for scrape_users: one row per scraped score.
-- IF NOT EXISTS means an already-created table keeps its old column types.
CREATE TABLE IF NOT EXISTS top_scores_std (
    user_id INTEGER,        -- id of the user the score belongs to
    beatmap_id INTEGER,
    beatmapset_id INTEGER,
    pp REAL,                -- pp is fractional, so declare REAL rather than INTEGER
    mods INTEGER,           -- `.value` of the score's mods object
    created_at TEXT         -- formatted as "%Y-%m-%d %H:%M:%S"
);