**DEPRECATED**

# Scrapes User IDS from osu leaderboards. 
This notebook scrapes all user_ids available in osu leaderboards by going over all pages for all countries leaderboards.
Leaderboards are limited to 10,000 users, and there is no other way of getting (valid) user_ids. Up to 10,000 users 
for each country (approx 400,000 users due to some countries not having that many players). 

Stores user information into user table of data/osu.db database. 

In [1]:
from ossapi import Ossapi
from ossapi import Cursor
import sqlite3 
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import json
from time import strftime, localtime
from dataclasses import dataclass

import plyvel
import pickle
import ossapi.models
import time
from concurrent.futures import ThreadPoolExecutor
import threading
import asyncio

## Step 1: Scrape user_id only.
Only id is available on leaderboard pages.

Use synchronous variant because it's easier to debug. Shouldn't take too long, couple of hours maybe. 

In [2]:
client_id = "28051"
client_secret = "Ry1MCvj2OHJ1mK19M9x1j1sUNFwLQdeJb8Xpfegs"
api = Ossapi(client_id, client_secret)

In [4]:
def add_user_ids(user_ids):
    """
    Adds user ids to the user table in data/osu_recommender.db
    user_ids: List of user_ids to add to db
    """
    try:
        conn = sqlite3.connect('data/osu_recommender.db')
        cursor = conn.cursor()
        cursor.executemany("INSERT INTO user (user_id) VALUES (?)", user_ids)
        conn.commit()        
        user_ids.clear() 
    except Exception as e:
        print(e)
    finally:
        conn.close()

In [5]:
def get_user_ids_from_lb(country_code):
    """
    Gets user ids from that countries leaderboards
    country_code: alpha_2 ISO3166 code
    """   
    try:
        formatted_time = strftime('%H:%M:%S', localtime(time.time()))
        print(f"Getting user ids from {country_code}. Time: {formatted_time}", end = '')
        ids = []
        lb_cursor = Cursor(page=1) # page cursor for lb. api call returns next page cursor, none if none
        while lb_cursor is not None:
            lb = api.ranking(mode = "osu", type = "performance", cursor = lb_cursor, country = country_code)
            lb_cursor = lb.cursor
            for user_stats in lb.ranking:
                if user_stats.pp < 500: # Stop when we get to users with less than 500pp
                    break
                ids.append(user_stats.user.id)
    except Exception as e:
        print("\nError getting user ids from lb")
        print(e)
    finally:
        formatted_time = strftime('%H:%M:%S', localtime(time.time()))
        print(f"\nDone. Time: {formatted_time}")
        return ids

In [6]:
# Get all country codes from https://osu.ppy.sh/rankings/osu/performance
page_html = requests.get("https://osu.ppy.sh/rankings/osu/performance").text
soup = bs(page_html, 'html.parser')
script = soup.find("script", {"id": "json-country-filter"})
json_text = script.get_text()
data = json.loads(json_text)
country_codes = [country['id'] for country in data['items']]

In [7]:
for country_code in country_codes:
    ids = get_user_ids_from_lb(country_code)
    add_user_ids(ids)

Getting user ids from AD. Time: 16:00:48
Done. Time: 16:00:50
no such table: user
Getting user ids from AE. Time: 16:00:50
Done. Time: 16:00:58
no such table: user
Getting user ids from AF. Time: 16:00:58
Done. Time: 16:00:59
no such table: user
Getting user ids from AG. Time: 16:00:59
Done. Time: 16:00:59
no such table: user
Getting user ids from AI. Time: 16:00:59
Done. Time: 16:01:00
no such table: user
Getting user ids from AL. Time: 16:01:00
Done. Time: 16:01:05
no such table: user
Getting user ids from AM. Time: 16:01:05

## Step 2: Scrape user information from their profile page. 
Scrapes user information from their profile page. 
Refer to https://dbdiagram.io/d/osu-654e8e887d8bbd6465f40357 for stored information


In [10]:
from classes import User, Beatmap, Score, Beatmapset
%load_ext autoreload
%autoreload 2

In [4]:

conn = sqlite3.connect('data/osu_recommender.db')
cursor = conn.cursor()
user_ids = cursor.execute("SELECT user_id FROM leaderboard_players").fetchall()
user_ids = [user_id[0] for user_id in user_ids]
completed_ids = cursor.execute("SELECT id FROM user").fetchall()
completed_ids = [user_id[0] for user_id in completed_ids]

user_ids = list(set(user_ids) - set(completed_ids))

num_partitions = 5
size_partition = len(user_ids) // num_partitions
p_user_ids = [user_ids[i:i + size_partition] for i in range(0, (len(user_ids)//num_partitions)*num_partitions, size_partition)]
if len(user_ids) % num_partitions != 0:
    p_user_ids[-1].extend(user_ids[(len(user_ids)//num_partitions)*num_partitions:])
    
num_done = 0
last_time = time.time()

def scrape_players(ids):
    """
    Adds user and score data to db after scraping.
    ids: list of ids to scrape
    """
    global num_done
    global last_time
    conn = sqlite3.connect('data/osu_recommender.db')
    cursor = conn.cursor()
    
    for user_id in ids:
        try:
            user = User(api.user(user_id, mode ='osu', key = 'id'))
            top_scores = api.user_scores(user_id, type = 'best', mode = 'osu', limit = 100)
            scores = []
            
            for score in top_scores:
                try:
                    score = Score(score)
                    # beatmap and beatmapset are None for returned scores. Need to do this with seperate api call.
                    scores.append(score)
                except Exception as e:
                    print(f"Error creating score object: {e}")
                    continue
                
            # Insert everything into db
            try: 
                user.insert(cursor)
            except:
                print(f"Error inserting user {user} into db")
                continue
            
            for score in scores:
                try:
                    score.insert(cursor)
                except:
                    print(f"Error inserting score {score} into db")
                    continue
                
        except Exception as e:
            continue
        finally:
            conn.commit()
            
        num_done += 1
        if num_done % 100 == 0:
            print(str(num_done) + ": " + str(time.time() - last_time), strftime('%H:%M:%S', localtime(time.time())))
            last_time = time.time()
            
    
    conn.close()

In [5]:
with ThreadPoolExecutor(max_workers=num_partitions) as executor:
    for user_ids in p_user_ids:
        executor.submit(scrape_players, user_ids)

Error scraping user 24655709: api returned an error of `None` for a request to https://osu.ppy.sh/api/v2/users/24655709/osu?key=id
Error scraping user 20986289: api returned an error of `None` for a request to https://osu.ppy.sh/api/v2/users/20986289/osu?key=id
Error scraping user 34098677: api returned an error of `None` for a request to https://osu.ppy.sh/api/v2/users/34098677/osu?key=id
Error scraping user 28332859: api returned an error of `None` for a request to https://osu.ppy.sh/api/v2/users/28332859/osu?key=id
100: 32.73060154914856 16:24:28
200: 24.39741277694702 16:24:52
300: 28.55128574371338 16:25:21
400: 24.910887956619263 16:25:45
500: 28.994643449783325 16:26:14
600: 32.20376753807068 16:26:47
Error scraping user 29698312: api returned an error of `None` for a request to https://osu.ppy.sh/api/v2/users/29698312/osu?key=id
700: 28.168621063232422 16:27:15
800: 26.877935886383057 16:27:42
900: 26.972663640975952 16:28:09
1000: 26.30151343345642 16:28:35
1100: 23.9088385105

KeyboardInterrupt: 

Error inserting user User(id=10829486, username='My Angel AnthV', total_pp=3591.65, hit_acc=97.6052, ranked_score=11099188813, play_count=25494, playtime=2439946, count_100=1075898, count_50=121761, count_300=7178593, count_miss=284391, total_hits=8376252, country='CN', join_date='2017-09-11 09:11:31', update_date='2023-12-08 17:15:23') into db
Error inserting user User(id=17329042, username='MatChumski', total_pp=763.414, hit_acc=91.7046, ranked_score=249433772, play_count=6826, playtime=412811, count_100=173917, count_50=19498, count_300=941429, count_miss=45085, total_hits=1134844, country='CO', join_date='2020-06-08 17:54:43', update_date='2023-12-08 17:15:23') into db
Error inserting user User(id=24043340, username='lekkus', total_pp=748.521, hit_acc=94.1012, ranked_score=217137499, play_count=308, playtime=25819, count_100=10995, count_50=1296, count_300=65605, count_miss=2695, total_hits=77896, country='NL', join_date='2021-06-13 11:19:18', update_date='2023-12-08 17:15:24') int