In [2]:
import sqlite3
from ossapi import Ossapi
from time import strftime, localtime
import pandas as pd
from gensim.test.utils import datapath
from gensim import utils
import gensim
from gensim.models.callbacks import CallbackAny2Vec
from sklearn.model_selection import train_test_split
import os
import sys

sys.path.insert(0, '../') # Add parent directory because data is in parent directory

In [3]:
class ScoreGen:
    """Restartable iterable of per-user score "sentences" read from SQLite.

    Each iteration yields one list per distinct ``user_id`` found in the table
    ``{top_or_recent}_scores_{mode}``. Every element of that list is a token of
    the form ``"<beatmap_id>-<mods>"``, ordered by ``created_at`` descending,
    where ``mods`` has the bits in ``self.mods_removed`` cleared.

    A fresh database connection is opened for every pass and closed when the
    pass ends (normally, by an exception, or because the consumer stopped
    early), so the object can be iterated any number of times.

    Args:
        top_or_recent: Table-name prefix (the table is ``<prefix>_scores_<mode>``).
        mode: Table-name suffix.
        keepHD: When true the HD bit (8) is kept in the token; when false it is
            stripped together with the other removed mods.
        db_path: Path of the SQLite database file.

    Raises:
        ValueError: If ``top_or_recent`` or ``mode`` contains anything other
            than ASCII letters, digits and underscores. They are interpolated
            into the SQL text as part of a table name, which cannot be bound
            as a query parameter.
    """

    def __init__(self, top_or_recent = 'top', mode = 'std', keepHD = True, db_path = '../data/UserScores.db'):
        self.top_or_recent = self._checked_identifier(top_or_recent, 'top_or_recent')
        self.mode = self._checked_identifier(mode, 'mode')
        self.db_path = db_path

        # Mod bit flags.
        NF = 1
        HD = 8  # Removed only for no HD
        SD = 32
        NC = 512  # Defined for reference; not stripped in either configuration.
        SO = 4096
        PF = 16384
        SV2 = 536870912

        if keepHD:
            self.mods_removed = NF | SD | SO | PF | SV2
        else:
            self.mods_removed = NF | SD | HD | SO | PF | SV2

    @staticmethod
    def _checked_identifier(value, what):
        """Return ``value`` if it is safe to splice into a table name, else raise."""
        text = str(value)
        if not text or not all(ch.isascii() and (ch.isalnum() or ch == '_') for ch in text):
            raise ValueError(
                f"{what} must contain only ASCII letters, digits and underscores, got {value!r}"
            )
        return value

    def __iter__(self):
        table = f"{self.top_or_recent}_scores_{self.mode}"
        keep_mask = ~self.mods_removed

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            user_rows = cursor.execute(f"SELECT DISTINCT user_id FROM {table}").fetchall()
            for (user_id,) in user_rows:
                scores = cursor.execute(
                    f"SELECT beatmap_id, mods FROM {table} WHERE user_id = ? ORDER BY created_at DESC",
                    (user_id,),
                ).fetchall()

                yield [str(bm_id) + '-' + str(mods & keep_mask) for bm_id, mods in scores]
        finally:
            # Runs on exhaustion, on error, and when the generator is closed early.
            conn.close()

# Materialise the whole corpus in memory: one list of score tokens per user.
gen = ScoreGen()
sentences = list(gen)

In [4]:
class MonitorCallback(CallbackAny2Vec):
    """Print the loss contributed by each training epoch.

    ``model.get_latest_training_loss()`` returns a running total, so the loss
    of a single epoch is the difference between the total after that epoch and
    the total after the previous one.

    Attributes:
        epoch: Number of ``on_epoch_end`` calls seen so far; used as the label
            in the printed line.
        loss_previous_step: Running total observed at the end of the previous
            epoch (``0.0`` before the first epoch of a training run).
    """

    def __init__(self):
        self.epoch = 0
        # Defined up front so the attribute exists before the first epoch ends.
        self.loss_previous_step = 0.0

    def on_train_begin(self, model):
        """Reset the loss baseline at the start of a training run.

        gensim restarts its running loss from zero on every ``train()`` call,
        so a baseline carried over from an earlier run would make the first
        reported delta of the new run meaningless.
        """
        self.loss_previous_step = 0.0

    def on_epoch_end(
        self, model
    ):  # word2vec accumulates loss, so we need to subtract the previous loss
        """Print this epoch's share of the cumulative loss and advance the counter."""
        loss = model.get_latest_training_loss()
        print(
            "Loss after epoch {}: {}".format(
                self.epoch, loss - self.loss_previous_step
            )
        )
        self.epoch += 1
        self.loss_previous_step = loss


# Train the embedding on the per-user score sentences.
model = gensim.models.Word2Vec(
    # --- corpus ---
    sentences=sentences,
    # Raise for generic scores; lower for low-frequency scores (top or niche players).
    min_count=20,
    # --- architecture ---
    vector_size=15,
    window=100,
    # Author's note: skip-gram is significantly better for low-frequency words and
    # CBOW slightly better for high-frequency ones, but skip-gram with window=100
    # takes unreasonably long to train.
    sg=0,
    # Negative sampling rather than hierarchical softmax, for training speed on large datasets.
    hs=0,
    # --- training loop ---
    epochs=50,
    # Author's machine has 16 threads; unverified whether this helps (Cython path not confirmed).
    workers=16,
    compute_loss=True,
    callbacks=[MonitorCallback()],
)



Loss after epoch 0: 352532.375
Loss after epoch 1: 271912.3125
Loss after epoch 2: 270669.125
Loss after epoch 3: 277259.0625
Loss after epoch 4: 281700.375
Loss after epoch 5: 262041.125
Loss after epoch 6: 243979.625
Loss after epoch 7: 251841.25
Loss after epoch 8: 246119.5
Loss after epoch 9: 257464.75
Loss after epoch 10: 268937.0
Loss after epoch 11: 261969.75
Loss after epoch 12: 258850.25
Loss after epoch 13: 259014.75
Loss after epoch 14: 252468.75
Loss after epoch 15: 243152.0
Loss after epoch 16: 230815.5
Loss after epoch 17: 245442.5
Loss after epoch 18: 235175.5
Loss after epoch 19: 226398.0
Loss after epoch 20: 228922.0
Loss after epoch 21: 234063.5
Loss after epoch 22: 241622.5
Loss after epoch 23: 245390.5
Loss after epoch 24: 225986.5
Loss after epoch 25: 230523.5
Loss after epoch 26: 247625.5
Loss after epoch 27: 246951.5
Loss after epoch 28: 246342.5
Loss after epoch 29: 235261.5
Loss after epoch 30: 227203.0
Loss after epoch 31: 231854.0
Loss after epoch 32: 219300.

In [5]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# # word2vec_model = model
# # else:
#     # word2vec_model = gensim.models.Word2Vec.load("word2vec-pp/word2vec-pp")
# model = gensim.models.Word2Vec.load("w2v_model/w2v_model")
# model.wv.index_to_key[10000:20000]
# Use the model trained in this session and index its vectors for nearest-neighbour lookup.
word2vec_model = model
NN = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
NN.fit(word2vec_model.wv.vectors)

In [8]:
import sys
sys.path.insert(0, "../")
from data.classes import Score

# Credentials are read from the environment so they never end up in the notebook.
OSU_CLIENT_ID = os.environ.get("OSU_CLIENT_ID")
OSU_CLIENT_SECRET = os.environ.get("OSU_CLIENT_SECRET")
if not OSU_CLIENT_ID or not OSU_CLIENT_SECRET:
    # Fail here with a clear message instead of handing None to the API client.
    raise RuntimeError(
        "Set the OSU_CLIENT_ID and OSU_CLIENT_SECRET environment variables before running this cell."
    )
user_id = '28956125'

api = Ossapi(OSU_CLIENT_ID, OSU_CLIENT_SECRET)
top_scores = api.user_scores(user_id, type="best", mode="osu", limit=100)

top_scores = [Score(score) for score in top_scores]
top_scores.sort(key=lambda x: x.pp, reverse=True)

# Build "<beatmap_id>-<mods>" tokens for every returned score (up to 100; no further cut is applied).
# NOTE(review): the training tokens have some mod bits cleared (ScoreGen.mods_removed);
# confirm that Score.mods yields the same integer form, otherwise these tokens will not match the vocabulary.
top_scores = [
    str(score.beatmap_id) + "-" + str(score.mods) for score in top_scores
]

# key_to_index is a dict, so each membership test is a hash lookup rather than
# a linear scan of the index_to_key list.
top_scores_vec = [
    word2vec_model.wv[score]
    for score in top_scores
    if score in word2vec_model.wv.key_to_index
]
if not top_scores_vec:
    # The mean of an empty list is NaN, which would only fail later inside kneighbors.
    raise ValueError(
        f"None of the {len(top_scores)} top scores of user {user_id} are in the model vocabulary."
    )

# Represent the player by the mean of their known score vectors and query around it.
neighbor = NN.kneighbors([np.mean(top_scores_vec, axis=0)])

In [9]:
# Turn the neighbour indices back into "<beatmap_id>-<mods>" tokens and drop
# any the user already has among their top scores (so fewer than 5 may remain).
beatmaps = [
    candidate
    for candidate in (model.wv.index_to_key[idx] for idx in neighbor[1][0])
    if candidate not in top_scores
]

beatmaps

['554627-64', '709075-0', '489768-64', '630591-64', '1791108-64']