# Nearest Neighbors Prediction for Beatmap in Top Plays


In [None]:
import sqlite3
from ossapi import Ossapi
import gensim
from gensim.models.callbacks import CallbackAny2Vec

%load_ext cython

# Word2Vec


In [None]:
# Each sentence is a list of one user's top-play tokens ("<beatmap_id>-<mods>"), ordered by pp descending and limited to the top 100 plays.
# Sentences is an iterable of such sentences.
class Corpus:
    """Single-pass iterable of per-user "sentences" for Word2Vec training.

    Each sentence is a list of ``"<beatmap_id>-<mods>"`` strings built from one
    user's scores, highest pp first and capped at 100 scores. Users are
    visited in ascending ``user_id`` order.

    The SQLite connection is opened in ``__init__`` and closed as soon as an
    iteration ends (normally, through an error, or because the generator is
    closed early), so an instance can be iterated only once.

    Args:
        db_path: Path of the SQLite database holding the ``scores`` and
            ``users`` tables.
    """

    def __init__(self, db_path="../data/osu.db"):
        self.conn = sqlite3.connect(db_path)
        self.cursor = self.conn.cursor()
        # Temp view (private to this connection) restricting scores to users
        # that are present in the users table.
        query = """CREATE TEMP VIEW user_scores AS SELECT beatmap_id, mods, scores.user_id, pp FROM scores JOIN users on scores.user_id = users.user_id"""
        self.cursor.execute(query)

    def __iter__(self):
        """Yield one list of ``"<beatmap_id>-<mods>"`` tokens per user."""
        # Bit flags cleared from every score's mods value before it becomes
        # part of a token.
        NF = 1
        HD = 8  # Removed only for no HD
        SD = 32
        # NC = 512
        SO = 4096
        PF = 16384
        SV2 = 536870912
        # standard_removed_mods = NF | SD | SO | PF | SV2
        noHD_removed_mods = NF | SD | HD | SO | PF | SV2

        try:
            # Fetch all user ids first so the cursor is free for the
            # per-user queries below.
            user_rows = self.cursor.execute(
                "SELECT user_id FROM users ORDER BY user_id ASC"
            ).fetchall()

            for (user_id,) in user_rows:
                scores = self.cursor.execute(
                    "SELECT beatmap_id, mods FROM user_scores WHERE user_id = ? ORDER BY pp DESC LIMIT 100",
                    (user_id,),
                ).fetchall()

                yield [
                    str(bm_id) + "-" + str(mod_enum & ~noHD_removed_mods)
                    for bm_id, mod_enum in scores
                ]
        finally:
            # Runs on exhaustion, on error, and when the generator is closed
            # early, so the connection is never leaked.
            self.conn.close()


# Materialise every sentence in memory: Corpus closes its database connection
# after one pass, while training below runs for many epochs.
gen = Corpus()
sentences = list(gen)

In [None]:
class MonitorCallback(CallbackAny2Vec):
    """Print the training loss attributable to each finished epoch.

    word2vec accumulates loss, so the value read at the end of the previous
    epoch is subtracted from the latest reading to get the per-epoch figure.
    """

    def __init__(self):
        super().__init__()
        # Number of epochs completed so far (also used as the printed index).
        self.epoch = 0
        # Accumulated loss seen at the end of the previous epoch; None until
        # the first epoch has finished.
        self.loss_previous_step = None

    def on_epoch_end(self, model):
        """Report this epoch's loss and remember the running total.

        Args:
            model: The model being trained; only
                ``model.get_latest_training_loss()`` is called on it.
        """
        loss = model.get_latest_training_loss()
        if self.loss_previous_step is None:
            # First epoch: nothing to subtract yet.
            epoch_loss = loss
        else:
            epoch_loss = loss - self.loss_previous_step
        print("Loss after epoch {}: {}".format(self.epoch, epoch_loss))
        self.epoch += 1
        self.loss_previous_step = loss


model = gensim.models.Word2Vec(
    sentences=sentences,
    # --- architecture ---
    # sg=0 selects CBOW. Skip-gram is significantly better for low-frequency
    # words and CBOW slightly better for high-frequency ones, but skip-gram
    # training time is unreasonable with window=100.
    sg=0,
    hs=0,  # Negative sampling for large datasets (training speed).
    vector_size=15,
    window=100,
    # Higher for generic scores, lower for low-frequency scores
    # (top players / niche players).
    min_count=20,
    # --- training ---
    epochs=50,
    # 16-thread machine; unclear whether this helps (depends on the Cython
    # extensions actually being in use).
    workers=16,
    compute_loss=True,
    callbacks=[MonitorCallback()],
)

In [None]:
import os

# Save the trained model as <model_name>/<model_name>.
model_name = "w2v_model_noHD_15d_50e"
# exist_ok=True lets this cell be re-run: a plain os.mkdir raises
# FileExistsError once the directory exists. Re-running overwrites the
# previously saved model of the same name.
os.makedirs(model_name, exist_ok=True)
model.save(os.path.join(model_name, model_name))

## Visualize word2vec


In [None]:
from sklearn.manifold import TSNE  # final reduction
import numpy as np
import random
import matplotlib.pyplot as plt


def reduce_dimensions(model, n_points=5000):
    """Project a random subset of the model's embeddings to 2-D with t-SNE.

    Args:
        model: Trained model; ``model.wv.vectors`` and
            ``model.wv.index_to_key`` are read.
        n_points: Maximum number of embeddings to sample (kept small for
            performance). Clamped to the vocabulary size, so a small
            vocabulary no longer makes ``random.sample`` raise ``ValueError``.

    Returns:
        A tuple ``(x_vals, y_vals, labels)``: two lists with the 2-D
        coordinates and a numpy array with the matching vocabulary keys, all
        aligned by index.
    """
    num_dimensions = 2

    keys = model.wv.index_to_key
    sample_size = min(n_points, len(keys))
    # Sample positions rather than (vector, key) pairs so the whole
    # vocabulary does not have to be zipped into a temporary list.
    picked = random.sample(range(len(keys)), sample_size)

    vectors = np.asarray(model.wv.vectors)[picked]
    labels = np.asarray([keys[i] for i in picked])  # fixed-width numpy strings

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    embedded = tsne.fit_transform(vectors)

    x_vals = list(embedded[:, 0])
    y_vals = list(embedded[:, 1])
    return x_vals, y_vals, labels


def plot_with_matplotlib(x_vals, y_vals, labels):
    """Scatter-plot 2-D points and annotate a reproducible random subset.

    Args:
        x_vals: X coordinates, one per point.
        y_vals: Y coordinates, aligned with ``x_vals``.
        labels: Text labels, aligned with the coordinates.

    Up to 25 points are annotated; when fewer points are given, all of them
    are annotated instead of raising ``ValueError``.
    """
    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # Dedicated RNG seeded with 0: the labelled subset is the same on every
    # run, and the global `random` state is left untouched.
    rng = random.Random(0)
    label_count = min(25, len(labels))
    for i in rng.sample(range(len(labels)), label_count):
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))


# Reduce a sample of the embeddings to 2-D, then draw them.
x_vals, y_vals, labels = reduce_dimensions(model)
plot_with_matplotlib(x_vals=x_vals, y_vals=y_vals, labels=labels)

# Nearest Neighbors Prediction


In [None]:
from sklearn.neighbors import NearestNeighbors

# Index every embedding for 5-nearest-neighbour lookups (ball tree).
# To query a model saved on disk instead of the one trained in this session,
# load it first, e.g.:
# model = gensim.models.Word2Vec.load("w2v_model/w2v_model")
word2vec_model = model
NN = NearestNeighbors(n_neighbors=5, algorithm="ball_tree")
NN.fit(word2vec_model.wv.vectors)

In [None]:
import sys

sys.path.insert(0, "../")
from data.classes import Score
from osu_access_token import client_id, client_secret

# Fetch one user's best scores (up to 100) and find the embeddings closest to
# the mean embedding of those scores.
user_id = "1009285"
api = Ossapi(client_id, client_secret)
top_scores = api.user_scores(user_id, type="best", mode="osu", limit=100)

top_scores = [Score(score) for score in top_scores]
top_scores.sort(key=lambda x: x.pp, reverse=True)

# Same "<beatmap_id>-<mods>" token format as the training sentences.
# NOTE(review): the training corpus clears the NF/SD/HD/SO/PF/SV2 bits from
# mods before building tokens; confirm Score.mods is masked the same way,
# otherwise scores carrying those mods will never match the vocabulary.
top_scores = [
    str(score.beatmap_id) + "-" + str(score.mods) for score in top_scores
]

# key_to_index is a dict, so membership is O(1) instead of a linear scan of
# the index_to_key list for every score.
top_scores_vec = [
    word2vec_model.wv[score]
    for score in top_scores
    if score in word2vec_model.wv.key_to_index
]

if not top_scores_vec:
    # Without this guard the mean of an empty list is NaN and the
    # nearest-neighbour query fails with a far less helpful error.
    raise ValueError(
        "None of user {}'s top scores are in the model vocabulary".format(user_id)
    )

neighbor = NN.kneighbors([np.mean(top_scores_vec, axis=0)])

In [None]:
# Translate the neighbour indices back into "<beatmap_id>-<mods>" tokens and
# drop any the user already has among their top scores.
beatmaps = [
    token
    for token in (model.wv.index_to_key[i] for i in neighbor[1][0])
    if token not in top_scores
]

beatmaps

# Compare

Select random players at different pp levels from the test set, and compare their top plays to the average pp of scores with the predicted beatmap_id and mods combination.


In [None]:
def compare(model, user_id, NN):
    """Print a user's total pp next to the pp of the top recommended beatmap.

    The user's best scores (up to 100) are fetched from the osu! API, turned
    into ``"<beatmap_id>-<mods>"`` tokens and averaged in embedding space. The
    nearest embedding is taken as the recommendation, and the pp of every
    stored score with that beatmap_id/mods pair is read from the local
    database and averaged.

    Args:
        model: Trained Word2Vec model whose ``wv`` vectors ``NN`` was fitted on.
        user_id: osu! user id to evaluate.
        NN: Fitted nearest-neighbours index over ``model.wv.vectors``.

    Returns:
        None. Results are printed. If none of the user's scores are in the
        vocabulary, a message is printed and nothing else is done.
    """
    # total_pp
    api = Ossapi(client_id, client_secret)
    total_pp = api.user(user_id).statistics.pp

    # average of recommended beatmaps
    top_scores = [
        Score(score)
        for score in api.user_scores(user_id, type="best", mode="osu", limit=100)
    ]
    top_scores.sort(key=lambda x: x.pp, reverse=True)
    tokens = [str(score.beatmap_id) + "-" + str(score.mods) for score in top_scores]

    # Use the model passed in (not a notebook global); key_to_index gives
    # O(1) vocabulary membership.
    known_vectors = [
        model.wv[token] for token in tokens if token in model.wv.key_to_index
    ]
    if not known_vectors:
        print("User", user_id, "has no top scores in the model vocabulary; skipping")
        return

    neighbor = NN.kneighbors([np.mean(known_vectors, axis=0)])
    beatmaps = [model.wv.index_to_key[i] for i in neighbor[1][0]]

    # Only the closest neighbour is evaluated.
    beatmap_id, mods = beatmaps[0].split("-")[:2]

    conn = sqlite3.connect("../data/osu.db")
    try:
        # Table name is `scores`, matching the table the training corpus
        # reads (the previous `score` is not referenced anywhere else).
        # mods is bound as an int so it compares equal to the stored integers.
        rows = conn.execute(
            "SELECT pp FROM scores WHERE beatmap_id = ? AND mods = ?",
            (beatmap_id, int(mods)),
        ).fetchall()
    finally:
        conn.close()
    all_pp = [row[0] for row in rows]

    print("User pp/20: ", total_pp / 20)
    if not all_pp:
        # Avoid dividing by zero when the database has no matching scores.
        print("Avg.: n/a (no stored scores for " + beatmaps[0] + ")")
        return
    print("Avg.: ", sum(all_pp) / len(all_pp), "Beatmap pp: ", all_pp)


# Hand-picked (informally random) user_ids with pp ranging from ~15k to ~2k
user_ids = [
    "13767572",
    "10073635",
    "8359561",
    "2944449",
    "9987634",
    "7236907",
    "12900463",
    "12363937",
    "13741560",
    "28956125",
    "13859779",
    "2146481",
    "24195234",
    "10577632",
]
# Print the comparison for each user in turn.
for user_id in user_ids:
    compare(model=model, user_id=user_id, NN=NN)