In [None]:
from ossapi import Ossapi
import pandas as pd
import gensim
from gensim.models.callbacks import CallbackAny2Vec
import os
from sklearn.neighbors import NearestNeighbors

import numpy as np
import sys

sys.path.insert(0, "../")  # Add parent directory because data is in parent directory

In [None]:
# Load the training corpus from parquet; each row of the "sentences" column
# is an iterable of tokens.
df = pd.read_parquet("../data/recent_sentences_std2.parquet", engine="pyarrow")
# Materialise every row as a plain Python list before it is handed to Word2Vec.
sentences = [list(tokens) for tokens in df["sentences"].tolist()]

In [None]:
class MonitorCallback(CallbackAny2Vec):
    """Print the training loss contributed by each individual epoch.

    word2vec accumulates loss across epochs, so the value reported for an
    epoch is the running total returned by ``get_latest_training_loss()``
    minus the running total recorded at the end of the previous epoch.
    """

    def __init__(self):
        self.epoch = 0  # index of the epoch currently being trained
        # Running total seen at the end of the previous epoch. Starting at 0
        # means the first epoch reports the total unchanged, and the
        # attribute exists from construction instead of appearing mid-training.
        self.loss_previous_step = 0

    def on_epoch_end(self, model):
        """Print this epoch's loss and remember the running total.

        Args:
            model: the model being trained; only
                ``get_latest_training_loss()`` is called on it.
        """
        loss = model.get_latest_training_loss()
        print(
            "Loss after epoch {}: {}".format(
                self.epoch, loss - self.loss_previous_step
            )
        )
        self.epoch += 1
        self.loss_previous_step = loss


# Training hyperparameters, gathered in one place so they are easy to scan and tune.
w2v_params = dict(
    vector_size=15,  # dimensionality of each embedding
    epochs=100,  # passes over the corpus
    window=20,  # maximum distance between the current and predicted token
    min_count=2,  # ignore tokens that occur fewer than 2 times
    workers=16,  # training threads
    sg=0,  # 0 selects CBOW rather than skip-gram
    hs=0,  # no hierarchical softmax; with negative > 0 this means negative sampling
    negative=20,  # noise tokens drawn per positive example
    ns_exponent=1,  # draw noise tokens in proportion to their frequency
    compute_loss=True,  # store the loss so the callback can read it
)

# Train the embedding; MonitorCallback prints the loss after every epoch.
model = gensim.models.Word2Vec(
    sentences=sentences,
    callbacks=[MonitorCallback()],
    **w2v_params,
)

# loss = model.get_latest_training_loss()

In [None]:
# Save the trained model; the path is relative to the current working directory.
model_path = "recent_word2vec_1.model"
model.save(model_path)

In [None]:
from sklearn.manifold import TSNE  # final reduction
import random
import matplotlib.pyplot as plt


def reduce_dimensions(model, n_points=3000, seed=None):
    """Project a random subset of the model's vectors to 2-D with t-SNE.

    Args:
        model: object exposing ``wv.vectors`` and ``wv.index_to_key`` as
            parallel sequences (vector ``i`` belongs to key ``i``).
        n_points: maximum number of vectors to embed, to keep t-SNE fast.
            Clamped to the vocabulary size, so small vocabularies no longer
            make ``random.sample`` raise.
        seed: optional seed for the subset selection. ``None`` (the default)
            draws from the module-level ``random`` generator, as before.

    Returns:
        ``(x_vals, y_vals, labels)``: two lists of coordinates and a NumPy
        array of the matching keys, all in the same order.

    Raises:
        ValueError: if fewer than two vectors are available to embed.
    """
    num_dimensions = 2

    pairs = list(zip(model.wv.vectors, model.wv.index_to_key))

    # Limit points for performance, but never ask for more than exist.
    n_points = min(n_points, len(pairs))
    if n_points < 2:
        raise ValueError(
            "need at least 2 vectors to run t-SNE, got {}".format(n_points)
        )

    # A private generator keeps a seeded draw from touching global random state.
    sampler = random if seed is None else random.Random(seed)
    vectors_sub, labels_sub = zip(*sampler.sample(pairs, n_points))

    vectors = np.asarray(vectors_sub)
    labels = np.asarray(labels_sub)  # fixed-width numpy strings

    # reduce using t-SNE. scikit-learn requires perplexity < n_samples; 30.0 is
    # its default, so this only differs from the default for tiny samples.
    perplexity = min(30.0, float(n_points - 1))
    tsne = TSNE(n_components=num_dimensions, random_state=0, perplexity=perplexity)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


def plot_with_matplotlib(x_vals, y_vals, labels, n_labels=25):
    """Scatter-plot the 2-D points and annotate a random subset of them.

    Args:
        x_vals: x coordinates of the points.
        y_vals: y coordinates, parallel to ``x_vals``.
        labels: label of each point, parallel to ``x_vals``.
        n_labels: how many points to annotate. Clamped to the number of
            points, so fewer than ``n_labels`` points no longer raises.
    """
    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # A private generator seeded with 0 selects the same points on every run
    # without reseeding the global `random` module as a side effect.
    rng = random.Random(0)

    # Label a randomly subsampled set of data points.
    n_labels = max(0, min(n_labels, len(labels)))
    for i in rng.sample(range(len(labels)), n_labels):
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))


x_vals, y_vals, labels = reduce_dimensions(model)
plot_with_matplotlib(x_vals, y_vals, labels)

In [None]:
# # word2vec_model = model
# # else:
#     # word2vec_model = gensim.models.Word2Vec.load("word2vec-pp/word2vec-pp")
# model = gensim.models.Word2Vec.load("w2v_model/w2v_model")
# model.wv.index_to_key[10000:20000]
# Index every embedding in a ball tree; each query returns the 50 nearest vectors.
word2vec_model = model
NN = NearestNeighbors(n_neighbors=50, algorithm="ball_tree")
NN.fit(word2vec_model.wv.vectors)

In [None]:
import sys

sys.path.insert(0, "../")
from data.classes import Score

# API credentials come from the environment so they never live in the notebook.
OSU_CLIENT_ID = os.environ.get("OSU_CLIENT_ID")
OSU_CLIENT_SECRET = os.environ.get("OSU_CLIENT_SECRET")
if not OSU_CLIENT_ID or not OSU_CLIENT_SECRET:
    # Fail here with a clear message instead of passing None to the API client.
    raise RuntimeError(
        "Set the OSU_CLIENT_ID and OSU_CLIENT_SECRET environment variables "
        "before running this cell."
    )
user_id = "28956125"

# Fetch the user's best scores and order them by pp, highest first.
api = Ossapi(OSU_CLIENT_ID, OSU_CLIENT_SECRET)
top_scores = api.user_scores(user_id, type="best", mode="osu", limit=100)

top_scores = [Score(score) for score in top_scores]
top_scores.sort(key=lambda x: x.pp, reverse=True)

# top_scores = [
#     str(score.beatmap_id) + "-" + str(score.mods) for score in top_scores # Limit to top 50
# ]

# NOTE(review): this hard-coded key replaces the scores fetched above, so the
# API result is currently unused. Restore the conversion above to query with
# the user's real scores.
top_scores = ["1872812-0"]

# Keep only keys the model knows; key_to_index is a dict, so each membership
# test is O(1) instead of a scan over the index_to_key list.
top_scores_vec = [
    word2vec_model.wv[score]
    for score in top_scores
    if score in word2vec_model.wv.key_to_index
]
if not top_scores_vec:
    # np.mean of an empty list is NaN, which would only fail later inside kneighbors.
    raise ValueError("None of the scores are present in the model vocabulary.")

# Query the neighbours of the mean of the known score vectors.
neighbor = NN.kneighbors([np.mean(top_scores_vec, axis=0)])

In [None]:
# Translate the neighbour indices back into vocabulary keys, dropping any key
# that was part of the query itself. No further limit is applied here.
beatmaps = [
    key
    for key in (model.wv.index_to_key[idx] for idx in neighbor[1][0])
    if key not in top_scores
]

beatmaps