In [None]:
import fasttext as ft
import pandas as pd
import numpy as np
import ivis
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
import os
import json

In [None]:
# Parameters and paths

# --- file locations ---
# NOTE(review): the fastText file name ends in ".bin.bin" — confirm the doubled
# suffix matches the file on disk.
fasttext_path = "models/oscar_ft_model_dim1536_ws2.bin.bin"
supersense_path = "resources/supersenses.tsv"
output_path = "models/supersenser.bin"

# --- ivis hyper-parameters (each is forwarded to ivis.Ivis further down) ---
embedding_dimensions = 10   # -> embedding_dims
kNN = 30                    # -> k
ivis_model = "maaten"       # -> model
weight = 0.8                # -> supervision_weight
early_stoping = 20          # -> n_epochs_without_progress

In [None]:
# Load resources

# Pre-trained fastText model used later to embed each word.
ft_model = ft.load_model(fasttext_path)

# Tab-separated table; later cells read its "wort" and "Klasse" columns.
# NOTE(review): with pandas' default NA handling, cells such as "null", "NA" or
# "nan" are parsed as missing values — confirm that is acceptable for a word list.
data = pd.read_csv(supersense_path, delimiter="\t")

In [None]:
# data selection

# Draw a class-balanced sample of row labels: at most `samples_per_class` rows
# from every "Klasse" group, without replacement.
samples_per_class = 900
gps = data.groupby("Klasse")
ix = np.hstack([
    # Cap the draw at the group size: asking for more rows than a class has
    # makes np.random.choice(..., replace=False) raise ValueError.
    np.random.choice(v, min(samples_per_class, len(v)), replace=False)
    for v in gps.groups.values()
])
# NOTE(review): `ix` is not referenced by any later cell of this notebook, so the
# balanced sample is currently not applied to the training data — confirm whether
# the split below was meant to run on `data.loc[ix]`.

In [None]:
# build class to integer dictionary

# Number the classes in sorted order. Iterating a bare set of strings follows
# hash order, which changes between interpreter runs, so the ids written to the
# JSON file would otherwise differ from run to run.
# key=str keeps the sort well-defined even if the column holds non-string values.
class_names = sorted(set(data["Klasse"]), key=str)
l_dict = {c: i for i, c in enumerate(class_names)}

# Write the class -> id mapping to disk as JSON.
with open("resources/supersense_dict.json","w") as f:
    json.dump(l_dict,f)

In [None]:
# train/test split

# Fixed seed so the 90/10 stratified split (and therefore the evaluation below)
# is the same on every Restart & Run All.
split_seed = 42

X = data["wort"]
# Integer class id for every row, looked up in the class -> id dictionary.
Y = np.array([l_dict[x] for x in data["Klasse"]])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=split_seed,
)

In [None]:
# to fasttext vectors

# Look up one fastText vector per word; str() coerces each entry to text first.
train_rows = [ft_model[str(word)] for word in X_train]
test_rows = [ft_model[str(word)] for word in X_test]
X_train_vec = np.stack(train_rows)
X_test_vec = np.stack(test_rows)

# Training matrix = labelled train rows followed by the test rows.
X_all = np.concatenate((X_train_vec, X_test_vec))

# Test rows get the label -1 (presumably ivis' marker for "unlabelled" points —
# confirm against the ivis semi-supervised docs).
unlabelled = np.array([-1] * len(X_test))
Y_semi = np.concatenate((y_train, unlabelled))

In [None]:
# init model
# Collect the constructor arguments from the parameter cell, then build the
# (still unfitted) ivis estimator from them.
ivis_kwargs = {
    "embedding_dims": embedding_dimensions,
    "k": kNN,
    "model": ivis_model,
    "supervision_weight": weight,
    "n_epochs_without_progress": early_stoping,
}
model = ivis.Ivis(**ivis_kwargs)

In [None]:
# fit model
# Train on the combined embeddings; the label vector is copied into a fresh
# array before being handed to ivis.
semi_labels = np.array(Y_semi)
model.fit(X_all, semi_labels)

In [None]:
# evaluate

# One score per class for every held-out row; the argmax over axis 1 is taken
# as the predicted class id.
y_pred = model.score_samples(X_test_vec)

# classification_report expects (y_true, y_pred) in that order. Passing the
# predictions first swaps precision and recall in the printed table.
print(classification_report(y_test, np.argmax(y_pred, axis=1)))

In [None]:
# save
# Persist the fitted ivis model under `output_path`.
# NOTE(review): ivis' save_model appears to refuse an already existing target
# unless overwrite=True is passed — confirm, since a second Run All would then
# fail here after the full training run.
model.save_model(output_path)