# KNN w/ AWDLSTM

In this notebook we train an AWD-LSTM model for the proxy task, with language-model pretraining on IMSLP and language-model fine-tuning on the target data. The learned token embeddings are then inspected with a t-SNE plot and a k-nearest-neighbour analogy search. This notebook has been adapted from the fast.ai [ULMFit tutorial](https://github.com/fastai/course-nlp/blob/master/nn-vietnamese.ipynb).

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *
import glob
import eval_models

In [None]:
# Batch size; passed as `bs=` when the databunches are loaded/built below.
bs=48

In [None]:
# Make CUDA device index 1 the current device, then echo the current device
# index as the cell output.
# NOTE(review): the index is hard-coded; presumably this needs at least two
# visible GPUs -- confirm before running on another machine.
torch.cuda.set_device(1)
torch.cuda.current_device()

In [None]:
# Root data directory. `Config` is not imported by name in this notebook;
# presumably it comes from the fastai star-imports above -- confirm.
data_path = Config.data_path()

In [None]:
# Working directory for this experiment: <data_path>/bscore_lm.
name = 'bscore_lm'
path = data_path/name
# Create the directory (and any missing parents); no error if it already exists.
path.mkdir(exist_ok=True, parents=True)

### IMSLP Language Model

In [None]:
# Load the saved databunch 'lm_imslp_databunch-augmented' from `path`, using
# the batch size chosen above.
data = load_data(path, 'lm_imslp_databunch-augmented', bs=bs)

In [None]:
# Build an AWD-LSTM language-model learner on the IMSLP databunch with
# pretrained=False and drop_mult=0.5.
# NOTE(review): `learn` is not referenced again in the visible cells.
learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5, pretrained=False)

In [None]:
# Read the train/validation/test splits from CSV files under `path`.
# NOTE(review): `test_df` is not used by any visible cell.
train_df = pd.read_csv(path/'train64.csv')
valid_df = pd.read_csv(path/'valid64.csv')
test_df = pd.read_csv(path/'test64.csv')

In [None]:
# Tokenizer with empty pre- and post-processing rule lists.
basicTokenizer = Tokenizer(pre_rules=[], post_rules=[])
# Classification databunch built from the train/valid dataframes, sharing the
# vocabulary of `data_target_lm`.
# NOTE(review): `data_target_lm` is not defined in any visible cell; it must
# already exist in the notebook namespace or this line raises NameError.
data_clas = TextDataBunch.from_df(path, train_df, valid_df, tokenizer=basicTokenizer,
                                  vocab = data_target_lm.vocab, bs=bs, num_workers=1)

In [None]:
# Base name of the fine-tuned language model whose encoder is loaded below.
ftmodel_basename = 'awdlstm_train-imslp_finetune-target_lm-augmented'
# AWD-LSTM classifier learner (pretrained=False) reporting accuracy and
# macro-averaged F1 (FBeta with beta=1).
learn_c = text_classifier_learner(data_clas, AWD_LSTM, pretrained=False, drop_mult=0.5, 
                                  metrics=[accuracy, FBeta(average='macro', beta=1)])
# NOTE(review): the encoder name is the base name immediately followed by
# 'enc' (no separator) -- confirm this matches the saved file name.
learn_c.load_encoder(f'{ftmodel_basename}enc')
learn_c.freeze()

In [None]:
from fastai.callbacks.hooks import *
from sklearn.manifold import TSNE
from plotnine import *
from matplotlib import pyplot as plt
# Load the saved classifier weights into `learn_c`.
# NOTE(review): this name ends in '-augment' whereas the other saved names in
# this notebook end in '-augmented' -- confirm the checkpoint name.
learn_c.load('awdlstm_train-imslp_finetune-target_clas-augment')

In [None]:
# Drill down from the classifier to a weight matrix: first module of the
# model -> its first child -> that child's `.encoder.weight`.
# NOTE(review): presumably this is the token-embedding matrix (one row per
# vocab id), since rows are later indexed by numericalized token ids -- confirm.
net = learn_c.model
encoder = net[0]
enc = list(encoder.children())[0]
w = enc.encoder.weight
vocab = learn_c.data.vocab
# Move to CPU and detach from autograd so the weights can be used as a NumPy array.
tensor = w.cpu().detach().numpy()

In [None]:
def encodeColumn(arr):
    """Pack a binary column into a single integer.

    Every element equal to 1 becomes a set bit and every other value a
    cleared bit. The first element is the most significant bit.

    Args:
        arr: One-dimensional sequence (list or array) of values.

    Returns:
        int: The bit pattern interpreted as a base-2 number; 0 for an
        empty input.
    """
    bitstring = "".join("1" if value == 1 else "0" for value in arr)
    # int("", 2) raises ValueError, so an empty column is treated as all-zero.
    return int(bitstring, 2) if bitstring else 0

In [None]:
# Collect one weight-matrix row per single-position column: for each of the 62
# positions, build a one-hot column, encode it as an integer, look up the
# decimal string of that integer in the vocab, and keep the matching row.
points = []
labels = []
for idx in range(62):
    bscore_column = np.zeros(62)
    bscore_column[idx] = 1
    bscore_int = encodeColumn(bscore_column)
    # numericalize takes a list of tokens; take the id of the single token.
    id_num = vocab.numericalize([str(bscore_int)])[0]
    points.append(tensor[id_num])
    # The label is the position of the set bit, so labels[i] == i.
    labels.append(idx)

In [None]:
def plot_tSNE(x, y):
    """Project vectors to two dimensions with t-SNE and build a labelled scatter plot.

    Args:
        x: Samples to embed; passed unchanged to ``TSNE.fit_transform``.
        y: One label per sample, drawn as text nudged above each point.

    Returns:
        The plotnine ``ggplot`` object (black-and-white theme, no legend,
        no axis text, titles or ticks).
    """
    # Fixed random_state so every run uses the same seed.
    projected = TSNE(n_components=2, random_state=0).fit_transform(x)
    tsne_df = pd.DataFrame({'X': projected[:, 0],
                            'Y': projected[:, 1],
                            'composer': y})
    # NOTE(review): this opens a matplotlib figure that is not passed to
    # ggplot below; presumably it does not control the plot size -- confirm.
    plt.figure(figsize=(16, 10))
    return (ggplot(tsne_df, aes(x="X", y="Y")) +
        geom_point(alpha=0.8) +
        geom_text(aes(label=y), position=position_nudge(y=5), size=5) +
        theme_bw() +
        guides(colour=guide_legend(override_aes={'alpha': 1})) +
        theme(dpi=300, legend_position="none",
            axis_text_x=element_blank(),
            axis_text_y=element_blank(),
            axis_title_x=element_blank(),
            axis_title_y=element_blank(),
            axis_ticks=element_blank()))

In [None]:
def map_labels(labels):
    """Return the note name produced by ``int2note`` for each index in *labels*.

    The output is a new list in the same order as the input.
    """
    return [int2note(position) for position in labels]

In [None]:
def int2note(i):
    """Convert a position index into a note-name string.

    The name is a letter A-G followed by an octave number. Indices 18-27
    get an ``(L)`` suffix and indices 28-37 an ``(R)`` suffix; all other
    indices have no suffix.

    Args:
        i: Integer position index.

    Returns:
        str: For example ``'A0'`` for 0, ``'E3(L)'`` for 18, ``'E3(R)'`` for 28.
    """
    letters = 'ABCDEFG'
    if i >= 28:
        # Upper range: shift the index so it lines up with the letter cycle.
        pos = i - 28 + 4 + 21
        octave = int(pos / 7) - 1
        suffix = '(R)' if i <= 37 else ''
    else:
        pos = i
        octave = int(pos / 7)
        suffix = '(L)' if i >= 18 else ''
    # The octave number increases at C (letter index 2), not at A.
    if pos % 7 >= 2:
        octave += 1
    return f'{letters[pos % 7]}{octave}{suffix}'

In [None]:
# Convert the position indices to note names and plot the t-SNE projection of
# the collected rows, labelling each point with its note name.
labels1 = map_labels(labels)
plot_tSNE(points,labels1)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 5-neighbour KNN on the single-position rows collected above.
# Each label equals its sample's index in `points` (labels[i] == i).
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(points,labels)

In [None]:
# Example query: the 5 nearest neighbours of the row for position 7.
# NOTE(review): the original label here was "E4", but int2note(7) returns
# 'A1' -- the label looks stale; confirm which note was intended.
#E4
bscore_column = np.zeros(62)
bscore_column[7] = 1
bscore_int = encodeColumn(bscore_column)
id_num = vocab.numericalize([str(bscore_int)])[0]
# kneighbors is given a single sample reshaped to (1, n_features).
vec1 = tensor[id_num].reshape(1,-1)
y_pred1 = classifier.kneighbors(vec1, n_neighbors=5)
y_pred1

In [None]:
def getNeighbors(vec, n=5):
    """Query the module-level ``classifier`` for the *n* nearest neighbours of *vec*.

    Args:
        vec: Query passed unchanged to ``classifier.kneighbors``.
        n: Number of neighbours to request (default 5).

    Returns:
        Whatever ``classifier.kneighbors`` returns; callers in this notebook
        unpack it as a ``(distances, neighbours)`` pair.
    """
    return classifier.kneighbors(vec, n_neighbors=n)

In [None]:
def find_analogies():
    """Search for octave analogies among the single-position vectors.

    Analogies are of the form E4 - E5 + A4 = X. For every base position
    ``idx`` and every target ``p3`` lying 1-6 positions above ``idx + 7``,
    the vector ``vec(idx) - vec(idx + 7) + vec(p3)`` is looked up with
    ``getNeighbors``. The pair is kept when ``p3 - 7`` appears among the
    returned neighbours.

    Returns:
        list: ``[idx, p3]`` pairs for which the analogy holds, in search order.
    """
    n_positions = 62  # length of the column built by getCol
    analogies = []
    for idx in range(n_positions - 7):
        p1, p2 = idx, idx + 7
        # The difference depends only on idx, so compute it once per base position.
        octave_delta = getvec(getCol([p1])) - getvec(getCol([p2]))
        # Targets at or beyond n_positions do not exist; the range bound skips them.
        for p3 in range(p2 + 1, min(p2 + 7, n_positions)):
            _, neighbors = getNeighbors(octave_delta + getvec(getCol([p3])))
            if p3 - 7 in neighbors:
                analogies.append([idx, p3])
    return analogies

In [None]:
def getCol(l):
    """Build a length-62 float column with ones at the positions listed in *l*.

    Args:
        l: Iterable of integer indices to set.

    Returns:
        numpy.ndarray: Zeros everywhere except 1.0 at the given indices.
    """
    column = np.zeros(62)
    positions = list(l)
    if positions:
        column[positions] = 1
    return column

In [None]:
def getvec(col):
    """Return the weight-matrix row for a column, shaped ``(1, n_features)``.

    The column is encoded with ``encodeColumn``, the decimal string of the
    result is numericalized with the module-level ``vocab``, and the
    resulting id indexes the module-level ``tensor``.
    """
    token = str(encodeColumn(col))
    row_id = vocab.numericalize([token])[0]
    return tensor[row_id].reshape(1, -1)

In [None]:
# Run the analogy search; the result is a list of [idx, p3] position pairs.
analogies = find_analogies()

In [None]:
# Print each analogy as
#   <n1>-<n2 with octave number minus 1>=<n1 with octave number plus 1>-<n2>
# The shifted names are rebuilt from only the first two characters of the note
# name (letter and one digit), so any "(L)"/"(R)" suffix is dropped from them.
for i1, i2 in analogies:
    n1 = int2note(i1)
    n2 = int2note(i2)
    print(f'{n1}-{n2[0]+str(int(n2[1])-1)}={n1[0]+str(int(n1[1])+1)}-{n2}')