In [None]:
from sklearn.manifold import TSNE
from model import MyModel
from build import build_dataset
from dataset import PrefixDataset1

import os
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
import argparse

In [None]:
# NOTE(review): this module-level parser is not used by get_args(), which builds
# its own; it is kept only so the name `parser` stays defined for any other cell.
parser = argparse.ArgumentParser()


def _str2bool(value):
    """Parse a command-line token into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', 'off'):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def get_args(argv=None):
    """Build the experiment configuration namespace.

    Args:
        argv: optional list of command-line style tokens to parse. When omitted,
            an empty list is parsed, so every option takes its default value
            (the notebook's own ``sys.argv`` is never consulted).

    Returns:
        argparse.Namespace holding the optimisation, model, datastore,
        contrastive-learning and checkpoint options declared below.
    """
    parser = argparse.ArgumentParser()

    # optimisation
    parser.add_argument('--lr', type=float, default=0.0003, help='Learning rate')
    parser.add_argument('--batch_size', type=int, default=64, help='batch size')
    parser.add_argument('--epochs', type=int, default=30, help='train epochs')
    parser.add_argument('--milestones', type=int, nargs='+', default=[116, 233], help='Milestones')
    parser.add_argument('--gamma', type=float, default=0.1, help='Gamma')
    # parser.add_argument('--optimizer', type=str, default='sgd', help='optimizer')

    # model / datastore
    parser.add_argument('--voc_len',type=int, default=42020, help='voc number')
    parser.add_argument('--embedding_dim',type=int, default=1024, help='embedding size')
    parser.add_argument('--output_dim', type=int, default=64, help="output dim")
    parser.add_argument('--dstore_mmap',type=str, default='/data/zqh/NLP/adaptive-knn-mt/store/datastore/it_finetune')
    parser.add_argument('--dstore_size',type=int, default=3608731, help='datastore size')
    # _str2bool instead of bool so that "--use_cluster False" really disables it.
    parser.add_argument('--use_cluster', type=_str2bool, default=True, help="if use word cluster")
    parser.add_argument('--cluster_type', type=str, default='spectrum', help='cluster type')

    # contrastive learning
    parser.add_argument('--K', type=int, default=200, help='queue size')
    parser.add_argument('--m', type=float, default=0.999, help='momentum')
    parser.add_argument('--class_num', type=int, default=42020, help="class number")

    # save
    parser.add_argument('--save_path', type=str, default='/data/zqh/adaptive-knn-mt/checkpoints/koran', help='save checkpoint dir')

    args = parser.parse_args([] if argv is None else list(argv))
    return args

In [None]:
args = get_args()
dataset= PrefixDataset1(args=args)

from sklearn.cluster import SpectralClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
import math

# Prune the datastore label by label: small labels are kept whole, large labels
# are clustered with KMeans and a random 30% of every cluster is kept.
# NOTE(review): the sampling uses the global NumPy RNG without a seed, so the
# pruned datastore differs between runs.
choice_label = np.arange(args.voc_len)

# Chunks are collected in lists and concatenated once after the loop; calling
# np.concatenate inside the loop re-copies everything gathered so far each time.
key_chunks = []
value_chunks = []

for i in choice_label:
    # Build the (full-dataset) boolean mask once instead of once per array.
    label_mask = dataset.label == i
    embedding = dataset.data[label_mask]
    labels = dataset.label[label_mask]

    # When a label has at most 1000 entries, keep all of its key-value pairs.
    if len(embedding) <= 1000:
        key_chunks.append(embedding)
        value_chunks.append(labels)
        continue

    # sc = SpectralClustering(n_clusters=8, affinity='nearest_neighbors', n_init=3, verbose=3)
    # The number of clusters grows with the natural log of the label's entry count.
    sc = KMeans(n_clusters=int(math.log(len(embedding))) + 1)

    sc.fit(embedding)
    cluster_label = sc.predict(embedding)

    for cls_label in np.unique(cluster_label):
        temp_embedding = embedding[cluster_label==cls_label]
        temp_labels = labels[cluster_label==cls_label]

        # Shuffle the positions and keep the first 30% of them.
        select_index = np.arange(len(temp_embedding))
        np.random.shuffle(select_index)
        select_index = select_index[:int(0.3 * len(temp_embedding))]

        # np.int was removed in NumPy 1.24; np.int64 is what it resolved to on
        # 64-bit Linux and keeps the on-disk value width explicit.
        key_chunks.append(np.array(temp_embedding[select_index], dtype=np.float16))
        value_chunks.append(np.array(temp_labels[select_index], dtype=np.int64))

new_keys = np.concatenate(key_chunks)
new_values = np.concatenate(value_chunks)

# NOTE(review): np.memmap writes raw bytes without a .npy header despite the
# file extension, and "new_kyes" looks like a typo for "new_keys". The paths are
# left untouched because other code may already read these exact files.
dstore_key = np.memmap("./new_kyes.npy", dtype=np.float16, mode="w+",
shape=new_keys.shape)
dstore_values = np.memmap("./new_values.npy", dtype=np.int64, mode="w+",
shape=new_values.shape)

dstore_key[:,:] = new_keys
dstore_values[:] = new_values

# Push the data to disk now; otherwise it is only written when the memmap
# objects are garbage-collected, which may never happen in a live kernel.
dstore_key.flush()
dstore_values.flush()


In [None]:
# Display the shape of the pruned key array built by the pruning cell.
new_keys.shape
# Previously observed value on the IT domain: 59132
# (presumably the number of retained entries — TODO confirm).

In [None]:
# Keep the unfiltered arrays left over from the last iteration of the pruning
# loop; the KNN cell later fits on these.
# NOTE(review): this cell relies on `embedding`, `labels` and `cluster_label`
# leaking out of that loop and rebinds them, so running it twice makes
# src_embedding / src_labels point at the already-filtered arrays.
src_embedding = embedding
src_labels = labels

# Drop points whose cluster id is -1 (presumably the noise label of a
# density-based clusterer such as the DBSCAN imported earlier; KMeans never
# emits it). `labels` is filtered with the same mask so that it stays aligned
# with `embedding` and `cluster_label` in the cells that index them together.
keep_mask = cluster_label != -1
embedding = np.array(embedding[keep_mask])
labels = np.array(labels[keep_mask])
cluster_label = np.array(cluster_label[keep_mask])

In [None]:
# Rebuild the default argument namespace and reload the prefix dataset.
# The pruning cell earlier in this notebook runs the same two statements, and a
# later cell rebinds `dataset` to an EmbeddingDataset, so this cell restores the
# PrefixDataset1 instance that the frequency plot reads from.
args = get_args()
dataset= PrefixDataset1(args=args)

In [None]:
# Count how many datastore entries carry each vocabulary id. np.bincount does
# this in one vectorised pass instead of a Python-level loop over every label;
# the result is converted back to a list so `frequency_vals` keeps its type.
frequency_vals = np.bincount(
    np.asarray(dataset.label, dtype=np.int64), minlength=args.voc_len
).tolist()

# The first 100 vocabulary ids are left out of the plot.
plt.plot(np.arange(100, args.voc_len), frequency_vals[100:])
plt.xlabel("vocabulary id")
plt.ylabel("datastore entries")
plt.title("Datastore entries per vocabulary id (ids >= 100)")
plt.show()

# Average number of datastore entries per vocabulary id.
print(np.float32(sum(frequency_vals)/args.voc_len))

In [None]:
# Project the current `embedding` array down to two dimensions with t-SNE and
# draw it with one colour per cluster id.
tsne = TSNE(n_components=2)
tsne_embedding = tsne.fit_transform(embedding)

plt.scatter(
    x=tsne_embedding[:, 0],
    y=tsne_embedding[:, 1],
    c=cluster_label,
)
plt.show()

In [None]:
# Same 2-D t-SNE projection as the previous plot, coloured by `labels` instead
# of by cluster id.
plt.scatter(
    x=tsne_embedding[:, 0],
    y=tsne_embedding[:, 1],
    c=labels,
)
plt.show()

In [None]:
from torch.utils.data import Dataset

class EmbeddingDataset(Dataset):
    """Map-style dataset pairing each embedding vector with its cluster id.

    Args:
        args: configuration namespace; stored on the instance as ``self.args``.
        data: optional array-like of embeddings. When omitted, the notebook
            global ``embedding`` is used (the original behaviour).
        labels: optional array-like of cluster ids, one per row of ``data``.
            When omitted, the notebook global ``cluster_label`` is used.

    Raises:
        ValueError: if ``data`` and ``labels`` do not have the same length.
    """

    def __init__(self, args, data=None, labels=None):
        super().__init__()
        self.args = args

        # Fall back to the notebook globals so EmbeddingDataset(args) keeps working.
        if data is None:
            data = embedding
        if labels is None:
            labels = cluster_label

        self.data = np.array(data)
        self.labels = np.array(labels)

        # Fail early rather than with an IndexError in the middle of an epoch.
        if len(self.data) != len(self.labels):
            raise ValueError(
                f"data and labels must have the same length, "
                f"got {len(self.data)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of (embedding, label) pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(embedding, label)`` pair stored at ``index``."""
        return self.data[index], self.labels[index]

In [None]:
from torch.utils.data import DataLoader

# Train MyModel to predict the cluster id of each embedding.
mymodel = MyModel(args).cuda()

# NOTE: this rebinds `dataset`, which earlier cells use for the PrefixDataset1 instance.
dataset = EmbeddingDataset(args)

dataloader = DataLoader(
    dataset = dataset,
    batch_size = args.batch_size,
    shuffle = True
)

optimizer = torch.optim.SGD(mymodel.parameters(), args.lr, 
                                         momentum=0.9, nesterov=True,
                                         weight_decay=0.0004)
# self.optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.milestones, gamma=args.gamma)


for epoch in range(args.epochs):
    for i, (x, label) in enumerate(dataloader):
        x = x.cuda()
        label = label.cuda().long()

        logits, loss = mymodel.fc_encode(x, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the accuracy of the current batch every 50 steps; the argmax is
        # only needed for this report, so it is computed here.
        if i % 50 == 0:
            predictions = logits.argmax(dim=-1, keepdim=True)
            batch_correct = predictions.eq(label.view_as(predictions)).sum().item()
            acc = batch_correct / x.shape[0]

            # .item() prints the scalar loss value instead of the tensor repr.
            print(f"Train epoch: {epoch} loss: {loss.item()} acc: {acc}")

    # Advance the LR schedule once per epoch, after the optimizer steps. The
    # scheduler was previously created but never stepped, so the milestones had
    # no effect.
    # NOTE(review): with the defaults (milestones 116/233, 30 epochs) no
    # milestone is reached — confirm the milestones are meant as epoch indices.
    scheduler.step()

In [None]:
# cluster_label = np.array(cluster_label)
# index = np.where(cluster_label==8)[0]
# temp_labels = labels[index]
# print("cluster embedding nums:", len(temp_labels))
# print(np.unique(temp_labels))

# For every cluster id, compute the share of its members that carry each
# vocabulary label: cluster_embedding_num[c] is a float array of length
# args.voc_len whose entries sum to 1.
cluster_embedding_num = {}
# Visit each distinct cluster id once. Looping over every element of
# cluster_label recomputed the same histogram once per sample.
for c_label in np.unique(cluster_label):
    index = np.flatnonzero(cluster_label == c_label)
    label = labels[index]
    # Vectorised histogram of the member labels, padded to the vocabulary size.
    counts = np.bincount(label, minlength=args.voc_len).astype(np.float64)
    # Keyed by a plain int so lookups with tensor.item() hit the same entry.
    cluster_embedding_num[int(c_label)] = counts / len(label)

    


In [None]:
from enum import unique


# Debug output: the shapes of the two aligned arrays, then every cluster id on
# its own line.
for inspected in (embedding, cluster_label):
    print(inspected.shape)

for cluster_id in cluster_label:
    print(cluster_id)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit an 8-nearest-neighbour classifier on the unfiltered embeddings and their
# labels, then classify the first filtered embedding.
KNN = KNeighborsClassifier(n_neighbors=8)
KNN.fit(src_embedding, src_labels)

first_sample = embedding[0:1]
result = KNN.predict(first_sample)
result_prob = KNN.predict_proba(first_sample)

# Predicted label, the class probabilities, and the shape of that probability array.
for shown in (result, result_prob, result_prob.shape):
    print(shown)

In [None]:
# Run the trained model on the first embedding, then print the non-zero entries
# of the label distribution stored for the cluster it predicts.
first_sample = embedding[0:1]
first_cluster = cluster_label[0:1]
temp_embedding = torch.tensor(first_sample).cuda()
temp_label = torch.tensor(first_cluster).cuda().long()

result, loss = mymodel.fc_encode(temp_embedding, temp_label)
print(result)

predictions = result.argmax(dim=-1, keepdim=True)
predicted_cluster = predictions[0][0].item()
for share in cluster_embedding_num[predicted_cluster]:
    # Skip vocabulary labels that never occur in the predicted cluster.
    if share == 0:
        continue
    print(share)