In [None]:
import gzip
import math
import numpy as np
import re
from copy import deepcopy
from sklearn import decomposition
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from numpy.linalg import norm
from tqdm.notebook import tqdm
import torch
import os

### Hyperparameters
# NOTE(review): Step is not referenced in any of the visible cells — confirm it is still needed.
Step = 1
# Dimensionality of the word vectors; the reader, the LDA step and the writer
# defined in the cells below all refer to this value.
WordDim = 300
# Default for read_word_vecs: when True, each vector is rescaled to
# (approximately) unit L2 norm as it is read.
NormRead = False
# NOTE(review): nNorm is not referenced in any of the visible cells — confirm it is still needed.
nNorm = 2

# Restrict this process to the GPU whose PCI-bus index is GPUIdx, then pick
# CUDA when torch reports it as available and fall back to the CPU otherwise.
GPUIdx = "0"
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]=GPUIdx
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
#####
# WordVecKinds = "Extro" # "GloVe", "TestGloVe","Word2Vec", or "FastText" + "Extro"
#####

In [None]:
def read_word_vecs(filename, word_dim=None, normalize=None):
    """Load word vectors from a whitespace-separated text file into a dict.

    The first line of the file is skipped (header). Every following line is
    stripped, lower-cased and split on whitespace: the first token is the
    word, the remaining tokens are parsed as floats. Blank lines are ignored.

    Reading stops at the first line whose number of values differs from
    ``word_dim``; that line is printed and is NOT stored, so every vector in
    the returned dict has exactly ``word_dim`` entries.

    Args:
        filename: Path of the vector file to read.
        word_dim: Expected number of values per word. Defaults to the
            module-level ``WordDim``.
        normalize: If true, divide each vector by ``sqrt(sum(v**2) + 1e-5)``.
            Defaults to the module-level ``NormRead``.

    Returns:
        dict mapping each lower-cased word to a 1-D ``float64`` array. When
        the same lower-cased word occurs more than once, the last one wins.

    Raises:
        ValueError: if a value token cannot be parsed as a float.
    """
    if word_dim is None:
        word_dim = WordDim
    if normalize is None:
        normalize = NormRead

    print("Vectors read from", filename)
    wordVectors = {}
    # The context manager closes the file even if parsing raises.
    with open(filename, 'r') as fileObject:
        fileObject.readline() # For handling First Line
        for line in fileObject:
            line = line.strip().lower()
            tokens = line.split()  # split once instead of once per use
            if not tokens:
                continue  # blank line: nothing to parse
            word, vector = tokens[0], tokens[1:]
            if len(vector) != word_dim:
                # Malformed line: report it and stop, without leaving a
                # wrong-sized placeholder vector behind for this word.
                print(line)
                break
            values = np.array([float(vecVal) for vecVal in vector], dtype=np.float64)
            if normalize:
                # The small epsilon keeps an all-zero vector from dividing by zero.
                values = values / math.sqrt((values ** 2).sum() + 1e-5)
            wordVectors[word] = values
    print("Done . ")
    return wordVectors

isNumber = re.compile(r'\d+')


def norm_word(word): # Could Add Preprocessing
    """Map a token to its normalised form.

    Returns '---num---' when the token contains a digit, '---punc---' when
    nothing is left after removing non-word characters (this includes the
    empty string), and the lower-cased token otherwise.
    """
    lowered = word.lower()
    if isNumber.search(lowered) is not None:
        return '---num---'

    word_chars_only = re.sub(r'\W+', '', word)
    if not word_chars_only:
        return '---punc---'

    return lowered
    
def wordVecsLDA(wordVecs):
    """Project labelled word vectors down to ``WordDim`` dimensions with LDA.

    Every value in ``wordVecs`` is treated as one row whose last entry is a
    class label (cast to int) and whose remaining entries are the features.
    A ``LinearDiscriminantAnalysis`` with ``n_components=WordDim`` is fitted
    on those rows and the transformed rows are returned under the same keys.

    Args:
        wordVecs: dict mapping word -> 1-D array laid out as
            ``[features..., label]``. It is not modified.

    Returns:
        A new dict with the same keys, in the same order, mapping each word
        to its row of the LDA-transformed matrix.
    """
    # Freeze the key order once so rows and words stay aligned. No deep copy
    # is needed: every value in the result is a freshly computed row.
    vocab = list(wordVecs.keys())
    wordVec_np = []

    pbar = tqdm(total = len(vocab))
    for w in vocab:
        pbar.update(1)
        wordVec_np.append(wordVecs[w])
    wordVec_np = np.array(wordVec_np)
    pbar.close()
    print("Run LDA ...")

    pbar = tqdm(total = 1)
    lda = LinearDiscriminantAnalysis(n_components=WordDim)
    print(wordVec_np[:,:-1].shape, wordVec_np[:,-1].shape)
    # Features are all columns but the last; the last column is the class label.
    wordVec_np = lda.fit_transform(wordVec_np[:,:-1], wordVec_np[:,-1].astype('int'))
    print(wordVec_np.shape)
    pbar.update(1)
    pbar.close()

    print("LDA Done ...")
    newWordVecs = {}
    pbar = tqdm(total = len(vocab))
    for i, w in enumerate(vocab):
        pbar.update(1)
        newWordVecs[w] = wordVec_np[i]
    pbar.close()
    return newWordVecs

def self_extrofit(wordVecs, threshold, it, k):
    """Cluster words by cosine similarity and re-project the vectors with LDA.

    Steps:
      1. Stack all vectors into one matrix and keep the ``k`` leading SVD
         components (``U[:, :k] * S[:k]``).
      2. Walk the vocabulary in order. Every word that is still unlabelled
         becomes a pivot; all words at or after it whose cosine similarity to
         the pivot exceeds ``threshold`` receive the pivot's cluster id.
      3. Append the mean of the original vector and the cluster id to each
         original vector and pass the result to ``wordVecsLDA``.

    Args:
        wordVecs: dict mapping word -> 1-D numpy vector. It is not modified.
        threshold: cosine similarity a word must exceed to join a pivot's cluster.
        it: iteration index; only used to print the vocabulary size when 0.
        k: number of leading singular components used for the similarity search.

    Returns:
        The dict returned by ``wordVecsLDA`` for the augmented vectors.
    """
    vocab = list(wordVecs.keys())
    n_words = len(vocab)

    # Stack through NumPy first: building a tensor from a Python list of
    # arrays converts element by element and is very slow for large vocabularies.
    WVs = torch.from_numpy(np.stack([wordVecs[w] for w in vocab])).to(device)
    print("svd")
    # Reduced SVD: only the leading columns of U are used below, and the
    # default full_matrices=True would allocate an n_words x n_words matrix.
    U, S, _ = torch.linalg.svd(WVs, full_matrices=False)
    print("done")
    WVs = U[:, :k] * S[:k]

    # Labels stay on the CPU (as NumPy) so the per-word check in the loop is
    # cheap; indices computed on `device` are moved over before they are used.
    labels = np.full(n_words, -1, dtype=np.int64)
    cos = torch.nn.CosineSimilarity(dim=1)
    ii = 0
    pbar = tqdm(total = n_words)
    for i in range(n_words):
        if labels[i] == -1:
            pivot = WVs[i:i+1]
            full_matrix = WVs[i:]
            cosine = cos(pivot, full_matrix)
            pick = cosine > threshold
            similar_word_idx = torch.nonzero(pick, as_tuple=True)[0] + i # Offset
            # Rows after i that already carry a label from an earlier pivot are
            # reassigned here as well. A pivot whose own similarity does not
            # exceed the threshold keeps the label -1.
            labels[similar_word_idx.cpu().numpy()] = ii
            ii += 1
        pbar.update(1)
    pbar.close()

    if it == 0:
        print(n_words, "words will be extrofitted")

    # Augmented layout expected by wordVecsLDA: [original vector, mean, cluster id].
    newWordVecs = {}
    for i, w in enumerate(vocab):
        vec = wordVecs[w]
        newWordVecs[w] = np.hstack((vec, np.mean(vec), labels[i]))

    ### LDA for dimension reduction
    print("Dimension Reduction ... ")
    newWordVecs = wordVecsLDA(newWordVecs)
    return newWordVecs

def print_word_vecs(wordVectors, outFileName):
    """Write word vectors to a text file, one word per line.

    The first line is ``"<number of words> <vector length>"``. Each following
    line is the word, a space, then every value formatted with ``%.5f`` and
    followed by a space (so each line ends with a trailing space).

    Args:
        wordVectors: dict mapping word -> iterable of numeric values.
        outFileName: Path of the file to create or overwrite.
    """
    print('Writing down the vectors in', outFileName)
    # Report the length of the vectors actually written so the header cannot
    # disagree with the data; fall back to WordDim when there is nothing to inspect.
    if wordVectors:
        dim = len(next(iter(wordVectors.values())))
    else:
        dim = WordDim

    # The context manager closes the file even if a write fails part-way.
    with open(outFileName, 'w') as outFile:
        outFile.write(str(len(wordVectors)) + ' ' + str(dim) + '\n')
        pbar = tqdm(total = len(wordVectors), desc = 'Writing')
        try:
            for word, values in wordVectors.items():
                pbar.update(1)
                outFile.write(word + ' ' + ''.join('%.5f ' % val for val in values) + '\n')
        finally:
            pbar.close()
    

In [None]:
# Locations of the pretrained input vectors and of the extrofitted output file.
PRETRAINED_VEC_PATH = "/mnt/hdd-nfs/Data/PretrainedWV/wiki-news-300d-1M-subword.vec"
EXTROFITTED_VEC_PATH = "/mnt/hdd-nfs/temp/fastTextSelfExtro_threshold95_dim300.txt"

# Read the vectors, run a single self-extrofitting pass, then write the result.
wordVecs = read_word_vecs(PRETRAINED_VEC_PATH)
wordVecs_extro = self_extrofit(
    wordVecs,
    threshold=0.95,
    it=0,
    k=300,
)
print("Ready (>_<)")

print_word_vecs(wordVecs_extro, EXTROFITTED_VEC_PATH)