# Implement of Extrofitting

In [1]:
from __future__ import print_function
import math
import numpy as np
import re
from copy import deepcopy
from tqdm import tqdm_notebook
from sklearn import decomposition
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

### Hyperparameter
WordDim = 300
ExpandDim = 1
NormRead = True
ExpandNorm = True
#####

# Algorithms

In [11]:
def read_word_vecs(filename):
    print("Vectors read from", filename)
    wordVectors = {}
    fileObject = open(filename, 'r')
    fileObject.readline() # For handling First Line
    for line in fileObject:
        line = line.strip().lower()
        word = line.split()[0]
        wordVectors[word] = np.zeros(len(line.split())-1, dtype=np.float64)
        vector = line.split()[1:]
        if len(vector) == WordDim:
            for index, vecVal in enumerate(vector):
                wordVectors[word][index] = float(vecVal)
            if NormRead:
                wordVectors[word] = wordVectors[word] / math.sqrt((wordVectors[word]**2).sum() + 1e-5)
    return wordVectors

isNumber = re.compile(r'\d+')
def norm_word(word): # Could Add Preprocessing
    if isNumber.search(word.lower()):
        return '---num---'
    elif re.sub(r'\W+', '', word) == '':
        return '---punc---'
    else:
        return word.lower()
    
def read_lexicon(filename, wordVecs):
    lexicon = {}
    for line in open(filename, 'r'):
        words = line.lower().strip().split()
        lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]]
    return lexicon

def wordVecsLDA(wordVecs):
    newWordVecs = deepcopy(wordVecs)
    wordVec_np = []
    
    for k in newWordVecs.keys():
        wordVec_np.append(newWordVecs[k])
    wordVec_np = np.array(wordVec_np)
    
    if ExpandNorm:
        wordVec_np[:,-ExpandDim:-1] \
        = wordVec_np[:,-ExpandDim:-1] / np.sqrt(np.sum(wordVec_np[:,-ExpandDim:-1]**2, axis=0) + 1e-5)

    lda = LinearDiscriminantAnalysis(n_components=WordDim)
    wordVec_np = lda.fit_transform(wordVec_np[:,:-1], wordVec_np[:,-1])
        
    for i, k in enumerate(newWordVecs.keys()):
        newWordVecs[k] = wordVec_np[i]
    return newWordVecs

def extrofit(wordVecs, lexicon, it):
    newWordVecs = deepcopy(wordVecs)
    wvVocab = set(newWordVecs.keys())
    loopVocab = wvVocab.intersection(set(lexicon.keys()))
    if it == 0: print(len(loopVocab), "words will be extrofitted")
    wordidx = 0
    for w in wvVocab:
        for i in range(ExpandDim):
            newWordVecs[w] = np.hstack((newWordVecs[w], np.mean(wordVecs[w])))
        newWordVecs[w] = np.hstack((newWordVecs[w], np.zeros(1)))

    for word in wvVocab:
        wordidx = wordidx+1
        try:
            wordNeighbours = set(lexicon[word]).intersection(wvVocab)
            numNeighbours = len(wordNeighbours)
        except KeyError: numNeighbours = 0

        if numNeighbours == 0:
            newWordVecs[word][-1] = wordidx
        else:
            for w in wordNeighbours:
                newWordVecs[w][-1] = wordidx
    print("Dimension Reduction ... ")
    newWordVecs = wordVecsLDA(newWordVecs)
        
    return newWordVecs

def print_word_vecs(wordVectors, outFileName):
    print('Writing down the vectors in', outFileName)
    outFile = open(outFileName, 'w')
    outFile.write(str(len(wordVectors)) + ' ' + str(WordDim) + '\n')
    pbar = tqdm_notebook(total = len(wordVectors), desc = 'Writing')
    for word, values in wordVectors.iteritems():
        pbar.update(1)
        outFile.write(word+' ')
        for val in wordVectors[word]:
            outFile.write('%.5f' %(val)+' ')
        outFile.write('\n')
    outFile.close()
    pbar.close()

# Run

In [10]:
### Loading Pretrained Word Vector
wordVecs = read_word_vecs("./word_vectors/glove.txt")

### Select Lexicon
lexicon = read_lexicon('./lexicons/ppdb-xl.txt', wordVecs)
# lexicon = read_lexicon("./lexicons/wordnet-synonyms.txt", wordVecs)
# lexicon = read_lexicon("./lexicons/wordnet-synonyms+.txt", wordVecs)
# lexicon = read_lexicon("./lexicons/framenet.txt", wordVecs)

### Run
wordVecs_extro = extrofit(wordVecs, lexicon, 0)
print("Ready (>_<)")

Vectors read from ./word_vectors/glove.txt


UnicodeDecodeError: 'cp949' codec can't decode byte 0xcf in position 6768: illegal multibyte sequence

# Word Similarity Tasks (MEN-3000)

In [7]:
import operator
import scipy.stats
    
""" MEN-3k """
fp_men = open("./MEN_dataset_natural_form_full.txt", 'r')
fp_men_ = fp_men.read().split('\n')

data_men = []
for row in fp_men_: data_men.append(row.split(' '))
data_men.pop() # Removing empty list
data_men = np.array(data_men)
fp_men.close()

word_to_idx_men = {}

idx = 0
for w in data_men[:,0]:
    try: word_to_idx_men[w]
    except KeyError:
        word_to_idx_men[w] = idx
        idx = idx+1
for w in data_men[:,1]:
    try: word_to_idx_men[w]
    except KeyError:
        word_to_idx_men[w] = idx
        idx = idx+1

word_to_idx_men = sorted(word_to_idx_men.items(), key=operator.itemgetter(1))

""" Calculating Similarity """
def LoadInputVector(wordvec, data, lookup):
    input1, input2 = [], []
    unk_cnt = 0
    for i in range(len(data[:,0])):
        try: input1.append(wordvec[data[i,0]])
        except KeyError: input1.append(np.random.normal(0., 1., WordDim)); unk_cnt = unk_cnt+1
        try: input2.append(wordvec[data[i,1]])
        except KeyError: input2.append(np.random.normal(0., 1., WordDim)); unk_cnt = unk_cnt+1
    return np.array(input1), np.array(input2)

def Evaluating_MEN(wordvec, data, lookup):
    input1, input2 = LoadInputVector(wordvec, data, lookup)
    output = []
    epsilon = 1e-5
    for i in range(len(input1)):
        output.append(np.dot(input1[i], input2[i])/(np.linalg.norm(input1[i])*np.linalg.norm(input2[i])))
    output = (np.array(output)).reshape(-1)
    return round(scipy.stats.spearmanr(output, np.array(data[:,2], dtype=float))[0], 4)

lookup_men = dict(word_to_idx_men)

In [8]:
print('<MEN-3k Dataset>')
print("Original :", Evaluating_MEN(wordVecs, data_men, lookup_men))
print("Extrofit :", Evaluating_MEN(wordVecs_extro, data_men, lookup_men))

print_word_vecs(wordVecs_extro, 'SimpleGloveExtrofit_Dim' + str(ExpandDim) + '.txt')

<MEN-3k Dataset>


NameError: name 'wordVecs' is not defined