# Comparison of different word embedding techniques

In [1]:
import sys
sys.path.append('../')
import logging
import importlib
# importlib.reload(logging)
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim.models.word2vec import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [10]:
def load_model(path):
    """Load the embedding model stored at ``path``.

    Delegates to ``Word2Vec.load_word2vec_format`` with its default
    arguments and returns the object that call produces.
    """
    # NOTE(review): newer gensim releases expose this loader on KeyedVectors
    # instead of Word2Vec — confirm against the installed gensim version.
    model = Word2Vec.load_word2vec_format(path)
    return model

# Section names that are pooled into the "semantic" score. They are matched
# verbatim against `res['section']` of the evaluation results (see `acc`).
sem_cats = [
    'Politiker,Partei',
    'Gotteshaus,Religion',
    'Werk,Erzeuger',
    'Wissenschaftsbereich,Berufsbezeichnung',
    'Land,Währung',
    'Hauptstadt,Land',
    'Bundesland,Landeshauptstadt',
    'Land,Kontinent',
    'Technik - Produkte',
    'opposite',
    'Land,Sprache/Adjektiv'
]
# Section names that are pooled into the "syntactic" score. Because matching
# is by exact string, spellings such as 'signular,plural' must stay exactly as
# the evaluation reports them (presumably copied from the questions file).
syn_cats = [
    'Geschlecht',
    'signular,plural',
    'adjectives - Positiv, Komparativ',
    'adjectives - Positiv, Superlativ',
    'verbs - 3.P.Sg., 1.P.Sg.',
    'verbs - 3.P.Sg., 1.P.Pl.',
    'verbs - 3.P.Sg., Sg.Vg.',
    'verbs - 3.P.Sg., Pl.Vg.',
    'verbs - 3.P.Sg., Partizip'
]

def acc(results, cats):
    """Pool the accuracy of all sections whose name is listed in ``cats``.

    Args:
        results: iterable of dicts with the keys 'section', 'correct' and
            'incorrect'; the latter two are sized collections.
        cats: collection of section names to include.

    Returns:
        Tuple ``(accuracy_percent, n_correct, n_incorrect)``. The accuracy is
        0 when no question was counted.
    """
    n_correct = 0
    n_incorrect = 0
    for section in results:
        if section['section'] not in cats:
            continue
        n_correct += len(section['correct'])
        n_incorrect += len(section['incorrect'])

    answered = n_correct + n_incorrect
    if answered == 0:
        return 0, n_correct, n_incorrect
    return 100 * n_correct / answered, n_correct, n_incorrect

def acc_cat(results, cat):
    """Return ``acc`` restricted to the single section named ``cat``."""
    only_this_section = [cat]
    return acc(results, only_this_section)

def mean(nums):
    """Return the arithmetic mean of ``nums``.

    An empty sequence yields 0 instead of raising ZeroDivisionError, which
    mirrors how ``acc`` reports 0 when there is nothing to count.
    """
    if len(nums) == 0:
        return 0
    return sum(nums) / len(nums)

def nb_questions(questions_file):
    """Count the analogy questions contained in ``questions_file``.

    Lines starting with ': ' are section headers and blank lines carry no
    question, so neither is counted. Counting them would inflate the
    denominator of the out-of-vocabulary share computed by the caller.

    The file is read in binary mode so that counting does not depend on the
    platform's default text encoding, and the handle is closed on return.
    """
    with open(questions_file, 'rb') as handle:
        return sum(
            1
            for line in handle
            if line.strip() and not line.startswith(b': ')
        )

def accuracy(path, questions_file):
    """Evaluate the model stored at ``path`` on the analogy questions file.

    Prints, in this order: the accuracy of every section returned by the
    model's ``accuracy`` call, the out-of-vocabulary share, the pooled
    semantic / syntactic / total accuracy, and the unweighted mean of the
    per-section accuracies.

    Args:
        path: location of the vectors, passed to ``load_model``.
        questions_file: path of the questions file; it is handed to the
            model for evaluation and to ``nb_questions`` for the oov share.

    Returns:
        Two parallel lists ``(headers, values)``. ``headers`` is
        ``['oov', <section names...>, 'semantic', 'syntactic']`` and
        ``values`` holds the matching percentages.
    """
    model = load_model(path)
    results = model.accuracy(questions_file)

    cols = []
    all_accs = []

    for res in results:
        name = res['section']
        corr = len(res["correct"])
        incorr = len(res['incorrect'])
        count = corr + incorr
        _acc = 100 * corr / count if count != 0 else 0
        cols.append(name)
        all_accs.append(_acc)
        print('{:s}: {:.2f}% ({:d}/{:d})'.format(name, _acc, corr, count))

    sem_acc, sem_corr, sem_incorr = acc(results, sem_cats)
    syn_acc, syn_corr, syn_incorr = acc(results, syn_cats)
    total_acc, total_corr, total_incorr = acc(results, sem_cats + syn_cats)

    # Share of questions that were not evaluated at all. An empty questions
    # file would otherwise divide by zero after the whole evaluation has run.
    total_questions = nb_questions(questions_file)
    if total_questions:
        oov = (1 - (total_corr + total_incorr) / total_questions) * 100
    else:
        oov = 0
    print("")
    print("oov: {:.2f}%".format(oov))
    print("")
    print('Semantic: {:d}/{:d}, Accuracy: {:.2f}%'.format(sem_corr, sem_corr + sem_incorr, sem_acc))
    print('Syntactic: {:d}/{:d}, Accuracy: {:.2f}%'.format(syn_corr, syn_corr + syn_incorr, syn_acc))
    print('Total: {:d}/{:d}, Accuracy: {:.2f}%'.format(total_corr, total_corr + total_incorr, total_acc))

    sem_accs = [acc_cat(results, r["section"])[0] for r in results if r["section"] in sem_cats]
    syn_accs = [acc_cat(results, r["section"])[0] for r in results if r["section"] in syn_cats]
    all_cat_accs = sem_accs + syn_accs

    # A questions file may lack semantic or syntactic sections entirely;
    # report 0 for that group instead of dividing by zero.
    sem_mean = mean(sem_accs) if sem_accs else 0
    syn_mean = mean(syn_accs) if syn_accs else 0
    total_mean = mean(all_cat_accs) if all_cat_accs else 0

    print("")
    print("Mean Accuracies:")
    print("Semantic: {:.2f}%".format(sem_mean))
    print("Syntactic: {:.2f}%".format(syn_mean))
    print("Total: {:.2f}%".format(total_mean))
    return ['oov'] + cols + ['semantic', 'syntactic'], [oov] + all_accs + [sem_acc, syn_acc]

## Word2Vec

### Eval dataset

In [11]:
# Score the gensim eval.vec model on question-words.txt. `heads` holds the
# column labels that the comparison tables further down reuse.
heads, w2v_eval_sampled_accs = accuracy('../../models/gensim/eval.vec', '../evaluation/question-words.txt')

Politiker,Partei: 0.00% (0/2)
Gotteshaus,Religion: 0.00% (0/0)
Werk,Erzeuger: 0.00% (0/0)
Wissenschaftsbereich,Berufsbezeichnung: 0.00% (0/0)
Land,Währung: 0.00% (0/6)
Hauptstadt,Land: 0.00% (0/110)
Bundesland,Landeshauptstadt: 0.00% (0/0)
Land,Kontinent: 0.00% (0/2)
Land,Sprache/Adjektiv: 0.00% (0/0)
Technik - Produkte: 0.00% (0/6)
Geschlecht: 0.00% (0/30)
signular,plural: 0.11% (4/3540)
opposite: 0.00% (0/702)
adjectives - Positiv, Komparativ: 0.00% (0/342)
adjectives - Positiv, Superlativ: 0.00% (0/12)
verbs - 3.P.Sg., 1.P.Sg.: 0.00% (0/380)
verbs - 3.P.Sg., 1.P.Pl.: 0.09% (3/3192)
verbs - 3.P.Sg., Sg.Vg.: 0.53% (7/1332)
verbs - 3.P.Sg., Pl.Vg.: 0.00% (0/182)
verbs - 3.P.Sg., Partizip: 0.00% (0/992)
total: 0.13% (14/10830)

Semantic: 0/828, Accuracy: 0.00%
Syntactic: 14/10002, Accuracy: 0.14%
Total: 14/10830, Accuracy: 0.13%
[0.0, 0, 0, 0, 0.0, 0.0, 0, 0.0, 0, 0.0, 0.0]
[0.0, 0.11299435028248588, 0.0, 0.0, 0.0, 0.09398496240601503, 0.5255255255255256, 0.0, 0.0]

Mean Accuracies:
Sem

In [None]:
# Same gensim eval.vec model, scored on the larger question-words-large.txt.
heads, w2v_eval_complete_accs = accuracy('../../models/gensim/eval.vec', '../evaluation/question-words-large.txt')

### Leipzig dataset

In [None]:
# Score the gensim biggy.vec model on question-words.txt.
heads, w2v_leipzig_sampled_accs = accuracy('../../models/gensim/biggy.vec', '../evaluation/question-words.txt')

In [None]:
# Same gensim biggy.vec model, scored on the larger question-words-large.txt.
heads, w2v_leipzig_complete_accs = accuracy('../../models/gensim/biggy.vec', '../evaluation/question-words-large.txt')

## Dependency based Word Embeddings

### Eval dataset

In [None]:
# Score the word2vecf eval.vec model on question-words.txt; the returned
# column labels are discarded here.
_, depsbased_eval_sampled_accs = accuracy('../../models/word2vecf/eval.vec', '../evaluation/question-words.txt')

In [None]:
# Same word2vecf eval.vec model, scored on the larger question-words-large.txt.
_, depsbased_eval_complete_accs = accuracy('../../models/word2vecf/eval.vec',  '../evaluation/question-words-large.txt')

### Leipzig dataset

In [None]:
# Score the word2vecf leipzig.vec model on question-words.txt. The Leipzig
# comparison table reads `depsbased_leipzig_sampled_accs`; this cell used to
# repeat the eval-corpus run and left that name undefined.
_, depsbased_leipzig_sampled_accs = accuracy('../../models/word2vecf/leipzig.vec',  '../evaluation/question-words.txt')

In [None]:
# Score the word2vecf leipzig.clean.vec model on question-words.txt.
_, depsbased_leipzig_clean_sampled_accs = accuracy('../../models/word2vecf/leipzig.clean.vec',  '../evaluation/question-words.txt')

In [None]:
# Score the word2vecf leipzig.vec model on the larger question-words-large.txt.
_, depsbased_leipzig_complete_accs = accuracy('../../models/word2vecf/leipzig.vec',  '../evaluation/question-words-large.txt')

In [None]:
# Score the word2vecf leipzig.clean.vec model on the larger question-words-large.txt.
_, depsbased_leipzig_clean_complete_accs = accuracy('../../models/word2vecf/leipzig.clean.vec',  '../evaluation/question-words-large.txt')

## Fasttext - Enriching Word Vectors with Subword Information

### Eval dataset

In [15]:
# Score the fastText eval.vec model on question-words.txt.
heads, fasttext_eval_sampled_accs = accuracy('../../models/fasttext/eval.vec', '../evaluation/question-words.txt')

2017-02-21 18:00:53,478 : INFO : loading projection weights from ../../fastText/leipzig-small.vec
2017-02-21 18:00:54,505 : INFO : loaded (3942, 300) matrix from ../../fastText/leipzig-small.vec
2017-02-21 18:00:54,510 : INFO : precomputing L2-norms of word weight vectors
2017-02-21 18:00:54,521 : INFO : Politiker,Partei: 0.0% (0/2)
2017-02-21 18:00:54,527 : INFO : Land,Währung: 16.7% (1/6)
2017-02-21 18:00:54,626 : INFO : Hauptstadt,Land: 0.0% (0/110)
2017-02-21 18:00:54,629 : INFO : Land,Kontinent: 0.0% (0/2)
2017-02-21 18:00:54,634 : INFO : Technik - Produkte: 0.0% (0/6)
2017-02-21 18:00:54,667 : INFO : Geschlecht: 0.0% (0/30)
2017-02-21 18:00:56,450 : INFO : signular,plural: 2.7% (94/3540)
2017-02-21 18:00:56,858 : INFO : opposite: 0.1% (1/702)
2017-02-21 18:00:57,094 : INFO : adjectives - Positiv, Komparativ: 1.2% (4/342)
2017-02-21 18:00:57,160 : INFO : adjectives - Positiv, Superlativ: 0.0% (0/12)
2017-02-21 18:00:57,405 : INFO : verbs - 3.P.Sg., 1.P.Sg.: 1.8% (7/380)
2017-02-21

Semantic: 2/828, Accuracy: 0.24%
Syntactic: 336/10002, Accuracy: 3.36%
Total: 676/21660, Accuracy: 3.12%


In [None]:
# Same fastText eval.vec model, scored on the larger question-words-large.txt.
heads, fasttext_eval_complete_accs = accuracy('../../models/fasttext/eval.vec', '../evaluation/question-words-large.txt')

### Leipzig dataset

In [None]:
# Score the fastText leipzig.vec model on question-words.txt.
heads, fasttext_leipzig_sampled_accs = accuracy('../../models/fasttext/leipzig.vec', '../evaluation/question-words.txt')

In [None]:
# Same fastText leipzig.vec model, scored on the larger question-words-large.txt.
heads, fasttext_leipzig_complete_accs = accuracy('../../models/fasttext/leipzig.vec', '../evaluation/question-words-large.txt')

## Comparison

In [None]:
from IPython.display import display
def compare(df):
    """Render ``df`` as a table followed by an annotated heatmap.

    The figure grows with the frame: half an inch per column plus a fixed
    3-inch margin in width, and half an inch per row in height. Cell values
    are annotated with one decimal place.
    """
    display(df)
    n_rows, n_cols = df.shape
    fig_size = (n_cols * 0.5 + 3, n_rows * 0.5)
    plt.figure(figsize=fig_size)
    sns.heatmap(df, fmt=".1f", annot=True)
    plt.show()

### Leipzig dataset

In [None]:
# One row per embedding method; the column labels come from the last
# `accuracy` call that assigned `heads`.
index = ["word2vec", "deps-based", "deps-based-clean", "fasttext"]
columns = heads

# Rows are listed in the same order as `index`.
data_sampled = [
    w2v_leipzig_sampled_accs,
    depsbased_leipzig_sampled_accs,
    depsbased_leipzig_clean_sampled_accs,
    fasttext_leipzig_sampled_accs,
]
data_complete = [
    w2v_leipzig_complete_accs,
    depsbased_leipzig_complete_accs,
    depsbased_leipzig_clean_complete_accs,
    fasttext_leipzig_complete_accs,
]

leipzig_df_sampled, leipzig_df_complete = [
    pd.DataFrame(data=rows, columns=columns, index=index)
    for rows in (data_sampled, data_complete)
]

# Alternative (presumably tables saved from an earlier run):
# leipzig_df_sampled = pd.read_csv('accs-sampled-questions.csv', index_col=0)
# leipzig_df_complete = pd.read_csv('accs-all-questions.csv', index_col=0)

In [None]:
# Table and heatmap of the Leipzig accuracies on question-words.txt.
compare(leipzig_df_sampled)

In [None]:
# Table and heatmap of the Leipzig accuracies on question-words-large.txt.
compare(leipzig_df_complete)

### Eval dataset

In [None]:
# One row per embedding method trained on the eval corpus; the column labels
# come from the last `accuracy` call that assigned `heads`.
index = ["word2vec", "dependency-based", "fasttext"]
columns = heads

# Rows are listed in the same order as `index`.
data_sampled = [w2v_eval_sampled_accs, depsbased_eval_sampled_accs, fasttext_eval_sampled_accs]
data_complete = [w2v_eval_complete_accs, depsbased_eval_complete_accs, fasttext_eval_complete_accs]

eval_df_sampled = pd.DataFrame(data=data_sampled, columns=columns, index=index)
# Built here because a later cell calls `compare(eval_df_complete)`; with this
# line commented out that cell raised NameError on a fresh kernel.
eval_df_complete = pd.DataFrame(data=data_complete, columns=columns, index=index)

# Alternative (presumably tables saved from an earlier run):
# eval_df_sampled = pd.read_csv('accs-sampled-questions.csv', index_col=0)
# eval_df_complete = pd.read_csv('accs-all-questions.csv', index_col=0)

In [None]:
# Table and heatmap of the eval-corpus accuracies on question-words.txt.
compare(eval_df_sampled)

In [None]:
# Table and heatmap of the eval-corpus accuracies on question-words-large.txt.
compare(eval_df_complete)