In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import logging
from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
import gensim
# Need the interactive Tools for Matplotlib
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt

In [3]:
model = gensim.models.KeyedVectors.load_word2vec_format('entire_corpora.model.bin', binary=True)

2018-03-28 02:46:20,591 : INFO : loading projection weights from entire_corpora.model.bin
2018-03-28 02:46:34,587 : INFO : loaded (108813, 200) matrix from entire_corpora.model.bin


In [3]:
def most_similar(model, a, b, c, topn=10, topn2=10):
    nearest = model.wv.most_similar(positive=[a, b], negative=[c], topn=topn)
    output = []
    for d, score in nearest:
        cumul_score = score

        other_nearest = model.wv.most_similar(positive=[a, b], negative=[d], topn=topn2)
        for e, other_score in other_nearest:
            if c == e:
                cumul_score = cumul_score + other_score
                break

        other_nearest = model.wv.most_similar(
            positive=[c, d], negative=[a], topn=topn2)
        for e, other_score in other_nearest:
            if b == e:
                cumul_score = cumul_score + other_score
                break

        other_nearest = model.wv.most_similar(
            positive=[c, d], negative=[b], topn=topn2)
        for e, other_score in other_nearest:
            if a == e:
                cumul_score = cumul_score + other_score
                break

        output.append((d, cumul_score))

    return output

In [1]:
most_similar(model,'babae','tito','lalaki')

NameError: name 'most_similar' is not defined

In [15]:
model.wv.most_similar(positive=['prinsipe', 'dalaga'], negative=['binata'])

[('prinsesa', 0.6678964495658875),
 ('diwata', 0.6029185652732849),
 ('dalagita', 0.6027273535728455),
 ('oso', 0.5654577016830444),
 ('Birhen', 0.5585415363311768),
 ('binatilyo', 0.5436294078826904),
 ('tupa', 0.5353958010673523),
 ('tango', 0.5343475937843323),
 ('Apo', 0.5315272212028503),
 ('Kabalyero', 0.5307762622833252)]

In [18]:
model.wv.accuracy('output.gender.csv')

2018-03-28 03:02:50,399 : INFO : gender: 57.1% (156/273)
2018-03-28 03:02:50,401 : INFO : total: 57.1% (156/273)


[{'correct': [('PRINSESA', 'DALAGA', 'PRINSIPE', 'BINATA'),
   ('HIPAG', 'DALAGA', 'BAYAW', 'BINATA'),
   ('NINANG', 'REYNA', 'NINONG', 'HARI'),
   ('NANAY', 'LOLA', 'TATAY', 'LOLO'),
   ('INA', 'LOLA', 'AMA', 'LOLO'),
   ('BABAE', 'DALAGA', 'LALAKI', 'BINATA'),
   ('LOLA', 'BABAE', 'LOLO', 'LALAKI'),
   ('BABAE', 'PRINSESA', 'LALAKI', 'PRINSIPE'),
   ('PRINSESA', 'DALAGA', 'PRINSIPE', 'BINATA'),
   ('BABAE', 'PRINSESA', 'LALAKI', 'PRINSIPE'),
   ('NANAY', 'BABAE', 'TATAY', 'LALAKI'),
   ('NANAY', 'KUYA', 'TATAY', 'ATE'),
   ('NANAY', 'DALAGA', 'TATAY', 'BINATA'),
   ('INA', 'REYNA', 'AMA', 'HARI'),
   ('KUYA', 'BABAE', 'ATE', 'LALAKI'),
   ('TIYA', 'BABAE', 'TIYO', 'LALAKI'),
   ('LOLA', 'DALAGA', 'LOLO', 'BINATA'),
   ('INA', 'TIYA', 'AMA', 'TIYO'),
   ('KUYA', 'DALAGA', 'ATE', 'BINATA'),
   ('HIPAG', 'MAMA', 'BAYAW', 'PAPA'),
   ('BABAE', 'DALAGA', 'LALAKI', 'BINATA'),
   ('PRINSESA', 'REYNA', 'PRINSIPE', 'HARI'),
   ('REYNA', 'DALAGA', 'HARI', 'BINATA'),
   ('TITA', 'BABAE', 'TITO'

In [19]:
# Returns {analogy_header:(a, b, c, d)[]}, i.e a map of lists of tuples containing 4 words
def read_analogy_file(analogy_file):
    analogies = {}
    current_topic = "default"
    analogies[current_topic] = []
    for line in analogy_file.readlines():
        if line.startswith(": "):
            current_topic = line[2:]
            analogies[current_topic] = []
        else:
            a, b, c, d = line.split()
            analogies[current_topic].append((a, b, c, d))
    return analogies


# Returns {analogy_header:float[]}, i.e. a map of accuracy scores
def accuracy(model, analogy_file, topn=5):
    analogies = read_analogy_file(analogy_file)
    scores = {}
    for header in analogies:

        tasks = analogies[header]

        # Initialize list containing scores for topn
        topn_scores = [0] * topn

        for task in tasks:

            # Retrieve the words
            a = task[0]
            b = task[1]
            c = task[2]
            d = task[3]
            # --------------------------------
            # WARNING WARNING WARNING WARNING
            # WARNING WARNING WARNING WARNING
            # Pa-ayos kung tama ung arrangement
            # --------------------------------
            print(a," is to ", b, "as ", c, "is to ",d,"\n")
            tuples = model.wv.most_similar(
                positive=[c, b], negative=[d], topn=topn)

            for i in range(0, topn):
                # Get subset of top i tuple
                sub_tuple = tuples[0:i]

                # Filter out the scores, only the word shall remain
                word_list = [tup[0] for tup in sub_tuple]

                # Count if it is in the word list
                if a in word_list:
                    topn_scores[i] = topn_scores[i] + 1

        # Normalize and assign
        count = len(tasks)
        
        if count == 0:
            scores[header] = [0] * topn
        else:
            scores[header] = [n / count for n in topn_scores]

    return scores

# Sample usage
print(accuracy(model, open("output.magkasalungat.csv", "r", encoding="utf-8"), 10))

ValueError: too many values to unpack (expected 4)

In [None]:
mo