In [2]:
import numpy as np
from scipy.spatial import distance
from gensim.models import KeyedVectors

In [3]:
HD_W2V_PATH = "./word2vec/GoogleNews-vectors-negative300-hard-debiased.bin.gz"
W2V_PATH = "./word2vec/GoogleNews-vectors-negative300.bin.gz"

In [5]:
model = KeyedVectors.load_word2vec_format(HD_W2V_PATH, binary=True)

In [6]:
model_ND = KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

In [14]:
def get_word_centroid_vec(model, doc):
    """ Convert the document to a vector using the word centroid method """
    wcm = None
    for wrd in doc:
        try:
            vec = model[wrd]
        except:
            continue

        if wcm is None:
            wcm = vec
        wcm += vec

    wcm /= float(len(doc))
    return wcm

In [86]:
def cosine_filter_candidates(candidates, job):
    """ Filter candidates using the cosine similarity """

    scores = []
    for candidate in candidates:
        scores.append(distance.cosine(candidate, job))

    # Index list of best candidates sorted in descending order
    ranks = sorted(range(len(scores)), key=lambda k: scores[k])

    return ranks

def euclidean_filter_candidates(candidates, job):
    """ Filter candidates using the euclidean distance """

    scores = []
    for candidate in candidates:
        scores.append(distance.euclidean(candidate, job))

    # Index list of best candidates sorted in descending order
    ranks = sorted(range(len(scores)), key=lambda k: scores[k])

    return ranks

def jaccard_filter_candidates(candidates, job):
    """ Filter candidates using the jacard distance """

    scores = []
    for candidate in candidates:
        scores.append(distance.jaccard(candidate, job))

    # Index list of best candidates sorted in descending order
    ranks = sorted(range(len(scores)), key=lambda k: scores[k])

    return ranks

def load_jobs(filename):
    """ Load the jobs from the target file """
    # For now just return dummy list

    jobs = [
        ["Computer", "science", "software", "data", "science", "engineering", "junior", "engineer"],
        ["Computer", "hardware", "circuit", "data", "science", "engineering", "junior", "engineer"],
        ["Computer", "hardware", "electrical", "data", "science", "engineering", "junior", "engineer"]
    ]

    return jobs

def load_candidates(filename):
    """ Load the jobs from the target file """
    # For now just return dummy list

    candidates = [
        ["woman", "computer science"],
        ["man", "computer science"],
        ["she", "computer science"],
        ["he", "computer science"],
        ["she", "woman", "computer science"],
    ]

    candidate_genders = [
        "F", "M", "F", "M", "F"
    ]

    return {"candidates": candidates, "genders": candidate_genders}


In [87]:
model['melinda'].shape

(300,)

In [88]:
users = load_candidates("dummy.csv")
user_profiles, user_genders = users['candidates'], users['genders']
job_profiles = load_jobs("dummy_j.csv")

In [89]:
user_vectors = [get_word_centroid_vec(model, u) for u in user_profiles]

In [90]:
job_vectors = [get_word_centroid_vec(model, j) for j in job_profiles]

In [91]:
cosine_filter_candidates(user_vectors, job_vectors[0])

[3, 2, 4, 0, 1]

In [92]:
euclidean_filter_candidates(user_vectors, job_vectors[0])

[0, 1, 2, 4, 3]

In [93]:
user_vectors = [get_word_centroid_vec(model_ND, u) for u in user_profiles]

In [94]:
job_vectors = [get_word_centroid_vec(model_ND, j) for j in job_profiles]

In [95]:
cosine_filter_candidates(user_vectors, job_vectors[0])

[3, 2, 4, 1, 0]

In [96]:
euclidean_filter_candidates(user_vectors, job_vectors[0])

[1, 0, 2, 4, 3]