In [3]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Source GloVe text file and the path the converted copy is written to.
glove_file = "glove.6B.300d.txt"
word2vec_output_file = "glove.6B.300d.word2vec"

# Step 1: convert the GloVe file to the word2vec text layout.
# Step 2: load the converted file (text, not binary) as gensim KeyedVectors.
# NOTE(review): the warning echoed in this cell's recorded output suggests
# glove2word2vec is deprecated in the installed gensim; gensim 4.x documents
# KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)
# as the direct way to read GloVe text. Confirm the gensim version before
# switching. `model` is not referenced by the other cells visible here.
glove2word2vec(glove_file, word2vec_output_file)
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

  glove2word2vec(glove_file, word2vec_output_file)


[('queen', 0.6336469054222107), ('prince', 0.6196622848510742), ('monarch', 0.5899620652198792), ('kingdom', 0.5791266560554504), ('throne', 0.5606487989425659), ('ii', 0.5562329292297363), ('iii', 0.5503199100494385), ('crown', 0.5224862694740295), ('reign', 0.5217353701591492), ('kings', 0.5066401958465576)]


In [25]:
# The two word lists compared throughout the notebook.
# NOTE(review): how these lists were derived is not shown here; some entries
# look like stemmed or abbreviated tokens (e.g. "cri", "adj") - confirm the source.
female = [
    "female", "tv", "sport", "susan", "today", "verb", "beauty", "lesson", "party", "ok",
    "early", "usual", "emily", "often", "baby", "dad", "peter", "stevenson", "yesterday", "hope",
    "marriage", "photo", "st", "adj", "michael", "weekend", "bag", "pair", "tom", "dress",
    "niece", "enjoy", "lyn", "mum", "arrive", "bad", "french", "power", "problem", "care",
    "ed", "london", "practice", "short", "someone", "train", "voice", "without", "busy", "dance",
    "done", "drink", "drive", "grammar", "near", "note", "sing", "sometime", "stand", "wash",
    "wonder",
]
male = [
    "prince", "romeo", "juliet", "arthur", "kill", "dead", "tree", "god", "ben", "wise",
    "water", "face", "lawrence", "order", "become", "shall", "april", "paris", "cloth", "england",
    "film", "sound", "carry", "heard", "soon", "found", "land", "leon", "open", "laugh",
    "hermit", "sword", "draft", "dream", "gave", "lo", "around", "cri", "goes", "head",
    "white", "hour", "later", "street", "might", "police", "task", "idea", "bed", "bedivere",
    "mark", "match", "met", "six", "summer", "true", "modred", "game", "martin", "news",
]

In [10]:
import numpy as np
from scipy.spatial.distance import cosine

def load_glove_model(glove_file):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first token is taken as
    the word and the remaining tokens are parsed as float32 components.

    Args:
        glove_file: Path to a UTF-8 text file with one "word v1 v2 ..." entry
            per line.

    Returns:
        dict mapping each word to a 1-D ``np.float32`` array. If a word occurs
        on several lines, the last occurrence wins.

    Raises:
        ValueError: if a component token cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_file, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank or whitespace-only lines (e.g. a trailing empty line)
                # carry no entry; indexing values[0] would raise IndexError.
                continue
            embeddings[values[0]] = np.array(values[1:], dtype=np.float32)
    return embeddings

# Parse the GloVe text file into a plain dict of word -> NumPy vector; this is
# the lookup table passed as `embeddings` to the analysis functions in later cells.
# NOTE(review): the "300d" in the filename suggests 300-dimensional vectors - not checked here.
glove_vectors = load_glove_model("glove.6B.300d.txt")

def average_pairwise_similarity(words, embeddings):
    """Mean cosine similarity over every unordered pair of known words.

    Words that are not keys of ``embeddings`` are dropped silently.

    Returns:
        The mean of ``1 - cosine(u, v)`` over all pairs, or ``None`` when
        fewer than two of the words have an embedding.
    """
    known = [embeddings[token] for token in words if token in embeddings]
    if len(known) < 2:
        return None  # Not enough valid words

    # Pair each vector only with the ones after it so every pair is scored once.
    pair_scores = [
        1 - cosine(first, second)
        for pos, first in enumerate(known)
        for second in known[pos + 1:]
    ]
    return np.mean(pair_scores)

In [11]:
# Internal cohesion of each list: mean cosine similarity over all within-list pairs.
avgSimilarityFemale, avgSimilarityMale = (
    average_pairwise_similarity(word_list, glove_vectors)
    for word_list in (female, male)
)
print(avgSimilarityFemale, avgSimilarityMale)

0.14951851686816736 0.15208059726675743


In [14]:
def compute_average_distance(words, target_word, embeddings):
    """Average cosine distance from ``target_word`` to each word in ``words``.

    Args:
        words: Iterable of words to compare against the target.
        target_word: Word whose vector is the reference point.
        embeddings: Mapping of word -> vector.

    Returns:
        The mean cosine distance over the words that have an embedding, or
        ``None`` (after printing a message) when the target is missing or
        none of ``words`` has an embedding.

    Words without an embedding are reported with a printed message and
    excluded from the average.
    """
    if target_word not in embeddings:
        print(f"Target word '{target_word}' not found in embeddings.")
        return None

    target_vector = embeddings[target_word]
    distances = []
    for word in words:
        if word in embeddings:
            # scipy's cosine() already returns the distance (1 - similarity),
            # so no similarity round-trip is needed.
            distances.append(cosine(target_vector, embeddings[word]))
        else:
            print(f"Word '{word}' not found in embeddings.")

    if not distances:
        print("No valid words found in embeddings.")
        return None

    return sum(distances) / len(distances)

In [27]:
# Average cosine distance from "death" to each list (female first, then male).
for word_list in (female, male):
    print(compute_average_distance(word_list, "death", glove_vectors))

0.8277830121380514
0.791310230345657


In [33]:
# Average cosine distance from "food" to each list (female first, then male).
for word_list in (female, male):
    print(compute_average_distance(word_list, "food", glove_vectors))

0.8379290468008587
0.8524189559185354


In [30]:
# Average cosine distance from "baby" to each list (female first, then male).
for word_list in (female, male):
    print(compute_average_distance(word_list, "baby", glove_vectors))

0.7860307028214608
0.8319752556243112


In [39]:
# Average cosine distance from "pretty" to each list (female first, then male).
for word_list in (female, male):
    print(compute_average_distance(word_list, "pretty", glove_vectors))

0.7610475418217224
0.8070285897044759


In [40]:
# Average cosine distance from "love" to each list (female first, then male).
for word_list in (female, male):
    print(compute_average_distance(word_list, "love", glove_vectors))

0.7472878959950222
0.7761614034862541


In [44]:
# Average cosine distance from "violence" to each list (female first, then male).
for word_list in (female, male):
    print(compute_average_distance(word_list, "violence", glove_vectors))

0.8704367872561857
0.8594025454232637


In [21]:
from scipy.spatial.distance import cosine

def compute_average_distance_to_cluster(word, cluster_words, embeddings):
    """Compute the average cosine distance of a word to all words in a cluster.

    Args:
        word: Candidate word whose distance to the cluster is measured.
        cluster_words: Iterable of words forming the cluster.
        embeddings: Mapping of word -> vector.

    Returns:
        The mean cosine distance from ``word`` to every cluster word that has
        an embedding, or ``None`` when ``word`` itself has no embedding or no
        cluster word has one.

    Cluster words without an embedding are reported with a printed message
    and excluded from the average.
    """
    if word not in embeddings:
        return None

    # Look the candidate up once instead of on every loop iteration.
    word_vector = embeddings[word]
    distances = []
    for cluster_word in cluster_words:
        if cluster_word in embeddings:
            # scipy's cosine() already returns the distance (1 - similarity).
            distances.append(cosine(word_vector, embeddings[cluster_word]))
        else:
            print(f"Cluster word '{cluster_word}' not found in embeddings.")

    if not distances:
        return None

    return sum(distances) / len(distances)

def find_central_word(cluster_words, embeddings, search_in_vocab=False):
    """Find the word with the smallest average cosine distance to the cluster.

    Args:
        cluster_words: Iterable of words forming the cluster.
        embeddings: Mapping of word -> vector.
        search_in_vocab: If True, every key of ``embeddings`` is a candidate;
            otherwise only the cluster words themselves are candidates.

    Returns:
        ``(central_word, min_avg_distance)``. When no candidate can be scored
        (no cluster word or no candidate has an embedding) the result is
        ``(None, float('inf'))``. Ties keep the first candidate encountered.

    The cluster vectors are resolved and normalized once up front, so a
    cluster word without an embedding is reported once per call. Zero-length
    vectors are skipped because cosine distance is undefined for them.
    """
    # Resolve the cluster once; this work does not depend on the candidate.
    unit_vectors = []
    for cluster_word in cluster_words:
        if cluster_word not in embeddings:
            print(f"Cluster word '{cluster_word}' not found in embeddings.")
            continue
        vector = np.asarray(embeddings[cluster_word], dtype=np.float64)
        norm = np.linalg.norm(vector)
        if norm > 0:
            unit_vectors.append(vector / norm)

    central_word = None
    min_avg_distance = float('inf')
    if not unit_vectors:
        return central_word, min_avg_distance

    # For a candidate c and unit cluster vectors u_i:
    #   mean_i(1 - cos(c, u_i)) == 1 - (c / |c|) . mean_i(u_i)
    # so one dot product per candidate replaces one per (candidate, cluster word).
    mean_direction = np.mean(unit_vectors, axis=0)

    # If search_in_vocab is True, search the entire vocabulary for the central word
    search_words = embeddings.keys() if search_in_vocab else cluster_words

    for word in search_words:
        if word not in embeddings:
            continue
        candidate = np.asarray(embeddings[word], dtype=np.float64)
        norm = np.linalg.norm(candidate)
        if norm == 0:
            continue
        avg_distance = 1.0 - float(candidate @ mean_direction) / float(norm)
        if avg_distance < min_avg_distance:
            min_avg_distance = avg_distance
            central_word = word

    return central_word, min_avg_distance

In [35]:
# Central word of the female list, candidates restricted to the list's own
# members (search_in_vocab defaults to False).
central_word, avg_distance = find_central_word(female, glove_vectors)
print(f"Central word (within female list): {central_word}, Average distance: {avg_distance}")

# Same search, but every word in the embedding vocabulary is a candidate.
central_word, avg_distance = find_central_word(
    female, glove_vectors, search_in_vocab=True
)
print(f"Central word (in entire vocabulary): {central_word}, Average distance: {avg_distance}")


Central word (within female list): done, Average distance: 0.7400832127994309
Central word (in entire vocabulary): n't, Average distance: 0.6911780058018797


In [36]:
# Find the central word within the male list
# (the label previously said "female list" although `male` is analyzed here).
central_word, avg_distance = find_central_word(male, glove_vectors, search_in_vocab=False)
print(f"Central word (within male list): {central_word}, Average distance: {avg_distance}")

# Find the central word for the male list in the entire vocabulary
central_word, avg_distance = find_central_word(male, glove_vectors, search_in_vocab=True)
print(f"Central word (in entire vocabulary): {central_word}, Average distance: {avg_distance}")


Central word (within female list): later, Average distance: 0.7181909478531953
Central word (in entire vocabulary): but, Average distance: 0.6918507222120284
