In [None]:
import pickle
import numpy as np

# Load the vocabulary file
with open('1900-vocab.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# Load the NumPy array file
w = np.load('1900-w.npy')

# Inspecting the contents
vocab_sample = vocab[:5]  # Sample of the vocabulary, showing the first 5 items
w_shape = w.shape  # Shape of the NumPy array to understand its dimensions

vocab_sample, w_shape


In [None]:
import numpy as np

# Assuming `vocab` is your vocabulary list and `w` is your embeddings numpy array

# Step 1: Find the index of "gay" in the vocabulary
word = "gay"
try:
    index = vocab.index(word)
except ValueError:
    print(f"Word '{word}' not found in vocabulary.")
    index = None

if index is not None:
    # Step 2: Calculate cosine similarity between "gay"'s embedding and all others
    # Normalize the vectors to unit length
    w_normalized = w / np.linalg.norm(w, axis=1, keepdims=True)
    word_vector = w_normalized[index]
    similarities = np.dot(w_normalized, word_vector)
    
    # Step 3: Sort the words by similarity
    # Exclude the word itself by setting its similarity to -1
    similarities[index] = -1
    nearest_indices = np.argsort(-similarities)[:10]  # Get indices of top-10 nearest words
    
    # Step 4: Display the nearest words
    nearest_words = [vocab[i] for i in nearest_indices]
    print("Nearest words to 'gay':", nearest_words)

In [60]:
import numpy as np
import pickle

def find_nearest_words(vocab_file_path, embeddings_file_path, target_word, top_n=10):
    # Load the vocabulary and embeddings
    with open(vocab_file_path, 'rb') as file:
        vocab = pickle.load(file)
    w = np.load(embeddings_file_path)
    
    # Normalize embeddings
    w_normalized = w / np.linalg.norm(w, axis=1, keepdims=True)
    
    # Check if the target word is in the vocabulary
    if target_word in vocab:
        index = vocab.index(target_word)
        word_vector = w_normalized[index]
        
        # Calculate cosine similarity
        similarities = np.dot(w_normalized, word_vector)
        similarities[index] = -1  # Ignore the word itself
        
        # Find the nearest words
        nearest_indices = np.argsort(-similarities)[:top_n]
        nearest_words = [vocab[i] for i in nearest_indices]
        nearest_similarities = similarities[nearest_indices]
        
        # Print the nearest words and their similarities
        print(f"Nearest words to '{target_word}' in {vocab_file_path.split('-')[0]}s:")
        for word, similarity in zip(nearest_words, nearest_similarities):
            print(f"{word}: {similarity:.4f}")
    else:
        print(f"Word '{target_word}' not found in the vocabulary for {vocab_file_path.split('-')[0]}s.")

# Example usage for the decades of interest
find_nearest_words('1900-vocab.pkl', '1900-w.npy', 'gay')
find_nearest_words('1950-vocab.pkl', '1950-w.npy', 'gay')
find_nearest_words('1990-vocab.pkl', '1990-w.npy', 'gay')

  w_normalized = w / np.linalg.norm(w, axis=1, keepdims=True)


Nearest words to 'gay' in 1900s:
lively: 0.4500
cheery: 0.4390
lark: 0.4136
humoured: 0.4119
jovial: 0.4013
apparel: 0.3890
cheerful: 0.3850
brilliant: 0.3829
natured: 0.3627
dresses: 0.3527
Nearest words to 'gay' in 1950s:
jovial: 0.4178
cheerful: 0.3944
lively: 0.3550
thoughtless: 0.3496
humoured: 0.3487
careless: 0.3483
witty: 0.3336
merry: 0.3314
dresses: 0.3295
joyous: 0.3263
Nearest words to 'gay' in 1990s:
lesbian: 0.6767
transgender: 0.6188
lesbians: 0.5500
katz: 0.5465
bisexual: 0.5296
bisexuals: 0.4834
coalition: 0.4513
gays: 0.4273
bi: 0.4142
gras: 0.4139


In [61]:
find_nearest_words('1850-vocab.pkl', '1850-w.npy', 'mail')
find_nearest_words('1900-vocab.pkl', '1900-w.npy', 'mail')
find_nearest_words('1990-vocab.pkl', '1990-w.npy', 'mail')

  w_normalized = w / np.linalg.norm(w, axis=1, keepdims=True)


Nearest words to 'mail' in 1850s:
velvet: 0.6453
horn: 0.6427
loaded: 0.6394
silk: 0.6354
lace: 0.6341
cap: 0.6296
powder: 0.6282
bowl: 0.6247
feathers: 0.6244
robe: 0.6205
Nearest words to 'mail' in 1900s:
dover: 0.4131
coaches: 0.3572
coach: 0.3517
coat: 0.3404
hackney: 0.3396
ordered: 0.3352
boat: 0.3271
vest: 0.3262
cart: 0.3197
coats: 0.3163
Nearest words to 'mail' in 1990s:
coupon: 0.4346
102902: 0.4256
hahn: 0.4064
backlist: 0.3979
relayed: 0.3869
fax: 0.3762
ba: 0.3637
fawcett: 0.3590
cods: 0.3582
letter: 0.3521


In [62]:
find_nearest_words('1900-vocab.pkl', '1900-w.npy', 'stress')
find_nearest_words('1950-vocab.pkl', '1950-w.npy', 'stress')
find_nearest_words('1990-vocab.pkl', '1990-w.npy', 'stress')

  w_normalized = w / np.linalg.norm(w, axis=1, keepdims=True)


Nearest words to 'stress' in 1900s:
excessive: 0.3580
owing: 0.3377
strain: 0.3306
violent: 0.3233
indications: 0.3168
storm: 0.3111
hardships: 0.3108
degradation: 0.3098
mental: 0.3026
fatigue: 0.3024
Nearest words to 'stress' in 1950s:
weakened: 0.3575
strife: 0.3203
consequent: 0.3177
exhaustion: 0.3130
emphasis: 0.3025
hardships: 0.3003
memorable: 0.2985
futility: 0.2905
abated: 0.2859
strain: 0.2830
Nearest words to 'stress' in 1990s:
traumatic: 0.4323
strain: 0.3782
emphasis: 0.3280
delaying: 0.3145
exhaustion: 0.3124
pressure: 0.3102
trauma: 0.3070
attach: 0.3030
forethought: 0.2972
usage: 0.2953
