In [424]:
import os
import random
import pickle
import zipfile
import numpy as np

#### Load HistWords dataset:

In [425]:
# # dataset obtained from here: https://github.com/williamleif/histwords

# # load dataset here:
# !git clone https://github.com/williamleif/histwords.git
# !wget -O all_english_embeddings.zip "http://snap.stanford.edu/historical_embeddings/eng-all_sgns.zip"

# # unzip loaded dataset
# with zipfile.ZipFile("all_english_embeddings.zip", 'r') as zip_ref:
#     zip_ref.extractall("all_english_embeddings")

## Step 1: Load and Preprocess Word Embeddings

In [426]:
# for loading historical embeddings from .npy files
def load_embeddings(file_path):
    return np.load(file_path)

In [427]:
# base dir for embeddings
base_dir = "all_english_embeddings/sgns"
print(os.getcwd()) # debugging

/Users/bryan/Documents/College/Computer Science/CMSC 491 - Modern Regression/Project/code


#### Obtain subsample of words from dataset:

In [428]:
# specify decades of interest
decades = ['1880', '1890', '1900', '1910', '1920', '1930', '1940', '1950', '1960', '1970', '1980']
vocab_dict = {}
embeddings_dict = {}
random.seed(690)         # set desired seed here
sample_size = 500       # set desired sample size here

In [429]:
# load and optionally sample each decade's embeddings and vocabulary
for decade in decades:
    # load embeddings
    embedding_path = os.path.join(base_dir, f"{decade}-w.npy")
    embeddings = load_embeddings(embedding_path)
    
    # load vocabulary
    vocab_path = os.path.join(base_dir, f"{decade}-vocab.pkl")
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # ensure embeddings and vocab are same size
    assert len(embeddings) == len(vocab), f"Mismatch in size for {decade}"

    # filter out any numerical values counted as "words"
    filtered_vocab = []
    filtered_embeddings = []
    for word, embedding in zip(vocab, embeddings):
        if not any(char.isdigit() for char in word):
            filtered_vocab.append(word)
            filtered_embeddings.append(embedding)

    # fill dicts
    vocab_dict[decade] = filtered_vocab
    embeddings_dict[decade] = np.array(filtered_embeddings)

#### Filter subsamples by intersection across decades:

In [430]:
# obtain common vocabulary across all decades
common_vocab = set(vocab_dict[decades[0]])
for decade in decades[1:]:
    common_vocab.intersection_update(vocab_dict[decade])

common_vocab = list(common_vocab)  # converted to list for indexing

In [431]:
# precompute word_to_index dictionaries for each decade
word_to_index_dict = {}
for decade in decades:
    vocab = vocab_dict[decade]
    word_to_index = {word: idx for idx, word in enumerate(vocab)}
    word_to_index_dict[decade] = word_to_index

# rand sample from common_vocab
initial_sample_size = sample_size * 2  # increase here to account for possible zero vectors
sampled_words = random.sample(common_vocab, min(initial_sample_size, len(common_vocab)))

valid_words = []
zero_vector = np.zeros(embeddings_dict[decades[0]].shape[1])

for word in sampled_words:
    has_zero_vector = False
    for decade in decades:
        idx = word_to_index_dict[decade][word]
        embedding = embeddings_dict[decade][idx]
        if np.array_equal(embedding, zero_vector):
            has_zero_vector = True
            break
    if not has_zero_vector:
        valid_words.append(word)
    if len(valid_words) >= sample_size:
        break  # stop if we have enough valid words

print(f"Number of valid words (non-zero embeddings in all decades): {len(valid_words)}")

# update vocab_dict and embeddings_dict
for decade in decades:
    embeddings = embeddings_dict[decade]
    word_to_index = word_to_index_dict[decade]
    
    # obtain embeddings for valid_words
    indices = [word_to_index[word] for word in valid_words]
    filtered_embeddings = embeddings[indices]
    
    # update dicts with filtered values
    vocab_dict[decade] = valid_words
    embeddings_dict[decade] = filtered_embeddings


Number of valid words (non-zero embeddings in all decades): 263


In [432]:
# debugging
for decade, embeddings in embeddings_dict.items():
    print(f"{decade}'s embeddings:")
    print(f"shape: {embeddings.shape}") # display shape of embeddings per decade
    print(embeddings_dict[decade][:5])  # display first n rows of embeddings per decade
    print()

1880's embeddings:
shape: (263, 300)
[[ 0.09041874 -0.00612802  0.01955957 ...  0.02496664  0.02773585
   0.13545066]
 [ 0.06082223  0.00696597 -0.01710782 ... -0.00121106 -0.05964078
  -0.00937574]
 [-0.08250675  0.11109397  0.06320263 ... -0.03088265  0.02888915
   0.05499104]
 [ 0.01258085  0.0607544  -0.00747693 ...  0.00385742  0.0275719
   0.00498267]
 [ 0.02761255  0.09864677 -0.07375364 ...  0.00660208 -0.11403383
  -0.00164322]]

1890's embeddings:
shape: (263, 300)
[[ 0.03266551 -0.01404713 -0.03602988 ... -0.02661876  0.00807596
   0.08193445]
 [ 0.05486863  0.03193286  0.02250172 ... -0.0227119  -0.07622627
  -0.0087016 ]
 [-0.12209348  0.02937011 -0.03219827 ...  0.01018743 -0.01123918
   0.06216462]
 [ 0.03176818  0.06476442 -0.00460684 ... -0.0217839   0.01552975
   0.01805309]
 [ 0.02049031  0.09460364  0.00652229 ...  0.00168115 -0.02624192
  -0.02895931]]

1900's embeddings:
shape: (263, 300)
[[ 0.01112203  0.0405182   0.05997216 ...  0.04752148 -0.03171587
   0.04911

## Step 2: Standardize Data

In [433]:
def standardize(data):
    """
    Manually standardizes data by centering (subtracting mean) and scaling (dividing by standard deviation).
    
    Parameters:
    data (numpy.ndarray): Input data matrix of shape (num_vectors, num_dimensions).
    
    Returns:
    standardized_data (numpy.ndarray): Standardized data with zero mean and unit variance.
    mean_vector (numpy.ndarray): Computed mean vector.
    std_vector (numpy.ndarray): Computed standard deviation vector.
    """
    num_vectors, num_dimensions = len(data), len(data[0])
    mean_vector = [0] * num_dimensions
    std_vector = [0] * num_dimensions
    
    # compute mean
    for vector in data:
        mean_vector += vector
    mean_vector /= num_vectors
    
    # compute standard deviation
    for vector in data:
        std_vector += (vector - mean_vector) ** 2
    std_vector = (std_vector / num_vectors) ** 0.5
    
    # avoid division by zero (s.t. if a dimension has zero variance, set std to 1)
    std_vector[std_vector == 0] = 1.0
    
    # center and scale data
    standardized_data = [0] * len(data)
    for i in range(num_vectors):
        standardized_data[i] = (data[i] - mean_vector) / std_vector
    
    return np.array(standardized_data), np.array(mean_vector), np.array(std_vector)

In [434]:
std_embeddings_dict = {}
for decade, embeddings in embeddings_dict.items():
    standardized_data, _, _ = standardize(embeddings_dict[decade])
    std_embeddings_dict[decade] = standardized_data

#### Before centering (values small, but not entirely close to zero):

In [435]:
# debugging
for decade, embeddings in embeddings_dict.items():
    _, mean_vector, _ = standardize(embeddings)
    print(f"Mean vector for non-centered embeddings ({decade}'s):")
    print(mean_vector[:5])
    print()

Mean vector for non-centered embeddings (1880's):
[ 0.01598126  0.028779    0.00514996 -0.0028301  -0.00697005]

Mean vector for non-centered embeddings (1890's):
[ 0.01419444  0.02804747  0.01079166 -0.00275307  0.00041992]

Mean vector for non-centered embeddings (1900's):
[ 0.01516215  0.02700809  0.00477829 -0.00847414 -0.0021769 ]

Mean vector for non-centered embeddings (1910's):
[ 0.02383563  0.03073717  0.01008311 -0.01221684 -0.00330186]

Mean vector for non-centered embeddings (1920's):
[ 0.02116798  0.03588789  0.00803558 -0.01314943 -0.00383174]

Mean vector for non-centered embeddings (1930's):
[ 0.02326301  0.03230297  0.01036185 -0.00597291 -0.00884577]

Mean vector for non-centered embeddings (1940's):
[ 0.02814097  0.03647311  0.0126074  -0.00780998 -0.00626966]

Mean vector for non-centered embeddings (1950's):
[ 0.01986312  0.0400191   0.01201099 -0.00893077 -0.0073232 ]

Mean vector for non-centered embeddings (1960's):
[ 0.02298411  0.03942112  0.0145983  -0.011436

#### After centering (values now much closer to zero):

In [436]:
# debugging
for decade, embeddings in std_embeddings_dict.items():
    _, mean_vector, _ = standardize(embeddings)
    print(f"Mean vector for centered embeddings ({decade}'s):")
    print(mean_vector[:5])
    print()

Mean vector for centered embeddings (1880's):
[ 3.96809750e-17 -8.44276064e-17  3.46153186e-17 -3.88366990e-17
 -3.92588370e-17]

Mean vector for centered embeddings (1890's):
[-8.14726402e-17 -1.30018514e-16 -1.90806391e-16 -6.92306373e-17
  1.77297974e-17]

Mean vector for centered embeddings (1900's):
[ 3.88366990e-17  2.36397298e-17 -2.27954537e-17 -6.16321527e-17
  2.53282819e-17]

Mean vector for centered embeddings (1910's):
[-1.87429286e-16  1.31707066e-16  6.33207048e-17 -7.42962937e-17
  5.25561850e-17]

Mean vector for centered embeddings (1920's):
[ 1.99249151e-16  6.66978091e-17 -3.29267665e-17  8.27390543e-17
 -2.91275242e-17]

Mean vector for centered embeddings (1930's):
[ 1.53658244e-16  1.09755888e-16 -5.86771865e-17  8.44276064e-18
  1.23053236e-16]

Mean vector for centered embeddings (1940's):
[-1.78142250e-16  2.37241574e-16  1.57879624e-16  3.12382144e-17
 -8.61161586e-17]

Mean vector for centered embeddings (1950's):
[-7.78844669e-17 -2.44840059e-17 -2.10646878

## Step 3: Compute Covariance Matrix

In [437]:
def compute_cv_matrix(data):
    """
    Manually computes covariance matrix of input data.
    
    Parameters:
    data (numpy.ndarray): Centered data matrix of shape (num_vectors, num_dimensions).
    
    Returns:
    covariance_matrix (numpy.ndarray): Covariance matrix of shape (num_dimensions, num_dimensions).
    """
    data = np.array(data)
    num_vectors = data.shape[0] # init cv matrix
    covariance_matrix = (data.T @ data) / (num_vectors - 1) # cv cmoputed with matrix multiplication and element-wise averaging

    return covariance_matrix

In [438]:
cv_matrices_dict = {}
for decade, embedding in std_embeddings_dict.items():
    cv_matrices_dict[decade] = compute_cv_matrix(embedding)

In [439]:
# debugging
for decade, matrix in cv_matrices_dict.items():
    print(f"{decade}'s covariance matrix shape: {cv_matrices_dict[decade].shape}")
    # the cv matrix should be symmetric
    print("Is symmetric? ", np.allclose(cv_matrices_dict[decade], cv_matrices_dict[decade].T)) # should print True if symmetric
    print()

1880's covariance matrix shape: (300, 300)
Is symmetric?  True

1890's covariance matrix shape: (300, 300)
Is symmetric?  True

1900's covariance matrix shape: (300, 300)
Is symmetric?  True

1910's covariance matrix shape: (300, 300)
Is symmetric?  True

1920's covariance matrix shape: (300, 300)
Is symmetric?  True

1930's covariance matrix shape: (300, 300)
Is symmetric?  True

1940's covariance matrix shape: (300, 300)
Is symmetric?  True

1950's covariance matrix shape: (300, 300)
Is symmetric?  True

1960's covariance matrix shape: (300, 300)
Is symmetric?  True

1970's covariance matrix shape: (300, 300)
Is symmetric?  True

1980's covariance matrix shape: (300, 300)
Is symmetric?  True



## Step 4: Compute Eigenvalues/Eigenvectors of Covariance Matrix

In [440]:
def random_vector(size):
    """
    Generates a random vector of specified size using Python's random module.
    
    Parameters:
    size (int): Size of random vector.
    
    Returns:
    list: Random vector with values between 0 and 1.
    """
    return [random.random() for _ in range(size)]

In [441]:
def l2_norm(vector):
    """
    Manually computes Euclidean (L2) norm of vector.
    
    Parameters:
    vector (numpy.ndarray): Input vector.
    
    Returns:
    float: Euclidean norm of vector.
    """
    sum_of_squares = 0.0
    for value in vector:
        sum_of_squares += value ** 2
    return sum_of_squares ** 0.5

In [442]:
def power_iteration(matrix, num_iterations=1000, tolerance=1e-9):
    """
    Manually computes largest eigenvalue and its corresponding eigenvector using power iteration.
    
    Parameters:
    matrix (numpy.ndarray): Input square matrix.
    num_iterations (int): Number of iterations to perform.
    tolerance (float): Convergence tolerance.
    
    Returns:
    largest_eigenvalue (float): Largest eigenvalue.
    eigenvector (numpy.ndarray): Corresponding eigenvector.
    """
    # init random vector
    b_k = random_vector(matrix.shape[1])
    
    for _ in range(num_iterations):
        # matrix-vector multiplication
        b_k1 = (matrix @ b_k)

        # normalize vector
        b_k1_norm = l2_norm(b_k1)
        b_k1 = b_k1 / b_k1_norm
        
        # check convergence
        if l2_norm(b_k1 - b_k) < tolerance:
            break
        b_k = b_k1
    
    # compute corresponding eigenvalue
    largest_eigenvalue = (b_k.T @ (matrix @ b_k)) / (b_k.T @ b_k)
    
    return largest_eigenvalue, b_k

In [443]:
# debugging
for decade, matrix in cv_matrices_dict.items():
    largest_eigenvalue, eigenvector = power_iteration(matrix)
    print(f"{decade}'s Largest Eigenvalue:", largest_eigenvalue)
    print()

1880's Largest Eigenvalue: 9.704292564260058

1890's Largest Eigenvalue: 9.35315399403982

1900's Largest Eigenvalue: 9.589888368240851

1910's Largest Eigenvalue: 9.807224349658869

1920's Largest Eigenvalue: 9.644302962474768

1930's Largest Eigenvalue: 9.601697574114038

1940's Largest Eigenvalue: 9.998662355013959

1950's Largest Eigenvalue: 10.12111658956849

1960's Largest Eigenvalue: 10.572366252798298

1970's Largest Eigenvalue: 11.07305471996823

1980's Largest Eigenvalue: 11.276444778079394



## Step 5: Select Largest k Eigenvalues

In [444]:
def largest_k_eigenvalues(matrix, k, num_iterations=1000, tolerance=1e-9):
    """
    Computes largest k eigenvalues and their corresponding eigenvectors using power iteration and deflation.
    
    Parameters:
    matrix (numpy.ndarray): Input square matrix.
    k (int): Number of top eigenvalues/eigenvectors to find.
    num_iterations (int): Number of iterations for each power iteration step.
    tolerance (float): Convergence tolerance for power iteration.
    
    Returns:
    eigenvalues (list): List of top k eigenvalues.
    eigenvectors (list): List of corresponding eigenvectors.
    """
    matrix_copy = matrix[:] # copy of matrix to avoid modifying original
    eigenvalues = []
    eigenvectors = []

    for _ in range(k):
        # find largest eigenvalue and corresponding eigenvector
        largest_eigenvalue, eigenvector = power_iteration(matrix_copy, num_iterations, tolerance)
        eigenvalues.append(largest_eigenvalue)
        eigenvectors.append(eigenvector)

        # deflation step, remove component corresponding to found eigenvector
        matrix_copy = matrix_copy - largest_eigenvalue * np.outer(eigenvector, eigenvector)

    return eigenvalues, np.array(eigenvectors)

In [445]:
# debugging
k = 3
eigenvalues_dict = {}
eigenvectors_dict = {}
for decade, matrix in cv_matrices_dict.items():
    eigenvalues, eigenvectors = largest_k_eigenvalues(matrix, k)
    eigenvalues_dict[decade], eigenvectors_dict[decade] = eigenvalues, eigenvectors
    print(f"{decade}'s largest k = {k} eigenvalues:", eigenvalues)
    print()

1880's largest k = 3 eigenvalues: [9.704292564260049, 9.287608736724541, 7.855021782308997]

1890's largest k = 3 eigenvalues: [9.353153994039824, 9.207174955452224, 7.134263722072801]

1900's largest k = 3 eigenvalues: [9.589888368240851, 9.354388424985924, 7.456796700655962]

1910's largest k = 3 eigenvalues: [9.807224349658863, 9.113684786117691, 7.299313597973953]

1920's largest k = 3 eigenvalues: [9.644302962474764, 9.232778461227895, 7.147979469309917]

1930's largest k = 3 eigenvalues: [9.601697574114025, 9.386224696842914, 7.052220851361747]

1940's largest k = 3 eigenvalues: [9.998662355013966, 9.032794084579256, 7.089983530456277]

1950's largest k = 3 eigenvalues: [10.121116589568487, 9.150386665366526, 7.108610318970511]

1960's largest k = 3 eigenvalues: [10.5723662527983, 9.285836803009172, 7.170663310361255]

1970's largest k = 3 eigenvalues: [11.07305471996823, 8.960012685948968, 6.9122511356750245]

1980's largest k = 3 eigenvalues: [11.276444778079396, 8.645903574261

## Step 6: Transform Data Onto New Space

In [446]:
def transpose(matrix):
    """
    Manually transposes a given matrix.
    
    Parameters:
    matrix (list of lists): Input matrix for transposition.
    
    Returns:
    transposed (list of lists): Transposed matrix.
    """
    return [[row[i] for row in matrix] for i in range(len(matrix[0]))]

#### Before transpose (mismatched dims between data and eigenvectors):

In [447]:
print(f"Shape of standardized data for {decade}: {std_embeddings_dict[decade].shape}")
print(f"Shape of eigenvectors for {decade}: {np.array(eigenvectors_dict[decade]).shape}")

Shape of standardized data for 1980: (263, 300)
Shape of eigenvectors for 1980: (3, 300)


#### After transpose (now matching dims between data and eigenvectors):

In [448]:
print(f"Shape of standardized data for {decade}: {std_embeddings_dict[decade].shape}")
print(f"Shape of eigenvectors for {decade}: {np.array(transpose(eigenvectors_dict[decade])).shape}")

Shape of standardized data for 1980: (263, 300)
Shape of eigenvectors for 1980: (300, 3)


In [449]:
transf_embeddings_dict = {}
for decade in decades:
    transformed_data = (std_embeddings_dict[decade] @ transpose(eigenvectors_dict[decade]))
    transf_embeddings_dict[decade] = transformed_data

In [450]:
print("Keys in transf_embeddings_dict:", transf_embeddings_dict.keys())


Keys in transf_embeddings_dict: dict_keys(['1880', '1890', '1900', '1910', '1920', '1930', '1940', '1950', '1960', '1970', '1980'])


## Step 7: Plot Resulting Data in Reduced Domain

In [451]:
import plotly.graph_objs as go
import plotly.offline as py

In [452]:
def interpolate_points(data1, data2, num_interpolations=10):
    """Interpolates between two sets of points."""
    interpolated_data = []
    for alpha in np.linspace(0, 1, num_interpolations):
        interpolated = (1 - alpha) * data1 + alpha * data2
        interpolated_data.append(interpolated)
    return interpolated_data

In [None]:
frames = []
text_size = 8
marker_size = 3
num_interpolations = 20  # adjust for smoother transitions here

for i in range(len(decades) - 1):
    curr_decade = decades[i]
    next_decade = decades[i + 1]
    curr_data = transf_embeddings_dict[curr_decade]
    next_data = transf_embeddings_dict[next_decade]
    
    # create interpolated frames
    interpolated_frames = interpolate_points(curr_data, next_data, num_interpolations)
    
    for j, interpolated_data in enumerate(interpolated_frames):
        frame = go.Frame(
            data=[
                go.Scatter3d(
                    x=interpolated_data[:, 0],
                    y=interpolated_data[:, 1],
                    z=interpolated_data[:, 2],
                    mode='markers+text',
                    marker=dict(size=marker_size, color='blue'),
                    text=vocab_dict[curr_decade],
                    textfont=dict(size=text_size),
                    name=f"Interpolation {i}-{j}"
                )
            ],
            name=f"Frame {i}-{j}",
        )
        frames.append(frame)

frames.append(
    go.Frame(
        data=[
            go.Scatter3d(
                x=transf_embeddings_dict[decades[-1]][:, 0],
                y=transf_embeddings_dict[decades[-1]][:, 1],
                z=transf_embeddings_dict[decades[-1]][:, 2],
                mode='markers+text',
                marker=dict(size=marker_size, color='blue'),
                text=vocab_dict[decades[-1]],
                name=decades[-1]
            )
        ],
        name=f"Frame {len(decades) - 1}-0"
    )
)

In [460]:
# fixed axis ranges
axes_bounds = 0.2
x_min, x_max = -axes_bounds, axes_bounds
y_min, y_max = -axes_bounds, axes_bounds
z_min, z_max = -axes_bounds, axes_bounds

axes_intervals = [x_min, -axes_bounds/2, 0, axes_bounds/2, x_max]

layout = go.Layout(
    title="3D Visualization of Word Embeddings Over Time",
    margin=dict(l=0, r=0, b=0, t=40),
    scene=dict(
        xaxis=dict(title="PC1", range=[x_min, x_max], tickvals=axes_intervals, autorange=False),
        yaxis=dict(title="PC2", range=[y_min, y_max], tickvals=axes_intervals, autorange=False),
        zaxis=dict(title="PC3", range=[z_min, z_max], tickvals=axes_intervals, autorange=False),
        aspectmode='manual',
        aspectratio=dict(x=1, y=1, z=1),
    ),
    height=800,
    sliders=[{
        "steps": [
            {
                "args": [[f"Frame {i}-0"], {
                    "frame": {"duration": 100, "redraw": True},
                    "mode": "immediate",
                    "transition": {"duration": 100}
                }],
                "label": f"{decades[i]}",
                "method": "animate"
            }
            for i in range(len(decades))
        ],
        "active": 0,
        "x": 0.2,
        "xanchor": "left",
        "y": 0,
        "yanchor": "top",
        "len": 0.5
    }],
    updatemenus=[{
        "buttons": [
            {
                "args": [None, {
                    "frame": {"duration": 100, "redraw": True},
                    "mode": "immediate",
                    "fromcurrent": True
                }],
                "label": "Play",
                "method": "animate"
            },
            {
                "args": [[None], {
                    "frame": {"duration": 0, "redraw": True},
                    "mode": "immediate",
                    "transition": {"duration": 0}
                }],
                "label": "Pause",
                "method": "animate"
            },
            {
                "args": [[f"Frame 0-0"], {  # reset to first interpolated frame
                    "frame": {"duration": 0, "redraw": True},
                    "mode": "immediate",
                    "transition": {"duration": 0}
                }],
                "label": "Reset",
                "method": "animate"
            }
        ],
        "direction": "left",
        "pad": {"r": 10, "t": 87},
        "showactive": False,
        "type": "buttons",
        "x": 0.1,
        "xanchor": "right",
        "y": 0,
        "yanchor": "top"
    }]
)

fig = go.Figure(data=[frames[0].data[0]], layout=layout, frames=frames)
py.iplot(fig)

## Step 9: Quantify Semantic Volatility of Select Words

In [458]:
# calculate shifts for all words across decades
def calculate_shift_magnitudes(embeddings, vocab_dict):
    shift_magnitudes = {}  # dict to store word-shift pairs

    # sort decades for consistency
    decades = sorted(embeddings.keys())

    # traverse all words in vocab dict
    for word in vocab_dict[decades[0]]:  # vocab should be same for all decades given preprocessing step
        word_shifts = []

        # traverse through consecutive decades to calculate shifts for each word
        for i in range(len(decades) - 1):
            curr_decade, next_decade = decades[i], decades[i + 1]

            # check if word exists in both curr and next decades
            if word in vocab_dict[curr_decade] and word in vocab_dict[next_decade]:
                # index of word in each decade's vocabulary
                index_curr = vocab_dict[curr_decade].index(word)
                index_next = vocab_dict[next_decade].index(word)

                # embeddings for word in curr and next decade
                curr_embedding = embeddings[curr_decade][index_curr]
                next_embedding = embeddings[next_decade][index_next]

                # find delta (change in coordinates)
                delta = next_embedding - curr_embedding

                # calculate Euclidean norm (magnitude of shift)
                shift_magnitude = l2_norm(delta)

                # add shift magnitude to list for targeted word
                word_shifts.append(shift_magnitude)

        # add word and its corresponding shift magnitudes to dict
        shift_magnitudes[word] = word_shifts

    return shift_magnitudes


In [459]:
# calculate shift magnitudes for each word
volatility_scores = calculate_shift_magnitudes(transf_embeddings_dict, vocab_dict)

# dict for overall volatility score for each word
overall_volatility_scores = {}

# calculate an overall volatility score for each word (std dev)
for word, shifts in volatility_scores.items():
    if len(shifts) > 0:  # consider words with at least one shift value
        # other options here include using: np.sum(shifts), np.mean(shifts), or np.std(shifts) to quantify volatility
        overall_volatility_scores[word] = np.std(shifts)

# sort words by overall volatility score in descending order
sorted_volatility = sorted(overall_volatility_scores.items(), key=lambda x: x[1], reverse=True)

# output top N most volatile words
N = 300
for word, score in sorted_volatility[:N]:
    print(f"Volatility score for '{word}': {score}")


Volatility score for 'parallel': 5.765737020464597
Volatility score for 'york': 5.757127876540719
Volatility score for 'diameters': 5.502584103673272
Volatility score for 'requirement': 5.374388321555966
Volatility score for 'lens': 5.342515763940987
Volatility score for 'cortex': 5.295525393977922
Volatility score for 'connection': 5.191993675034573
Volatility score for 'basilar': 5.173549093780053
Volatility score for 'suggests': 5.16905369509542
Volatility score for 'motor': 5.113116027699951
Volatility score for 'employer': 5.097111188020316
Volatility score for 'everett': 5.007233462706582
Volatility score for 'northampton': 5.002743581639189
Volatility score for 'johns': 4.99527995562802
Volatility score for 'fro': 4.958404622545052
Volatility score for 'material': 4.955778656260895
Volatility score for 'cosines': 4.8581052142363
Volatility score for 'drunken': 4.82230710344255
Volatility score for 'leslie': 4.795026279166137
Volatility score for 'uterus': 4.794893454866701
Volat