In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime; the data paths used by the
# later cells all live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
### Note that the reference year is year one (2018)
from gensim.models import FastText

# Directory (with trailing slash) holding one saved FastText model per year
file_path = '/content/drive/MyDrive/SLANGuage_Data/embeddings/'


def _load_snapshot(stamp):
    """Load the FastText model saved as ``<stamp>.model`` under ``file_path``."""
    return FastText.load(f"{file_path}{stamp}.model")


# One model per yearly snapshot, loaded in chronological order
model_2018 = _load_snapshot("2018-06-07")
model_2019 = _load_snapshot("2019-06-01")
model_2020 = _load_snapshot("2020-06-10")
model_2021 = _load_snapshot("2021-06-12")
model_2022 = _load_snapshot("2022-06-18")

In [6]:
# Vocabulary of each yearly model, as the set of keys in its key -> index map
vocab_2018 = set(model_2018.wv.key_to_index)
vocab_2019 = set(model_2019.wv.key_to_index)
vocab_2020 = set(model_2020.wv.key_to_index)
vocab_2021 = set(model_2021.wv.key_to_index)
vocab_2022 = set(model_2022.wv.key_to_index)

# Keep only the words that appear in every one of the five models
shared_vocab = set.intersection(
    vocab_2018, vocab_2019, vocab_2020, vocab_2021, vocab_2022
)


In [7]:
from scipy.linalg import orthogonal_procrustes
import numpy as np

def align_to_reference(ref_model, target_model, shared_vocab):
    """Rotate ``target_model``'s embeddings into ``ref_model``'s vector space.

    An orthogonal matrix ``R`` is fitted with Orthogonal Procrustes on the
    words in ``shared_vocab`` so that ``target @ R`` is as close as possible
    to the reference vectors. Every vector array stored on
    ``target_model.wv`` is then multiplied by that same ``R``.

    Args:
        ref_model: Model defining the reference space; only
            ``ref_model.wv[word]`` is read.
        target_model: Model to align. It is modified in place.
        shared_vocab: Iterable of words that both models can look up.

    Returns:
        The same ``target_model`` object, after rotation.

    Raises:
        ValueError: If ``shared_vocab`` is empty.
    """
    # Freeze one ordering so that row i of both matrices is the same word;
    # sorting also makes the fit independent of set iteration order.
    anchor_words = sorted(shared_vocab)
    if not anchor_words:
        raise ValueError("shared_vocab is empty; cannot fit an alignment")

    # Build matrices of embeddings for the shared vocabulary
    ref_matrix = np.array([ref_model.wv[word] for word in anchor_words])
    tgt_matrix = np.array([target_model.wv[word] for word in anchor_words])

    # R minimises ||tgt_matrix @ R - ref_matrix|| over orthogonal matrices
    R, _ = orthogonal_procrustes(tgt_matrix, ref_matrix)

    # NOTE(review): in gensim 4.x a FastText `wv` keeps whole-word vectors in
    # `vectors_vocab` and n-gram vectors in `vectors_ngrams`, and derives
    # `vectors` from those two (it is recomputed on load rather than saved) --
    # confirm against the installed gensim version. All three must therefore
    # be rotated by the same R, otherwise a saved-and-reloaded model mixes
    # rotated and unrotated components.
    wv = target_model.wv
    for attr in ("vectors", "vectors_vocab", "vectors_ngrams"):
        stored = getattr(wv, attr, None)
        if stored is None:
            # Models without subword information simply lack these arrays
            continue
        # np.dot returns a new array, so arrays that alias each other are each
        # rotated exactly once; keep the original dtype.
        rotated = np.dot(stored, R).astype(stored.dtype, copy=False)
        setattr(wv, attr, rotated)
    return target_model

# Bring every later year into the 2018 space; 2018 itself is the reference
# and is left untouched.
model_2019, model_2020, model_2021, model_2022 = [
    align_to_reference(model_2018, yearly_model, shared_vocab)
    for yearly_model in (model_2019, model_2020, model_2021, model_2022)
]


In [8]:
import os

# Set output directory (with trailing slash) and make sure it exists, so the
# first save on a fresh Drive does not fail on a missing folder.
output_dir = '/content/drive/MyDrive/SLANGuage_Data/aligned_embeddings/'
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): the 2019-2021 output names carry "-06-18" although those
# models were loaded from the 2019-06-01, 2020-06-10 and 2021-06-12 snapshots.
# The names are kept unchanged so existing consumers still find the files --
# confirm whether the dates are intentional.
aligned_outputs = [
    ("2018-06-07", model_2018),  # reference year, saved as loaded
    ("2019-06-18", model_2019),
    ("2020-06-18", model_2020),
    ("2021-06-18", model_2021),
    ("2022-06-18", model_2022),
]

# Save aligned models
for out_stamp, aligned_model in aligned_outputs:
    aligned_model.save(f"{output_dir}{out_stamp}-aligned.model")