## Activate virtual environment

In [1]:
import os

# Project root on the remote machine; the later cells use absolute paths under it.
path='/work/NLP_exam'
os.chdir(path)

# Make the activate.sh file executable
!chmod +x activate.sh

# Now run the script
# NOTE(review): `!` runs the script in a subshell, so it cannot switch the
# running kernel; per its printed output it installs a `virt_env` kernelspec
# and the kernel must then be changed manually in Jupyter.
!./activate.sh

Installed kernelspec virt_env in /home/ucloud/.local/share/jupyter/kernels/virt_env
Done! Remember changing the kernel in Jupyter.


## Import packages

In [8]:
import os
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

## Defining model paths

In [2]:
# Paths
# Input: directory that is scanned for `.w2v` model files to update.
models_path = "/work/NLP_exam/aligned_models"
# Output: directory where the models with replaced "he"/"she" vectors are saved.
output_path = "/work/NLP_exam/he_she_aligned_models"

## Defining words for the he and she vectors

In [5]:
# Word lists
# Words whose vectors are averaged to build the composite "he" vector.
he_words = [
    "he",
    "father",
    "grandpa",
    "son",
    "man",
    "men",
    "sir",
    "his",
    "him",
]
# Words whose vectors are averaged to build the composite "she" vector.
she_words = [
    "she",
    "mother",
    "grandma",
    "daughter",
    "woman",
    "women",
    "madam",
    "her",
    "hers",
]

## Update he and she vectors

In [6]:
# Function to compute the average vector
def compute_average_vector(model, word_list):
    """Return the element-wise mean of the vectors of the words found in `model`.

    Args:
        model: Object supporting `word in model` and `model[word]`
            (the notebook passes a model's `.wv`).
        word_list: Words to average; words absent from `model` are ignored.

    Returns:
        The mean of the found vectors, taken along axis 0.

    Raises:
        ValueError: If none of the words in `word_list` is present in `model`.
    """
    found = [model[token] for token in word_list if token in model]
    if not found:
        raise ValueError(f"None of the words in {word_list} exist in the model.")
    return np.mean(found, axis=0)

# Process each model
# Make sure the output directory exists before saving anything; saving into a
# missing directory would fail on the very first model.
os.makedirs(output_path, exist_ok=True)

for model_file in sorted(os.listdir(models_path)):
    # Only files ending in ".w2v" are treated as models; everything else in
    # the directory is skipped.
    if not model_file.endswith(".w2v"):
        continue

    model_path = os.path.join(models_path, model_file)

    # Load the model
    model = Word2Vec.load(model_path)
    word_vectors = model.wv  # Access the word vectors

    try:
        # Compute both averages BEFORE overwriting anything. The word lists
        # contain "he"/"she" themselves, so each average includes the model's
        # original vector for that word (when present).
        he_vector = compute_average_vector(word_vectors, he_words)
        she_vector = compute_average_vector(word_vectors, she_words)

        # Update the vectors in the model
        word_vectors["he"] = he_vector
        word_vectors["she"] = she_vector

        # Save the updated model under the same file name in the output directory
        updated_model_path = os.path.join(output_path, model_file)
        model.save(updated_model_path)
        print(f"Updated and saved model: {model_file}")

    except ValueError as e:
        # Raised by compute_average_vector when none of the listed words is
        # in this model's vocabulary; report it and move on to the next model.
        print(f"Skipping model {model_file}: {e}")

Updated and saved model: 1900.w2v
Updated and saved model: 1901.w2v
Updated and saved model: 1902.w2v
Updated and saved model: 1903.w2v
Updated and saved model: 1904.w2v
Updated and saved model: 1905.w2v
Updated and saved model: 1906.w2v
Updated and saved model: 1907.w2v
Updated and saved model: 1908.w2v
Updated and saved model: 1909.w2v
Updated and saved model: 1910.w2v
Updated and saved model: 1911.w2v
Updated and saved model: 1912.w2v
Updated and saved model: 1913.w2v
Updated and saved model: 1914.w2v
Updated and saved model: 1915.w2v
Updated and saved model: 1916.w2v
Updated and saved model: 1917.w2v
Updated and saved model: 1918.w2v
Updated and saved model: 1919.w2v
Updated and saved model: 1920.w2v
Updated and saved model: 1921.w2v
Updated and saved model: 1922.w2v
Updated and saved model: 1923.w2v
Updated and saved model: 1924.w2v
Updated and saved model: 1925.w2v
Updated and saved model: 1926.w2v
Updated and saved model: 1927.w2v
Updated and saved model: 1928.w2v
Updated and sa

## Compare vectors before and after updating he and she vectors

In [7]:
# Word vectors of the original aligned models (before "he"/"she" were replaced).
aligned_model_1900 = Word2Vec.load("/work/NLP_exam/aligned_models/1900.w2v").wv
# Fixed typo: the attribute holding the word vectors is `.wv`; `.vw` does not
# exist and raised AttributeError.
aligned_model_1999 = Word2Vec.load("/work/NLP_exam/aligned_models/1999.w2v").wv

# Word vectors of the same two years after the "he"/"she" update.
updated_aligned_model_1900 = Word2Vec.load("/work/NLP_exam/he_she_aligned_models/1900.w2v").wv
updated_aligned_model_1999 = Word2Vec.load("/work/NLP_exam/he_she_aligned_models/1999.w2v").wv

In [13]:
# Function to compare vectors
# We can't use gensim's similarity function as we want to look at similarity between models
def compare_word_vectors(word, model1, model2):
    """Cosine similarity between `word`'s vector in `model1` and in `model2`.

    Args:
        word: Key to look up in both models.
        model1: First object indexable by word.
        model2: Second object indexable by word.

    Returns:
        The single similarity value, or None (after printing a message) when
        a KeyError is raised, e.g. because the word is missing from a model.
    """
    try:
        first_vec, second_vec = model1[word], model2[word]
        # cosine_similarity works on 2-D inputs and returns a matrix, so wrap
        # each vector in a list and pick out the only entry.
        score_matrix = cosine_similarity([first_vec], [second_vec])
        return score_matrix[0][0]
    except KeyError as e:
        print(f"Word not found in one of the models: {e}")
        return None

# Similarity of the "he" and "she" vectors before vs. after the update,
# for the 1900 and 1999 models.
he_vector_comparison_1900 = compare_word_vectors("he", aligned_model_1900, updated_aligned_model_1900)
he_vector_comparison_1999 = compare_word_vectors("he", aligned_model_1999, updated_aligned_model_1999)
she_vector_comparison_1900 = compare_word_vectors("she", aligned_model_1900, updated_aligned_model_1900)
she_vector_comparison_1999 = compare_word_vectors("she", aligned_model_1999, updated_aligned_model_1999)

# One value per line, in the order: he 1900, he 1999, she 1900, she 1999.
print(
    he_vector_comparison_1900,
    he_vector_comparison_1999,
    she_vector_comparison_1900,
    she_vector_comparison_1999,
    sep="\n",
)

0.7539094
0.7852283
0.8005604
0.7702599
