## Activate virtual environment

In [2]:
import os

path='/work/NLP_exam'
os.chdir(path)

# Make the activate.sh file executable
!chmod +x activate.sh

# Now run the script
!./activate.sh

Installed kernelspec virt_env in /home/ucloud/.local/share/jupyter/kernels/virt_env
Done! Remember changing the kernel in Jupyter.


## Import packages

In [1]:
import os
import json
from gensim.models.word2vec import Word2Vec

## Load in json file

In [2]:
# Specify the path to your JSON file
file_path = '/work/NLP_exam/tokenized_data_by_year.json'

# Load the JSON file
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

## Checking that the dictionary structure works as it should

In [3]:
# Print the keys (years)
print("Years available in the dataset:")
print(list(data.keys())[:10])

# Get all the years (keys)
years = list(data.keys())
print(len(years)) # check that all the data is there

Years available in the dataset:
['1973', '1992', '1936', '1968', '1995', '1956', '1997', '1987', '1964', '1996']
121


## Look at the amount of books for different years

In [5]:
print(len(data['1998']))  # Number of tokenized books in 1998
print(len(data['1999']))  # Number of tokenized books in 1999
print(len(data['2000']))  # Number of tokenized books in 2000

print(len(data['1900']))  # Number of tokenized books in 1900
print(len(data['1901']))  # Number of tokenized books in 1901
print(len(data['1902']))  # Number of tokenized books in 1902
print(len(data['1903']))  # Number of tokenized books in 1903
print(len(data['1904']))  # Number of tokenized books in 1904
print(len(data['1905']))  # Number of tokenized books in 1905

318
313
344
33
25
41
38
26
35


## Make a function that trains and saves word embedding models

In [None]:
# Define output directory
output_dir = "/work/NLP_exam/models_1900_2000"

# Define time periods (for each year, 5-year window from that year)
time_periods = [(year, year + 5) for year in range(1900, 2000)]

# Word2Vec parameters
num_features = 300  # Word vector dimensionality
min_word_count = 5  # Minimum word count
context = 10  # Context window size
downsampling = 10e-5  # Downsample setting for frequent words

# Function to train and save models
def train_and_save_models(data, time_periods, output_dir, **word2vec_params):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for start_year, end_year in time_periods:
        # Combine all tokenized texts for the time period
        combined_texts = []
        for year in range(start_year, end_year + 1):
            if str(year) in data:  # Ensure the year exists in the data
                combined_texts.extend(data[str(year)])  # Add all books from the year
        
        if not combined_texts:  # Skip if no data is found for the time period
            print(f"No data available for the period {start_year}-{end_year}. Skipping.")
            continue

        print(f"Number of books for {start_year}-{end_year}: {len(combined_texts)}")

        # Train the Word2Vec model
        model = Word2Vec(sentences=combined_texts, **word2vec_params)
        
        # Save the model
        model_name = f"{start_year}.w2v"
        model.save(os.path.join(output_dir, model_name))
        print(f"Model for {start_year} saved as {model_name}")

# Train models for the specified time periods
train_and_save_models(
    data,
    time_periods,
    output_dir,
    vector_size=num_features,
    min_count=min_word_count,
    window=context,
    sg=1,  # Use Skip-gram
    workers=32 
)


Number of books for 1900-1905: 198


## Load in model to check if we can look at word similarity

In [33]:
# Load the full Word2Vec model
model_1900 = Word2Vec.load("/work/NLP_exam/models/1900.w2v") 

# Calculate similarity between two words
similarity_he = model_1900.wv.similarity("he", "money")
print(f"Similarity between 'he' and 'money': {similarity_he}")
similarity_she = model_1900.wv.similarity("she", "money")
print(f"Similarity between 'she' and 'money': {similarity_she}")


Similarity between 'he' and 'money': 0.36406731605529785
Similarity between 'she' and 'money': 0.32877546548843384


## Check the frequency of these words

In [37]:
def get_word_frequency(model, word):
    if word in model.wv.key_to_index:  # Check if the word is in the model's vocabulary
        return model.wv.get_vecattr(word, "count")
    else:
        return None

# Check the frequency of 'king' and 'queen'
he_freq = get_word_frequency(model_1900, 'he')
she_freq = get_word_frequency(model_1900, 'she')

# Print the frequencies
if he_freq is not None:
    print(f"Frequency of 'he': {he_freq}")


if she_freq is not None:
    print(f"Frequency of 'she': {she_freq}")


Frequency of 'he': 224492
Frequency of 'she': 134317
