#Import and install libraries, the Diorisis corpus and List's stopwords

In [None]:
!pip install cltk
from cltk.alphabet.grc.beta_to_unicode import BetaCodeReplacer
from cltk.alphabet.grc.grc import tonos_oxia_converter
from glob import glob
from xml.etree.ElementTree import parse
import re
import os
import os.path
import pandas as pd

# Fetch List's stopwords and the Diorisis Corpus

!git clone https://github.com/lisni946/ijl_greek_kinship_terms
!wget -O Diorisis.zip "https://figshare.com/ndownloader/files/11296247"
!unzip Diorisis.zip -d /content/corpus
# Load the additional stop words shipped with the cloned repository.
new_stops = "/content/ijl_greek_kinship_terms/new_stops.csv"

# Use a context manager so the file handle is closed once the CSV is parsed.
with open(new_stops) as f:
    X = pd.read_csv(f, delimiter=",")

# Select the column explicitly: a missing 'Add Stops' header now raises
# KeyError instead of silently producing a column of NaN values (which would
# disable stop-word filtering without any warning).
df = X[['Add Stops']]

# Plain Python list of stop words, used for membership tests in later cells.
new_list = df['Add Stops'].tolist()

#Extract and save metadata from the Diorisis corpus (genre, subgenre, year)

In [None]:
import pickle
import glob
from xml.etree.ElementTree import parse

def save_metadata(metadata, filename):
    """Serialize ``metadata`` with pickle and write it to ``filename``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.

    Args:
        metadata: Any picklable object.
        filename: Path of the file to write.
    """
    with open(filename, mode="wb") as sink:
        pickle.Pickler(sink).dump(metadata)

def _metadata_text(root, path, source):
    """Return the text of the first element matching ``path`` below ``root``.

    Raises:
        ValueError: if no element matches; the message names ``source`` so the
            offending corpus file can be identified.
    """
    node = root.find(path)
    if node is None:
        raise ValueError(f"{source}: no element matches '{path}'")
    return node.text


# Collect one (genre, subgenre, year) tuple per corpus file.
# NOTE(review): the training cells call glob.glob() again and zip the result
# with this list, so they rely on glob returning the files in the same order.
xml_files = glob.glob('/content/corpus/*.xml')
corpus = []  # not used in this cell; kept in case another cell refers to it
metadata = []
for xml in xml_files:
    # Binary mode lets the XML parser decode the file according to the
    # document's own encoding declaration rather than the locale default.
    with open(xml, 'rb') as x:
        tree = parse(x)
    root = tree.getroot()

    genre = _metadata_text(root, './/xenoData/genre', xml)
    subgenre = _metadata_text(root, './/xenoData/subgenre', xml)
    date_text = _metadata_text(root, './/profileDesc/creation/date', xml)
    try:
        year = int(date_text)
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"{xml}: creation date {date_text!r} is not an integer year"
        ) from err

    metadata.append((genre, subgenre, year))

# Save metadata
metadata_file = "metadata.pkl"
save_metadata(metadata, metadata_file)

#Set the search word, define corpus A and corpus B, and set the number of models to be trained

In [None]:
# Word whose nearest neighbours are queried in the results cells; it is also
# the first component of every saved model's file name.
search_word = 'καρδία' # Set search_word (used for both corpora)

# Define corpus A
# A genre or subgenre of None disables that filter; the year bounds are
# inclusive. NOTE(review): negative years presumably denote BCE - confirm.
desired_genre_A = "Religion"
desired_subgenre_A = None
desired_year_start_A = -800
desired_year_end_A = 365

# Define corpus B (same conventions as corpus A)
desired_genre_B = "Technical"
desired_subgenre_B = "Medicine"
desired_year_start_B = -800
desired_year_end_B = 170

# Set the number of models trained per corpus
num_models_A = 30 #Corpus A
num_models_B = 30 #Corpus B

#Train models on corpus A

In [None]:
from __future__ import absolute_import, division, print_function
import os
import glob
import multiprocessing
import gensim.models.word2vec as w2v
import sklearn.manifold
import time
from xml.etree.ElementTree import parse
from collections import Counter
import pickle
import random
import numpy as np
from typing import Optional, List, Tuple

# Generate randomized parameters
def generate_random_params():
    """Draw a random (vector size, context window) pair for one model.

    Returns:
        A tuple ``(num_features, context_size)``: ``num_features`` is an
        integer in [100, 300] and ``context_size`` an integer in [5, 10],
        both ranges inclusive.
    """
    # Tuple elements are evaluated left to right: vector size first, then window.
    return random.randint(100, 300), random.randint(5, 10)

# List the corpus files, then reload the per-file (genre, subgenre, year)
# tuples that the metadata cell pickled to 'metadata.pkl'.
xml_files = glob.glob('/content/corpus/*.xml')

with open('metadata.pkl', mode='rb') as f:
    metadata = pickle.Unpickler(f).load()

# Define the BetaCodeReplacer class
class BetaCodeReplacer:
    """Placeholder whose constructor accepts two optional pattern lists and ignores them.

    NOTE(review): defining this name here rebinds the ``BetaCodeReplacer``
    imported from cltk at the top of the notebook; this class has no
    conversion methods.
    """

    def __init__(
        self,
        pattern: Optional[List[Tuple[str, str]]] = None,
        reorder_pattern: Optional[List[Tuple[str, str]]] = None,
    ):
        """Accept both arguments without storing anything on the instance."""

beta_code_replace = BetaCodeReplacer()  # NOTE(review): not used anywhere in this cell

# zip() stops at the shorter input, so a length mismatch would silently drop
# files or pair them with the wrong metadata. Fail loudly instead.
if len(xml_files) != len(metadata):
    raise ValueError(
        f"{len(xml_files)} XML files but {len(metadata)} metadata entries; "
        "re-run the metadata cell"
    )

# Build the stop-word set once: set membership replaces a list scan per token.
stop_words_A = set(new_list)

# corpus_A is a list of sentences, each sentence a list of tokens.
corpus_A = []

for xml, (genre, subgenre, year) in zip(xml_files, metadata):
    # Keep only files that satisfy every configured filter (None = no filter;
    # year bounds are inclusive).
    if desired_genre_A is not None and genre != desired_genre_A:
        continue
    if desired_subgenre_A is not None and subgenre != desired_subgenre_A:
        continue
    if not desired_year_start_A <= year <= desired_year_end_A:
        continue

    # Binary mode lets the XML parser decode the file according to the
    # document's own encoding declaration rather than the locale default.
    with open(xml, 'rb') as x:
        tree = parse(x)
    root = tree.getroot()

    for sentence in root.iter('sentence'):
        sentences = []  # tokens of the current sentence
        for word in sentence.iter('word'):
            for lemma in word.iter('lemma'):
                entry = lemma.get('entry')
                if entry is None:
                    # No lemma entry: fall back to the word's 'form' attribute.
                    # The fallback is not stop-word filtered (as before).
                    # NOTE(review): 'form' may use a different encoding than
                    # the lemma entries - confirm against the corpus.
                    entry = word.get('form')
                    if entry is not None:  # never add None as a token
                        sentences.append(entry)
                elif tonos_oxia_converter(entry) not in stop_words_A:
                    sentences.append(entry)
        if sentences:
            corpus_A.append(sentences)

# Train num_models_A skip-gram Word2Vec models on corpus A. Each model gets a
# randomly drawn vector size and window plus its own seed, and is saved to the
# current working directory.
models_A = []

# Loop-invariant values are computed once. Defining token_count_A here also
# keeps it available to the results cell when num_models_A is 0.
token_count_A = sum(len(sentence) for sentence in corpus_A)
downsampling = 1e-3
num_workers = multiprocessing.cpu_count()
min_word_count = 10

for i in range(num_models_A):
    # NOTE: the `random` module is not seeded here, so the drawn sizes differ
    # between runs even though each model's own seed is fixed.
    num_features, context_size = generate_random_params()

    seed = i

    greek2vec = w2v.Word2Vec(
        sg=1,  # skip-gram
        seed=seed,
        workers=num_workers,
        vector_size=num_features,
        min_count=min_word_count,
        window=context_size,
        sample=downsampling
    )
    greek2vec.build_vocab(corpus_A)

    print('Model {}/{} - Num Features: {}, Context Size: {}, Tokens: {:,}'.format(i+1, num_models_A, num_features, context_size, token_count_A))

    greek2vec.train(corpus_A, total_examples=greek2vec.corpus_count, epochs=50)
    models_A.append(greek2vec)

    greek2vec.save(f"{search_word}_{desired_genre_A}_{desired_subgenre_A}_{desired_year_start_A}_to_{desired_year_end_A}_model_{i+1}.model")

#Train models on corpus B

In [None]:
from __future__ import absolute_import, division, print_function
import os
import glob
import multiprocessing
import gensim.models.word2vec as w2v
import sklearn.manifold
import time
from xml.etree.ElementTree import parse
from collections import Counter
import pickle
import random
import numpy as np
from typing import Optional, List, Tuple

# Generate randomized parameters
def generate_random_params():
    """Draw a random (vector size, context window) pair for one model.

    Returns:
        A tuple ``(num_features, context_size)``: ``num_features`` is an
        integer in [100, 300] and ``context_size`` an integer in [5, 10],
        both ranges inclusive.
    """
    # Tuple elements are evaluated left to right: vector size first, then window.
    return random.randint(100, 300), random.randint(5, 10)

# List the corpus files, then reload the per-file (genre, subgenre, year)
# tuples that the metadata cell pickled to 'metadata.pkl'.
xml_files = glob.glob('/content/corpus/*.xml')

with open('metadata.pkl', mode='rb') as f:
    metadata = pickle.Unpickler(f).load()

# Define the BetaCodeReplacer class
class BetaCodeReplacer:
    """Placeholder whose constructor accepts two optional pattern lists and ignores them.

    NOTE(review): defining this name here rebinds the ``BetaCodeReplacer``
    imported from cltk at the top of the notebook; this class has no
    conversion methods.
    """

    def __init__(
        self,
        pattern: Optional[List[Tuple[str, str]]] = None,
        reorder_pattern: Optional[List[Tuple[str, str]]] = None,
    ):
        """Accept both arguments without storing anything on the instance."""

beta_code_replace = BetaCodeReplacer()  # NOTE(review): not used anywhere in this cell

# zip() stops at the shorter input, so a length mismatch would silently drop
# files or pair them with the wrong metadata. Fail loudly instead.
if len(xml_files) != len(metadata):
    raise ValueError(
        f"{len(xml_files)} XML files but {len(metadata)} metadata entries; "
        "re-run the metadata cell"
    )

# Build the stop-word set once: set membership replaces a list scan per token.
stop_words_B = set(new_list)

# corpus_B is a list of sentences, each sentence a list of tokens.
corpus_B = []

for xml, (genre, subgenre, year) in zip(xml_files, metadata):
    # Keep only files that satisfy every configured filter (None = no filter;
    # year bounds are inclusive).
    if desired_genre_B is not None and genre != desired_genre_B:
        continue
    if desired_subgenre_B is not None and subgenre != desired_subgenre_B:
        continue
    if not desired_year_start_B <= year <= desired_year_end_B:
        continue

    # Binary mode lets the XML parser decode the file according to the
    # document's own encoding declaration rather than the locale default.
    with open(xml, 'rb') as x:
        tree = parse(x)
    root = tree.getroot()

    for sentence in root.iter('sentence'):
        sentences = []  # tokens of the current sentence
        for word in sentence.iter('word'):
            for lemma in word.iter('lemma'):
                entry = lemma.get('entry')
                if entry is None:
                    # No lemma entry: fall back to the word's 'form' attribute.
                    # The fallback is not stop-word filtered (as before).
                    # NOTE(review): 'form' may use a different encoding than
                    # the lemma entries - confirm against the corpus.
                    entry = word.get('form')
                    if entry is not None:  # never add None as a token
                        sentences.append(entry)
                elif tonos_oxia_converter(entry) not in stop_words_B:
                    sentences.append(entry)
        if sentences:
            corpus_B.append(sentences)

# Train num_models_B skip-gram Word2Vec models on corpus B. Each model gets a
# randomly drawn vector size and window plus its own seed, and is saved to the
# current working directory.
models_B = []

# Loop-invariant values are computed once. Defining token_count_B here also
# keeps it available to the results cell when num_models_B is 0.
token_count_B = sum(len(sentence) for sentence in corpus_B)
downsampling = 1e-3
num_workers = multiprocessing.cpu_count()
min_word_count = 10

for i in range(num_models_B):
    # NOTE: the `random` module is not seeded here, so the drawn sizes differ
    # between runs even though each model's own seed is fixed.
    num_features, context_size = generate_random_params()

    seed = i

    greek2vec = w2v.Word2Vec(
        sg=1,  # skip-gram
        seed=seed,
        workers=num_workers,
        vector_size=num_features,
        min_count=min_word_count,
        window=context_size,
        sample=downsampling
    )
    greek2vec.build_vocab(corpus_B)

    print('Model {}/{} - Num Features: {}, Context Size: {}, Tokens: {:,}'.format(i+1, num_models_B, num_features, context_size, token_count_B))

    greek2vec.train(corpus_B, total_examples=greek2vec.corpus_count, epochs=50)
    models_B.append(greek2vec)

    greek2vec.save(f"{search_word}_{desired_genre_B}_{desired_subgenre_B}_{desired_year_start_B}_to_{desired_year_end_B}_model_{i+1}.model")

#Print results for models trained on corpus A

In [None]:
print(f"Results for '{search_word}' in genre '{desired_genre_A}', subgenre '{desired_subgenre_A}', ({desired_year_start_A} to {desired_year_end_A})")
print(f"Tokens: {token_count_A}")

similar_words_lists_A = []

# Ask every trained model for the 100 nearest neighbours of the search word.
# most_similar raises KeyError if the search word is not in the model's vocabulary.
for model in models_A:
    similar_words_A = model.wv.most_similar(search_word, topn=100)
    similar_words_lists_A.append(similar_words_A)

# Every word that appears in at least one model's neighbour list
all_words_A = set(word for similar_words_A in similar_words_lists_A for word, _ in similar_words_A)

# Group the similarity scores by word in a single pass, instead of rescanning
# every neighbour list once per candidate word.
scores_by_word_A = {}
for similar_words_A in similar_words_lists_A:
    for w, score in similar_words_A:
        scores_by_word_A.setdefault(w, []).append(score)

# Corpus frequency of the search word and of every neighbour candidate.
# NOTE: word_freq is reassigned by the corpus B results cell.
word_freq = Counter(
    word
    for sentence in corpus_A
    for word in sentence
    if word in all_words_A or word == search_word
)

# word -> (mean similarity, share of models listing the word, corpus frequency)
num_queried_A = len(similar_words_lists_A)
similarities_A = {}
for word in all_words_A:
    scores = scores_by_word_A[word]  # non-empty: the word came from these lists
    mean_score = np.mean(scores)
    coverage = len(scores) / num_queried_A  # proportion of models
    freq = word_freq[word]
    similarities_A[word] = (mean_score, coverage, freq)

# Sort by mean similarity, highest first
sorted_words_A = sorted(similarities_A.items(), key=lambda x: x[1][0], reverse=True)

# Print the frequency of the search_word
print(f"\nFrequency of '{search_word}' in the corpus: {word_freq[search_word]}")

# Print the top 40 words: mean cosine similarity across the models that list
# the word, coverage (share of models whose top-100 list contains it), and
# frequency in the defined corpus.
print("Top words with the highest average cosine similarity scores, their coverage, and frequency in the corpus:")
for word, (mean_score, coverage, freq) in sorted_words_A[:40]:
    print(f"{word}: Average score: {mean_score:.3f}, Coverage: {coverage:.2%}, Frequency: {freq}")

#Print results for models trained on corpus B

In [None]:
print(f"Results for '{search_word}' in genre '{desired_genre_B}', subgenre '{desired_subgenre_B}', ({desired_year_start_B} to {desired_year_end_B})")

print(f"Tokens: {token_count_B}")

similar_words_lists_B = []

# Ask every trained model for the 100 nearest neighbours of the search word.
# most_similar raises KeyError if the search word is not in the model's vocabulary.
for model in models_B:
    similar_words_B = model.wv.most_similar(search_word, topn=100)
    similar_words_lists_B.append(similar_words_B)

# Every word that appears in at least one model's neighbour list
all_words_B = set(word for similar_words_B in similar_words_lists_B for word, _ in similar_words_B)

# Group the similarity scores by word in a single pass, instead of rescanning
# every neighbour list once per candidate word.
scores_by_word_B = {}
for similar_words_B in similar_words_lists_B:
    for w, score in similar_words_B:
        scores_by_word_B.setdefault(w, []).append(score)

# Corpus frequency of the search word and of every neighbour candidate.
# NOTE: this reassigns the word_freq computed by the corpus A results cell.
word_freq = Counter(
    word
    for sentence in corpus_B
    for word in sentence
    if word in all_words_B or word == search_word
)

# word -> (mean similarity, share of models listing the word, corpus frequency)
# NOTE: the corpus A cell names its equivalent dict `similarities_A`; the name
# `similarities` is kept here in case other cells refer to it.
num_queried_B = len(similar_words_lists_B)
similarities = {}
for word in all_words_B:
    scores = scores_by_word_B[word]  # non-empty: the word came from these lists
    mean_score = np.mean(scores)
    coverage = len(scores) / num_queried_B  # proportion of models
    freq = word_freq[word]
    similarities[word] = (mean_score, coverage, freq)

# Sort by mean similarity, highest first
sorted_words_B = sorted(similarities.items(), key=lambda x: x[1][0], reverse=True)

# Print the frequency of the search_word separately
print(f"\nFrequency of '{search_word}' in the corpus: {word_freq[search_word]}")

# Print the top 40 words: mean cosine similarity across the models that list
# the word, coverage (share of models whose top-100 list contains it), and
# frequency in the defined corpus.
print("Top words with the highest average cosine similarity scores, their coverage, and frequency in the corpus:")
for word, (mean_score, coverage, freq) in sorted_words_B[:40]:
    print(f"{word}: Average score: {mean_score:.3f}, Coverage: {coverage:.2%}, Frequency: {freq}")

#Compare corpus A with corpus B

In [None]:
# Words that appear among the neighbours of the search word in both corpora.
words_A_set = {word for word, data in sorted_words_A}
words_B_set = {word for word, data in sorted_words_B}

# Common words
common_words = words_A_set.intersection(words_B_set)
print("Common words:", common_words)

# Index the (mean score, coverage, frequency) tuples by word once, so each
# common word is a dictionary lookup rather than a scan of both sorted lists.
stats_by_word_A = dict(sorted_words_A)
stats_by_word_B = dict(sorted_words_B)

for word in common_words:
    # Every common word is present in both result lists by construction.
    data_A = stats_by_word_A[word]
    data_B = stats_by_word_B[word]
    print(f"Word: {word}")
    print(f"\tCorpus A - Average score: {data_A[0]:.3f}, Coverage: {data_A[1]:.2%}, Frequency: {data_A[2]}")
    print(f"\tCorpus B - Average score: {data_B[0]:.3f}, Coverage: {data_B[1]:.2%}, Frequency: {data_B[2]}")

#Download the models trained on corpus A

In [None]:
from google.colab import files
import os
import re
import zipfile

# Directory where your models are saved
# Directory where your models are saved
# NOTE(review): the training cell saves with a relative file name, so this
# assumes the notebook's working directory is /content - confirm.
directory = "/content"

# Pattern to match filenames. The configuration values are escaped so that any
# regex metacharacters in them are matched literally.
model_prefix_A = f"{search_word}_{desired_genre_A}_{desired_subgenre_A}_{desired_year_start_A}_to_{desired_year_end_A}_model_"
pattern_A = re.compile(re.escape(model_prefix_A) + r"\d+\.model")

# List all model files in the directory. match() anchors only at the start of
# the name, so files whose names continue after ".model" are included as well.
# NOTE(review): gensim may store large arrays in companion files next to the
# .model file; the prefix match is kept so those would still be zipped - confirm.
model_files_A = [f for f in os.listdir(directory) if pattern_A.match(f)]

# Zip all model files, stored under their bare file names
zip_file_path_A = "/content/modelscorpusA.zip"
with zipfile.ZipFile(zip_file_path_A, 'w') as zipf:
    for model_file in model_files_A:
        zipf.write(os.path.join(directory, model_file), model_file)

# Download the zip file
files.download(zip_file_path_A)

#Download the models trained on corpus B

In [None]:
from google.colab import files
import os
import re
import zipfile

# Directory where your models are saved
# Directory where your models are saved
# NOTE(review): the training cell saves with a relative file name, so this
# assumes the notebook's working directory is /content - confirm.
directory = "/content"

# Pattern to match filenames. The configuration values are escaped so that any
# regex metacharacters in them are matched literally.
model_prefix_B = f"{search_word}_{desired_genre_B}_{desired_subgenre_B}_{desired_year_start_B}_to_{desired_year_end_B}_model_"
pattern_B = re.compile(re.escape(model_prefix_B) + r"\d+\.model")

# List all model files in the directory. match() anchors only at the start of
# the name, so files whose names continue after ".model" are included as well.
# NOTE(review): gensim may store large arrays in companion files next to the
# .model file; the prefix match is kept so those would still be zipped - confirm.
model_files_B = [f for f in os.listdir(directory) if pattern_B.match(f)]

# Zip all model files, stored under their bare file names
zip_file_path_B = "/content/modelscorpusB.zip"
with zipfile.ZipFile(zip_file_path_B, 'w') as zipf:
    for model_file in model_files_B:
        zipf.write(os.path.join(directory, model_file), model_file)

# Download the zip file
files.download(zip_file_path_B)