In [None]:
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
#!pip install lingpy
#import lingpy
#from lingpy import ipa2tokens
import re

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Uncomment the next line to force CPU execution while debugging.
#device = torch.device("cpu") #For debugging
print(f"Using device: {device}")
# Make `device` the default for newly created tensors, so tensors built later
# without an explicit `device=` argument are allocated there as well.
torch.set_default_device(device)

In [None]:
# Fixed seed applied to the global torch and NumPy random number generators.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

## IELEX

In [None]:
import pandas as pd
from itertools import combinations

# Keep only meaning / phonological form / cognate class and drop incomplete rows.
df = pd.read_csv("data/ielexData.csv")[['Meaning', 'Phonological Form', 'cc']].dropna()
df.columns = ['meaning', 'word', 'cognate_class']

# One (word1, word2, label) tuple per unordered pair of entries sharing a meaning;
# label is 1 when both entries carry the same cognate class, otherwise 0.
ilexPairs = [
    (
        str(first['word']),
        str(second['word']),
        int(first['cognate_class'] == second['cognate_class']),
    )
    for _, same_meaning in df.groupby('meaning')
    for first, second in combinations(same_meaning.to_dict('records'), 2)
]

import csv

# Collect the distinct language names found in the IELex file. The GLED cell
# further down filters its rows against this set.
languages = set()

# Read the raw CSV as UTF-8; undecodable bytes are replaced instead of aborting.
with open('data/ielexData.csv', 'r', encoding='utf-8', errors='replace') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip the header row (None default tolerates an empty file)
    for row in reader:
        # csv.reader yields [] for blank lines; skip any row that has no second
        # column instead of failing with IndexError.
        if len(row) > 1:
            languages.add(row[1])  # The language name is the second column

# Print the set of languages
print(languages)

df

## GLED

### Preprocessing

In [None]:
# Load GLED, align its language column with the IELex naming, and keep only the
# rows whose language also occurs in the IELex `languages` set.
df = (
    pd.read_csv("data/gled.tsv", delimiter="\t")
    .rename(columns={'LANGUAGE_NAME': 'Language'})
    .assign(Language=lambda frame: frame['Language'].str.replace('Modern Greek', 'Greek', regex=False))
    .loc[lambda frame: frame['Language'].isin(languages)]
)

# Disabled experiments, kept for reference:
#   keep rows whose doculect name is contained in the language name
#df = df[df.apply(lambda row: row['DOCULECT'].lower() in row['Language'].lower(), axis=1)]
#   keep only the "_2" doculects (these looked fine on inspection)
#df = df[df["DOCULECT"].str.contains("_2")]
#   drop rows without a cognate set (not needed: every row has one)
#df.dropna(subset=["COGSET"], inplace=True)

df

In [None]:
# Reduce GLED to the same three-column layout used for the IELex data.
df = df[['CONCEPT', 'FORM', 'COGSET']].dropna()
df.columns = ['meaning', 'word', 'cognate_class']

pairs = []

from tqdm.notebook import tqdm
# (A debugging cap that stopped once len(pairs) exceeded 100000 used to sit in this loop.)
for _, same_meaning in tqdm(df.groupby('meaning')):
    records = same_meaning.to_dict('records')
    for left, right in combinations(records, 2):
        left_word, right_word = str(left['word']), str(right['word'])
        # Pairs whose two surface forms are identical are left out.
        if left_word != right_word:
            is_cognate = int(left['cognate_class'] == right['cognate_class'])
            pairs.append((left_word, right_word, is_cognate))

In [None]:
# Append the IELex pairs to the GLED pairs. NOTE: `+=` extends the list in place,
# so running this cell more than once appends the IELex pairs again.
pairs += ilexPairs

In [None]:
# Preview the first few (word1, word2, label) tuples rather than echoing the whole list.
pairs[:10]

In [None]:
# Tabular view of the combined pairs for inspection; the cells visible below keep
# working from `pairs`, not from this frame.
pairs_df = pd.DataFrame(pairs, columns=['Word1', 'Word2', 'Label'])
pairs_df

### Levenshtein distance preprocessing

In [None]:
# Split every (word1, word2, label) tuple into its word pair and its label.
all_pairs = [row[:2] for row in pairs]
all_labels = [row[2] for row in pairs]
# Alternative that keeps each label wrapped in a one-element tuple (disabled):
#all_labels = [row[2:] for row in pairs]

# Embedding size requested by this notebook. Note that the helper functions
# defined below default to 6 when no value is passed.
embedding_dim = 64

def embedding_maker(embedding_dim=6):
	"""Build an IPAEmbedding over every character that occurs in `all_pairs`.

	Reads the module-level `all_pairs` and `device`.

	Args:
		embedding_dim: Forwarded to IPAEmbedding as `embedding_dim`.

	Returns:
		The newly constructed IPAEmbedding instance.
	"""
	# Character inventory across both words of every pair, with no filtering.
	alphabet = {symbol for pair in all_pairs for word in pair for symbol in word}

	from embedding_stuff import IPAEmbedding

	return IPAEmbedding(alphabet, embedding_dim=embedding_dim, device=device)

def generate_or_load_embeddings(embedding_maker, embedding_dim=6):
	"""Return the embedder cached for `embedding_dim`, building and caching it on a miss.

	The cache file is `data/embeddings/<embedding_dim>.joblib`. The path depends on
	the dimension only, so an existing file is returned as-is even if it was
	created from different data.

	Args:
		embedding_maker: Callable invoked as `embedding_maker(embedding_dim=...)`
			when no cache file exists.
		embedding_dim: Selects the cache file and is forwarded to `embedding_maker`.

	Returns:
		The object loaded from the cache file, or the freshly built one.
	"""
	import os
	import joblib

	cache_path = f"data/embeddings/{embedding_dim}.joblib"

	# Cache hit: hand back whatever was stored for this dimension.
	if os.path.exists(cache_path):
		return joblib.load(cache_path)

	# Cache miss: make sure the folder exists, then build, persist and return.
	os.makedirs(os.path.dirname(cache_path), exist_ok=True)
	fresh_embeddings = embedding_maker(embedding_dim=embedding_dim)
	joblib.dump(fresh_embeddings, cache_path)
	return fresh_embeddings

# Load the embedder cached for `embedding_dim` if a cache file exists; otherwise
# build it with `embedding_maker` and write the cache.
ipa_embedder = generate_or_load_embeddings(embedding_maker, embedding_dim=embedding_dim)

In [None]:
def preprocess(all_pairs, ipa_embedder, device, pad_index=0):
    """Encode word pairs as padded index tensors plus matching padding masks.

    Args:
        all_pairs: Sized sequence of two-word pairs; each word is iterated symbol
            by symbol and must support `len()`.
        ipa_embedder: Object exposing `char_to_idx`, a mapping from symbol to
            integer index.
        device: Torch device on which the returned tensors are allocated.
        pad_index: Value written into padding positions. Those positions are
            flagged in the mask; the value only has to be a valid index so that a
            lookup on the raw tensor cannot go out of range.

    Returns:
        A tuple `(batches, batches_masks, max_length)`:
        `batches` is an int tensor of shape `(len(all_pairs), 2, max_length)`
        holding symbol indices, `batches_masks` is a bool tensor of the same shape
        that is True on padding positions, and `max_length` is the length of the
        longest word (0 when there are no words).

    Raises:
        KeyError: If a symbol is missing from `ipa_embedder.char_to_idx`.
    """
    max_length = max((len(word) for pair in all_pairs for word in pair), default=0)
    shape = (len(all_pairs), 2, max_length)
    char_to_idx = ipa_embedder.char_to_idx

    # torch.full instead of torch.empty: empty() leaves memory uninitialised, so
    # padding slots could contain arbitrary values, including invalid indices.
    batches = torch.full(shape, pad_index, dtype=torch.int, device=device)
    batches_masks = torch.zeros(shape, dtype=torch.bool, device=device)

    for pair_index, pair in tqdm(enumerate(all_pairs), total=len(all_pairs), desc="Creating batches"):
        for word_index, word in enumerate(pair):
            word_length = len(word)
            if word_length:
                # Write the whole word with one tensor assignment rather than one
                # assignment per symbol.
                batches[pair_index, word_index, :word_length] = torch.tensor(
                    [char_to_idx[letter] for letter in word], dtype=torch.int, device=device
                )

            # Everything after the last symbol is padding.
            batches_masks[pair_index, word_index, word_length:] = True

    return (batches, batches_masks, max_length)

# Encode every word pair once, up front.
ldistance_operations, ldistance_masks, max_length = preprocess(all_pairs, ipa_embedder, device=device)

# NOTE: `pairs` is rebound here from the list of (word1, word2, label) tuples to a
# tuple of tensors (indices, masks, labels). Cells above that treat `pairs` as a
# list cannot be re-run after this point without restarting from the top.
pairs = (ldistance_operations, ldistance_masks, torch.tensor(all_labels, device=device, dtype=torch.float))

In [None]:
# Padding mask of the first word for the last ten pairs (True marks padding).
pd.DataFrame(data=pairs[1][:,0,:].cpu()).tail(10)

In [None]:
# Padding mask of the second word for the last ten pairs (True marks padding).
pd.DataFrame(data=pairs[1][:,1,:].cpu()).tail(10)

In [None]:
from sklearn.model_selection import LeaveOneOut, KFold, StratifiedKFold
import numpy as np
from tqdm.notebook import tqdm
import time

def cv_test_model(model_maker, has_history=False):
	"""Train and evaluate a fresh model on each fold of a stratified 10-fold split.

	Reads the module-level `pairs` tuple (indices, masks, labels) and `device`.

	Args:
		model_maker: Zero-argument callable returning a new model. The model is
			called as `model(indices, masks)` and its output is passed to BCELoss,
			so it is expected to emit one probability per sample.
		has_history: When True, the train and test loaders are both evaluated at
			`timestamps` checkpoints spread over training. When False, only the
			test loader is evaluated, once per fold after training, and the result
			is stored in the last row of `history["test"]`.

	Returns:
		A tuple `(history, model)`. `history["train"]` and `history["test"]` each
		hold `timestamps` rows; every row has one entry per fold, which is either
		`(confusion_matrix, accuracy, probabilities, true_labels)` or
		`(None, None)` when nothing was recorded for that slot. `model` is the
		model trained on the last fold.
	"""
	from sklearn.metrics import confusion_matrix

	cv = StratifiedKFold(n_splits = 10)
	n_folds = cv.get_n_splits()
	epochs = 50
	batch_size = 3200
	learning_rate = 0.001
	# Train accuracy from which a fold counts as having memorised its training set.
	early_stop_train_accuracy = .99

	# Number of checkpoints recorded per fold and the epoch spacing between them.
	timestamps = min(30, epochs)
	interval = epochs / timestamps

	def is_epoch_return_epoch_number(epoch):
		"""Return the checkpoint row for `epoch`, or None if it is not a checkpoint."""
		if epoch == 0:
			return 0
		if epoch % interval < 1:
			return int(epoch // interval)
		return None

	def empty_rows():
		"""Return a timestamps x folds grid of `(None, None)` placeholders."""
		return [[(None, None) for _ in range(n_folds)] for _ in range(timestamps)]

	# Both lists are indexed as [checkpoint][fold].
	history = {"train": empty_rows(), "test": empty_rows()}

	def fold_subset(indices):
		"""Select the rows `indices` from each tensor in `pairs` as a dataset."""
		return TensorDataset(*(torch.index_select(tensor, 0, indices) for tensor in pairs))

	def test_against_loader(model, loader):
		"""Evaluate `model` on `loader` in eval mode.

		Returns `(confusion_matrix, accuracy, probabilities, true_labels)`. The
		model's previous train/eval mode is restored afterwards.
		"""
		predictions = []
		probabilities = []  # Raw model outputs
		true_labels = []

		was_training = model.training
		model.eval()
		try:
			with torch.no_grad():
				for word_pairs_batch, word_pairs_masks, label_batch in loader:
					# reshape(-1) rather than squeeze(): squeeze() would turn a
					# batch of one sample into a 0-d tensor whose tolist() is a
					# bare number, which extend() cannot consume.
					output_batch = model(word_pairs_batch, word_pairs_masks).reshape(-1)
					predicted_batch = (output_batch > 0.5).int()

					predictions.extend(predicted_batch.tolist())
					probabilities.extend(output_batch.tolist())
					true_labels.extend(label_batch.int().tolist())
		finally:
			model.train(was_training)

		accuracy = sum(pred == true for pred, true in zip(predictions, true_labels)) / len(true_labels)

		# Fix the label order so the matrix is always 2x2 and per-fold matrices
		# can be summed, even if one class is absent from a fold's results.
		conf_matrix = confusion_matrix(true_labels, predictions, labels=[0, 1])

		return conf_matrix, accuracy, probabilities, true_labels

	for fold, (train_idx, test_idx) in enumerate(tqdm(cv.split(pairs[0], pairs[2].cpu()), total=n_folds, desc="CV Progress", leave=False)):
		train_idx = torch.tensor(train_idx)
		test_idx = torch.tensor(test_idx)

		# Split data
		train_data = fold_subset(train_idx)
		test_data = fold_subset(test_idx)

		# Create model for this fold
		model = model_maker()
		criterion = nn.BCELoss()
		optimizer = optim.Adam(model.parameters(), lr=learning_rate)

		train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
								generator=torch.Generator(device=device.type))
		test_loader = DataLoader(test_data, batch_size=batch_size,
								generator=torch.Generator(device=device.type))

		# Train model
		model.train()
		epoch_bar = tqdm(range(epochs), desc=f"Fold {fold}", leave=False)
		for epoch in epoch_bar:
			for word_pair, masks, labels in train_loader:	# Plural, because it's batches
				optimizer.zero_grad()
				# reshape(-1) keeps a one-sample batch the same shape as `labels`.
				output = model(word_pair, masks).reshape(-1)

				loss = criterion(output, labels)
				loss.backward()
				optimizer.step()

			if not has_history:
				continue

			epoch_number = is_epoch_return_epoch_number(epoch)
			if epoch_number is None:
				continue

			train_result = test_against_loader(model, train_loader)
			test_result = test_against_loader(model, test_loader)
			history["train"][epoch_number][fold] = train_result
			history["test"][epoch_number][fold] = test_result

			# Early cutoff if overfitting: stop training this fold only. The last
			# measurement is carried into the remaining checkpoints so that every
			# row keeps an entry for every fold and the other folds still run.
			if train_result[1] >= early_stop_train_accuracy:
				for later_checkpoint in range(epoch_number + 1, timestamps):
					history["train"][later_checkpoint][fold] = train_result
					history["test"][later_checkpoint][fold] = test_result
				epoch_bar.close()
				break

		if not has_history:
			history["test"][-1][fold] = test_against_loader(model, test_loader)

	return history, model

In [None]:
# Scratch check: the embedder's dimension plus two.
# NOTE(review): nothing in the visible cells uses this value — confirm it is still needed.
ipa_embedder.embedding_dim + 2

In [None]:
import numpy as np
import itertools
from transformer_stuff import TransformerCognateModel

def l_distance_model_maker(parameters):
	"""Instantiate a TransformerCognateModel on top of the shared embeddings.

	Args:
		parameters: Mapping of keyword arguments forwarded to the model constructor.

	Returns:
		A new TransformerCognateModel built with `embedder=ipa_embedder.embeddings`.
	"""
	model_kwargs = dict(parameters)
	return TransformerCognateModel(embedder=ipa_embedder.embeddings, **model_kwargs)

def grid_search_best_model():
	"""Cross-validate every combination of the parameter grid and keep the best one.

	Each combination is scored by the mean test accuracy over the folds recorded
	in the last row of `history["test"]` returned by `cv_test_model`. Folds
	without a recorded accuracy are ignored, and a combination with no recorded
	fold at all is skipped.

	Returns:
		A tuple `(best_history, best_model, best_params)` for the combination with
		the highest score. `best_model` is whatever model `cv_test_model` returned
		for that combination. All three are None if no combination could be scored.
	"""
	# Define parameter grid
	param_grid = {
		'hidden_dim': [128],
		'positional_dropout': [.2],
		'dropout': [0.0],
		'layers': [4]
	}

	# Materialise the combinations once so they can be counted and iterated.
	candidates = [
		dict(zip(param_grid.keys(), param_vals))
		for param_vals in itertools.product(*param_grid.values())
	]

	best_accuracy = 0
	best_params = None
	best_history = None
	best_model = None

	print("Starting grid search...")
	print(f"Total combinations to test: {len(candidates)}")

	# Grid search over all parameter combinations
	for parameters in tqdm(candidates, total=len(candidates), desc="Testing Parameters"):
		print(f"\nTesting combination {parameters}")

		# Bind the current parameters as a default so the maker does not depend
		# on the loop variable after this iteration.
		def current_model_maker(parameters=parameters):
			return l_distance_model_maker(parameters)

		# Train and evaluate model
		history, model = cv_test_model(current_model_maker, has_history=True)

		test_rows = history["test"]
		fold_accuracies = [
			fold_data[1] for fold_data in test_rows[-1] if fold_data[1] is not None
		] if test_rows else []
		if not fold_accuracies:
			print("Accuracy: n/a (no fold was evaluated)")
			continue

		overall_accuracy = np.mean(fold_accuracies)
		print(f"Accuracy: {overall_accuracy:.4f}")

		# The first scored combination is always kept, so an accuracy of exactly 0
		# still yields a result instead of leaving every "best" value as None.
		if best_params is None or overall_accuracy > best_accuracy:
			best_accuracy = overall_accuracy
			best_params = parameters
			best_history = history
			best_model = model
			print(f"New best accuracy: {best_accuracy:.4f}")

	# Print summary
	print(f"\n{'='*50}")
	print("GRID SEARCH COMPLETE")
	print(f"{'='*50}")
	print(f"Best accuracy: {best_accuracy:.4f}")
	print(f"Best parameters: {best_params}")

	return best_history, best_model, best_params

# Run the search and keep its results for the reporting cells below.
# NOTE(review): the second target is spelled `mode`, not `model` — confirm the
# name is intended before relying on a `model` variable in later cells.
history, mode, best_params = grid_search_best_model()

In [None]:
# Calculate overall statistics
last_epoch_test_data = history["test"][-1] #[fold, (pred, labels, accuracies, accuracy)]

all_confusion = np.sum(np.array([fold_data[0] for fold_data in last_epoch_test_data]), axis=0)
all_accuracies = [fold_data[1] for fold_data in last_epoch_test_data]

overall_accuracy = np.mean(all_accuracies)
correct_predictions = all_confusion[1][1] + all_confusion[0][0]
total_predictions = np.sum(all_confusion)

print(f"\nCross-Validation Results:")
print(f"Overall Accuracy: {overall_accuracy * 100:.2f}%")
print(f"Correct Predictions: {correct_predictions}/{total_predictions}")

fold_accuracies = [fold_data[1] for fold_data in history["test"][-1]]

from visualization_stuff import show_box_plot, show_confusion_matrix, show_train_and_test_accuracy_over_epochs
show_box_plot(fold_accuracies)

show_confusion_matrix(all_confusion)

show_train_and_test_accuracy_over_epochs(history)

In [None]:
from visualization_stuff import show_multiple_roc_curves, show_roc_curves_over_epochs

# ROC plot from the project's visualization helper; it receives the full history dict.
show_multiple_roc_curves(history)

# Show ROC AUC progression over epochs (similar to accuracy plot)
train_fold_aucs, test_fold_aucs = show_roc_curves_over_epochs(history, "ROC AUC Over Epochs")

In [None]:
def show_classification_report(history):
    """Print sklearn's classification report for the last recorded test checkpoint.

    Args:
        history: Dict whose `"test"` entry is a list of checkpoints; the last one
            holds, per fold, a 4-tuple ending in `(probabilities, true_labels)`.

    The folds are pooled, and a probability above .5 counts as a positive prediction.
    """
    final_folds = history["test"][-1]
    pooled_truth = [label for _, _, _, fold_truth in final_folds for label in fold_truth]
    pooled_predictions = [int(prob > .5) for _, _, fold_probs, _ in final_folds for prob in fold_probs]

    from sklearn.metrics import classification_report
    print(classification_report(pooled_truth, pooled_predictions))

show_classification_report(history)

Reference: [PyTorch `word_language_model` example](https://github.com/pytorch/examples/tree/main/word_language_model)