In [1]:
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
#!pip install lingpy
#import lingpy
#from lingpy import ipa2tokens
import re

In [2]:
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

In [3]:
dat = pd.read_csv('data/ielexData.csv')

In [4]:
dat

Unnamed: 0.1,Unnamed: 0,Language,Meaning,Phonological Form,cc,ASJP
0,11,Greek,few,ˈliʝi,few:I,liSi
1,63,Bulgarian,few,ˈmaɫku,few:H,maLku
2,65,Russian,few,'maɫɔ,few:H,maLo
3,66,Polish,few,ˈmawɔ,few:H,mawo
4,68,Ukrainian,few,ˈmaɫɔ,few:H,maLo
...,...,...,...,...,...,...
4479,124,French,head,tɛt,head:D,tEt
4480,135,Italian,head,'tɛsta,head:D,tEsta
4481,136,Romanian,head,kap,head:B,kap
4482,143,Breton,head,ˈpɛnː,head:E,pEn


In [5]:
dat[dat['Meaning'] == 'few']

Unnamed: 0.1,Unnamed: 0,Language,Meaning,Phonological Form,cc,ASJP
0,11,Greek,few,ˈliʝi,few:I,liSi
1,63,Bulgarian,few,ˈmaɫku,few:H,maLku
2,65,Russian,few,'maɫɔ,few:H,maLo
3,66,Polish,few,ˈmawɔ,few:H,mawo
4,68,Ukrainian,few,ˈmaɫɔ,few:H,maLo
5,70,Czech,few,ˈmaːlɔ,few:H,malo
6,79,Icelandic,few,ˈfauːɪr,few:F,fauir
7,84,Swedish,few,foː,few:F,fo
8,89,Danish,few,fɔˀ,few:F,fo
9,93,English,few,fju:,few:F,fyu


In [6]:
concepts = [re.sub(':.*', '', x) for x in dat['cc']]
dat['concepts'] = concepts
uniqueconcepts = np.unique(concepts)
cognates = [list(re.sub('^.*:', '', x))[0] for x in dat['cc']]
dat['cognate_char'] = cognates

In [7]:
source_words = []
target_words = []
label = []
for concept in uniqueconcepts:
  tmp_df = dat[dat['concepts'] == concept]
  for word_idx in range(len(tmp_df)):
    source_word = tmp_df['ASJP'].iloc[word_idx]
    source_class = tmp_df['cognate_char'].iloc[word_idx]
    if len(list(source_word)) < 11:
      for word_idx2 in range(len(tmp_df)):
        if word_idx != word_idx2:
          target_word = tmp_df['ASJP'].iloc[word_idx2]
          target_class = tmp_df['cognate_char'].iloc[word_idx2]
          if len(list(target_word)) < 11:
            if source_class == target_class:
              label.append(1)
            else:
              label.append(0)
            source_words.append(source_word)
            target_words.append(target_word)


In [8]:
source_words[0]

'exo'

In [9]:
charlens = []
unique_characters = []
for i in source_words:
  charlens.append(len(list(i)))
  for j in list(i):
    unique_characters.append(j)
unique_characters = ['PAD'] + np.unique(unique_characters).tolist()

In [10]:
maxlen = max(charlens)

In [11]:
source_words_tokens = []
target_words_tokens = []
for j in range(len(source_words)):
  source_tmp = [unique_characters.index(i) for i in list(source_words[j])]
  target_tmp = [unique_characters.index(i) for i in list(target_words[j])]
  source_tmp = source_tmp + [0 for x in range(maxlen-len(source_tmp))]
  target_tmp = target_tmp + [0 for x in range(maxlen-len(target_tmp))]

  source_words_tokens.append(source_tmp)
  target_words_tokens.append(target_tmp)

In [12]:
source_words_tokens[0], target_words_tokens[0]

([15, 33, 25, 0, 0, 0, 0, 0, 0, 0], [25, 23, 19, 0, 0, 0, 0, 0, 0, 0])

In [13]:
unique_characters[25]

'o'

In [14]:
#indices = np.arange(len(source_words_tokens))
#np.random.shuffle(indices)
#split_index = int(0.9 * len(indices))
#train_indices = indices[:split_index]
#test_indices = indices[split_index:]

In [15]:
#source_words_tokens_train = source_words_tokens[train_indices]
#source_words_tokens_test = source_words_tokens[test_indices]
#target_words_tokens_train = target_words_tokens[train_indices]
#target_words_tokens_test = target_words_tokens[test_indices]
#label_train = label[train_indices]
#label_test = label[test_indices]

In [16]:
from sklearn.model_selection import train_test_split
source_train, source_test, target_train, target_test, label_train, label_test = train_test_split(source_words_tokens, target_words_tokens, label, test_size=0.1, shuffle=True, random_state=42)

In [17]:
source_train_tensor = torch.LongTensor(source_train)
source_test_tensor = torch.LongTensor(source_test)
target_train_tensor = torch.LongTensor(target_train)
target_test_tensor = torch.LongTensor(target_test)
label_train_tensor = torch.IntTensor(label_train)
label_test_tensor = torch.IntTensor(label_test)

In [18]:
train_dataset = TensorDataset(source_train_tensor, target_train_tensor, label_train_tensor)
test_dataset = TensorDataset(source_test_tensor, target_test_tensor, label_test_tensor)

In [19]:
batch_size = 512
train_loader = DataLoader(train_dataset, batch_size = batch_size)
test_loader = DataLoader(test_dataset, batch_size = batch_size)

Next step: NN that takes in both words, converts them to embeddings and predicts whether they are cognates (yes/no) = (1/0), sigmoid classification

In [20]:
class SiameseNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim=64, hidden_dim=128, dropout = .2):
        super(SiameseNet, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=dropout)
        self.lin1 = nn.Linear(hidden_dim, 64)
        self.lin2 = nn.Linear(64, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout()

    #def forward_once(self, x):
    #    x = self.embedding(x)
    #    _, (hidden, _) = self.lstm(x)
    #    return hidden[-1]

    def forward(self, input1, input2):
        output1_emb = self.embedding(input1)
        output2_emb = self.embedding(input2)
        out_state1, (hidden1, _) = self.lstm(output1_emb)
        out_state2, (hidden2, _) = self.lstm(output2_emb)
        output1 = hidden1[-1]
        output2 = hidden2[-1]
        diff = torch.abs(output1 - output2)
        out = self.dropout(self.relu(self.lin1(diff)))
        out = self.lin2(out)
        return out


In [21]:
vocab_size = len(unique_characters)

model = SiameseNet(vocab_size)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)




In [22]:
tan = nn.Tanh()
sigmoid = nn.Sigmoid()
mydatavalue = 3
torch.tensor([mydatavalue, mydatavalue]) * sigmoid(torch.tensor([-10, 10]))

tensor([1.3619e-04, 2.9999e+00])

In [23]:
num_epochs = 2
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    correct_train = 0
    total_train = 0
    for source_batch, target_batch, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(source_batch, target_batch).squeeze()
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    outputs = model(source_batch, target_batch).squeeze()
    predicted = (outputs > 0.5).int()
    correct_train += (predicted == labels).sum().item()
    total_train += labels.size(0)

    model.eval()
    correct_test = 0
    total_test = 0

    with torch.no_grad():
        for source_batch, target_batch, labels in test_loader:
            outputs = model(source_batch, target_batch).squeeze()
            predicted = (outputs > 0.5).int()
            correct_test += (predicted == labels).sum().item()
            total_test += labels.size(0)


    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {100 * correct_train / total_train:.2f}%, Test Accuracy: {100 * correct_test / total_test:.2f}%")

Epoch 1/2, Loss: 0.5391, Train Accuracy: 65.60%, Test Accuracy: 68.79%
Epoch 2/2, Loss: 0.4039, Train Accuracy: 85.32%, Test Accuracy: 84.44%


In [24]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for source_batch, target_batch, labels in test_loader:
        outputs = model(source_batch, target_batch).squeeze()
        predicted = (outputs > 0.5).int()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

print(f"Test Accuracy: {100 * correct / total:.2f}%")


Test Accuracy: 84.44%


In [25]:
def predict_cognate(word1, word2):
    def encode(word):
        encoded = [unique_characters.index(c)+1 for c in word]
        return encoded + [0] * (maxlen - len(encoded))

    model.eval()
    w1 = torch.LongTensor([encode(word1)])
    w2 = torch.LongTensor([encode(word2)])
    with torch.no_grad():
        output = model(w1, w2)
    return "Yes" if output.item() > 0.5 else "No"

In [26]:
predict_cognate("maLo", "fauir")

'No'

In [27]:
predict_cognate("maLo", "mawo")

'No'

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class SimpleCognateDataset(Dataset):
    def __init__(self, data, unique_characters, maxlen):
        self.data = data
        self.char_to_idx = {char: i+1 for i, char in enumerate(unique_characters)}
        self.maxlen = maxlen

    def encode_word(self, word):
        encoded = [self.char_to_idx.get(c, 0) for c in word]
        return encoded + [0] * (self.maxlen - len(encoded))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        word1, word2, label = self.data[idx]
        return (
            torch.tensor(self.encode_word(word1), dtype=torch.long),
            torch.tensor(self.encode_word(word2), dtype=torch.long),
            torch.tensor(label, dtype=torch.float)
        )

class SimplePairNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim=64, hidden_dim=128):
        super(SimplePairNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def encode_word(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        return hidden[-1]

    def forward(self, input1, input2):
        enc1 = self.encode_word(input1)
        enc2 = self.encode_word(input2)
        combined = torch.cat([enc1, enc2], dim=1)
        return self.fc(combined)


Epoch 1/5, Loss: 0.6096
Epoch 2/5, Loss: 0.6084
Epoch 3/5, Loss: 0.5518
Epoch 4/5, Loss: 0.4439
Epoch 5/5, Loss: 0.3618
Test Accuracy: 86.44%


In [29]:
import pandas as pd

df = pd.read_csv("data/ielexData.csv")
import pandas as pd
from itertools import combinations

df = pd.read_csv("data/ielexData.csv")

df = df[['Meaning', 'Phonological Form', 'cc']].dropna()
df.columns = ['meaning', 'word', 'cognate_class']

pairs = []

for _, group in df.groupby('meaning'):
    entries = group.to_dict('records')
    for w1, w2 in combinations(entries, 2):
        word1 = str(w1['word'])
        word2 = str(w2['word'])
        label = int(w1['cognate_class'] == w2['cognate_class'])
        pairs.append((word1, word2, label))

df

Unnamed: 0,meaning,word,cognate_class
0,few,ˈliʝi,few:I
1,few,ˈmaɫku,few:H
2,few,'maɫɔ,few:H
3,few,ˈmawɔ,few:H
4,few,ˈmaɫɔ,few:H
...,...,...,...
4479,head,tɛt,head:D
4480,head,'tɛsta,head:D
4481,head,kap,head:B
4482,head,ˈpɛnː,head:E


In [None]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(pairs, test_size=0.2, random_state=42)

all_words = [w for pair in pairs for w in pair[:2]]
unique_characters = sorted(set("".join(all_words)))

maxlen = max(len(w) for w in all_words)


embedding_dim = 64
hidden_dim = 128
vocab_size = len(unique_characters)

model2 = SimplePairNN(vocab_size, embedding_dim, hidden_dim)
criterion2 = nn.BCELoss()
optimizer2 = optim.Adam(model2.parameters(), lr=0.001)

train_dataset2 = SimpleCognateDataset(train_data, unique_characters, maxlen)
test_dataset2 = SimpleCognateDataset(test_data, unique_characters, maxlen)
train_loader2 = DataLoader(train_dataset2, batch_size=32, shuffle=True)
test_loader2 = DataLoader(test_dataset2, batch_size=32)

for epoch in range(5):
    model2.train()
    total_loss = 0
    for word1, word2, label in train_loader2:
        optimizer2.zero_grad()
        output = model2(word1, word2).squeeze()
        loss = criterion2(output, label)
        loss.backward()
        optimizer2.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/5, Loss: {total_loss / len(train_loader2):.4f}")

model2.eval()
correct, total = 0, 0
with torch.no_grad():
    for word1, word2, label in test_loader2:
        output = model2(word1, word2).squeeze()
        predicted = (output > 0.5).int()
        correct += (predicted == label.int()).sum().item()
        total += label.size(0)
print(f"Test Accuracy: {100 * correct / total:.2f}%")

def predict_pair(word1, word2):
    def encode(w):
        idxs = [unique_characters.index(c)+1 for c in w]
        return idxs + [0] * (maxlen - len(idxs))

    w1 = torch.LongTensor([encode(word1)])
    w2 = torch.LongTensor([encode(word2)])
    model2.eval()
    with torch.no_grad():
        out = model2(w1, w2)
    return "Yes" if out.item() > 0.5 else "No"