In [133]:
import pandas as pd
import numpy as np
from pathlib import Path

In [106]:
data_folder = Path("../data")
roles_path = data_folder / "roles.csv"

In [113]:
df = pd.read_csv(roles_path)

In [114]:
languages = set(col.split(" ", 1)[1] for col in df.columns[1:])
merged_data = pd.DataFrame(index=df.index)

for lang in languages:
    merged_data[lang] = 0
    for prefix in ["Studying", "Fluent", "Native"]:
        col = f"{prefix} {lang}"
        if col in df.columns:
            merged_data[lang] |= df[col].apply(lambda x: 1 if x > 0 else 0)

merged_data = merged_data.astype(np.float32)

In [109]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [115]:
X_tensor = torch.tensor(merged_data.values)
dataset = TensorDataset(X_tensor, X_tensor)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

In [116]:
class VAERecommender(nn.Module):
    def __init__(self, num_languages, latent_dim=64):
        super().__init__()
        self.num_languages = num_languages
        self.latent_dim = latent_dim
        
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(num_languages, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU()
        )
        # Mean and log-variance for latent space
        self.fc_mu = nn.Linear(256, latent_dim)
        self.fc_logvar = nn.Linear(256, latent_dim)
        
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, num_languages)
        )
        
        # Language embeddings (optional)
        self.language_embeddings = nn.Embedding(num_languages, latent_dim)
        
    def encode(self, x):
        h = self.encoder(x)
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        return mu, logvar
    
    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std
    
    def decode(self, z):
        return self.decoder(z)
    
    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        # Optionally add mean of language embeddings for known languages
        lang_idx = x.nonzero(as_tuple=False)[:, 1]  # indices of known languages
        if len(lang_idx) > 0:
            z += self.language_embeddings(lang_idx).mean(dim=0)
        out = self.decode(z)
        return out, mu, logvar

In [117]:
def vae_loss(recon_x, x, mu, logvar):
    bce = nn.BCEWithLogitsLoss()(recon_x, x)
    # KL Divergence
    kld = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
    return bce + kld

In [118]:
model = VAERecommender(X_tensor.shape[1], latent_dim=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 30

for epoch in range(epochs):
    total_loss = 0
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        recon, mu, logvar = model(batch_x)
        loss = vae_loss(recon, batch_y, mu, logvar)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")

Epoch 1, Loss: 0.1528
Epoch 2, Loss: 0.1319
Epoch 3, Loss: 0.1310
Epoch 4, Loss: 0.1289
Epoch 5, Loss: 0.1282
Epoch 6, Loss: 0.1281
Epoch 7, Loss: 0.1273
Epoch 8, Loss: 0.1264
Epoch 9, Loss: 0.1249
Epoch 10, Loss: 0.1241
Epoch 11, Loss: 0.1228
Epoch 12, Loss: 0.1222
Epoch 13, Loss: 0.1217
Epoch 14, Loss: 0.1215
Epoch 15, Loss: 0.1210
Epoch 16, Loss: 0.1209
Epoch 17, Loss: 0.1206
Epoch 18, Loss: 0.1202
Epoch 19, Loss: 0.1199
Epoch 20, Loss: 0.1199
Epoch 21, Loss: 0.1196
Epoch 22, Loss: 0.1196
Epoch 23, Loss: 0.1193
Epoch 24, Loss: 0.1190
Epoch 25, Loss: 0.1188
Epoch 26, Loss: 0.1189
Epoch 27, Loss: 0.1188
Epoch 28, Loss: 0.1187
Epoch 29, Loss: 0.1186
Epoch 30, Loss: 0.1188


In [132]:
def recommend_languages_vae(model, user_vector, top_k=10):
    model.eval()
    with torch.no_grad():
        input_tensor = torch.tensor(user_vector, dtype=torch.float32).unsqueeze(0)
        recon, _, _ = model(input_tensor)
        preds = torch.sigmoid(recon).squeeze().numpy()
    
    preds[user_vector > 0] = -1  # mask known languages
    recommended_indices = preds.argsort()[-top_k:][::-1]
    return [merged_data.columns[i] for i in recommended_indices]

# Example: New user
new_user_vector = np.zeros(merged_data.shape[1])
new_user_vector[merged_data.columns.get_loc("English")] = 1
new_user_vector[merged_data.columns.get_loc("Japanese")] = 1

recommendations = recommend_languages_vae(model, new_user_vector)
print(recommendations)


['Korean', 'Mandarin', 'Spanish', 'French', 'Indonesian', 'German', 'Arabic', 'Filipino', 'Russian', 'Conlangs / Sign Language']
