In [2]:
!pip install torch matplotlib scikit-learn numpy scipy openai

Collecting torch
  Downloading torch-2.8.0-cp310-none-macosx_11_0_arm64.whl (73.6 MB)
[K     |████████████████████████████████| 73.6 MB 187 kB/s eta 0:00:013
[?25hCollecting matplotlib
  Downloading matplotlib-3.10.6-cp310-cp310-macosx_11_0_arm64.whl (8.1 MB)
[K     |████████████████████████████████| 8.1 MB 11.8 MB/s eta 0:00:01
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.7.2-cp310-cp310-macosx_12_0_arm64.whl (8.7 MB)
[K     |████████████████████████████████| 8.7 MB 5.6 MB/s eta 0:00:01
[?25hCollecting numpy
  Downloading numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 4.3 MB/s eta 0:00:01
[?25hCollecting scipy
  Downloading scipy-1.15.3-cp310-cp310-macosx_14_0_arm64.whl (22.4 MB)
[K     |████████████████████████████████| 22.4 MB 8.6 MB/s eta 0:00:01
[?25hCollecting openai
  Downloading openai-1.107.2-py3-none-any.whl (946 kB)
[K     |████████████████████████████████| 946 kB 4.9 MB/s eta 0:00:01
[?25hColl

In [11]:
!pip install tqdm

You should consider upgrading via the '/Users/mykhailolapshyn/Desktop/Generative Embedding Network/.venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
import os 
from openai import OpenAI 
import pandas as pd 
from tqdm import tqdm as tqdm 

class Baseline:
    def __init__(self, openai_client: OpenAI) -> None:
        self.openai_client = openai_client

    def get_embedding(self, source_text: str, model_name: str = "text-embedding-3-small"):
        source_text = source_text.replace("\n", " ")
        return self.openai_client.embeddings.create(
            input=[source_text],
            model=model_name
        ).data[0].embedding
    

def main() -> None:
    os.makedirs("output", exist_ok=True)
    output_path = "output/token_embeddings.csv"

    client = OpenAI(api_key=os.environ.get("API_KEY"))
    controller = Baseline(openai_client=client)

    text = """
    Meta said it will introduce more guardrails to its artificial intelligence (AI) chatbots - including blocking them from talking to teens about suicide, self-harm and eating disorders.
    It comes two weeks after a US senator launched an investigation into the tech giant after notes in a leaked internal document suggested its AI products could have "sensual" chats with teenagers.
    The company described the notes in the document, obtained by Reuters, as erroneous and inconsistent with its policies which prohibit any content sexualising children.
    But it now says it will make its chatbots direct teens to expert resources rather than engage with them on sensitive topics such as suicide.
    "We built protections for teens into our AI products from the start, including designing them to respond safely to prompts about self-harm, suicide, and disordered eating," a Meta spokesperson said.
    The firm told tech news publication TechCrunch on Friday it would add more guardrails to its systems "as an extra precaution" and temporarily limit chatbots teens could interact with.
    But Andy Burrows, head of the Molly Rose Foundation, said it was "astounding" Meta had made chatbots available that could potentially place young people at risk of harm.
    "While further safety measures are welcome, robust safety testing should take place before products are put on the market - not retrospectively when harm has taken place," he said.
    "Meta must act quickly and decisively to implement stronger safety measures for AI chatbots and Ofcom should stand ready to investigate if these updates fail to keep children safe."
    Meta said the updates to its AI systems are in progress. It already places users aged 13 to 18 into "teen accounts" on Facebook, Instagram and Messenger, with content and privacy settings which aim to give them a safer experience.
    It told the BBC in April these would also allow parents and guardians to see which AI chatbots their teen had spoken to in the last seven days.
    Safety concerns
    The changes come amid concerns over the potential for AI chatbots to mislead young or vulnerable users.
    A California couple recently sued ChatGPT-maker OpenAI over the death of their teenage son, alleging its chatbot encouraged him to take his own life.
    The lawsuit came after the company announced changes to promote healthier ChatGPT use last month.
    "AI can feel more responsive and personal than prior technologies, especially for vulnerable individuals experiencing mental or emotional distress," the firm said in a blog post.
    Meanwhile, Reuters reported on Friday Meta's AI tools allowing users to create chatbots had been used by some - including a Meta employee - to produce flirtatious "parody" chatbots of female celebrities.
    Among celebrity chatbots seen by the news agency were some using the likeness of artist Taylor Swift and actress Scarlett Johansson.
    Reuters said the avatars "often insisted they were the real actors and artists" and "routinely made sexual advances" during its weeks of testing them.
    It said Meta's tools also permitted the creation of chatbots impersonating child celebrities and, in one case, generated a photorealistic, shirtless image of one young male star.
    Several of the chatbots in question were later removed by Meta, it reported.
    "Like others, we permit the generation of images containing public figures, but our policies are intended to prohibit nude, intimate or sexually suggestive imagery," a Meta spokesperson said.
    They added that its AI Studio rules forbid "direct impersonation of public figures".
    """
    tokens = text.split()

    if os.path.exists(output_path):
        df_existing = pd.read_csv(output_path)
        existing_tokens = set(df_existing["token"].tolist())
    else:
        df_existing = pd.DataFrame(columns=["token", "embedding"])
        existing_tokens = set()

    new_rows = []
    for tok in tqdm(tokens, desc="Обробка токенів", unit="tok"):
        if tok not in existing_tokens:
            emb = controller.get_embedding(tok)
            new_rows.append({"token": tok, "embedding": emb})

    if new_rows:
        df_new = pd.DataFrame(new_rows)
        df_all = pd.concat([df_existing, df_new], ignore_index=True)
        df_all.to_csv(output_path, index=False)
        print(f"Додано {len(new_rows)} нових токенів. Загалом: {len(df_all)}")
    else:
        print("Нових токенів немає — все вже в CSV.")

if __name__ == "__main__":
    main()

        
    
        





Обробка токенів: 100%|██████████| 563/563 [01:25<00:00,  6.60tok/s]


Додано 184 нових токенів. Загалом: 5090


In [2]:
!pip install tensorboard

Collecting tensorboard
  Downloading tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 6.8 MB/s eta 0:00:01
[?25hCollecting tensorboard-data-server<0.8.0,>=0.7.0
  Downloading tensorboard_data_server-0.7.2-py3-none-any.whl (2.4 kB)
Collecting protobuf!=4.24.0,>=3.19.6
  Downloading protobuf-6.32.1-cp39-abi3-macosx_10_9_universal2.whl (426 kB)
[K     |████████████████████████████████| 426 kB 3.9 MB/s eta 0:00:01
Collecting absl-py>=0.4
  Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
[K     |████████████████████████████████| 135 kB 4.1 MB/s eta 0:00:01
Collecting werkzeug>=1.0.1
  Downloading werkzeug-3.1.3-py3-none-any.whl (224 kB)
[K     |████████████████████████████████| 224 kB 4.3 MB/s eta 0:00:01
[?25hCollecting markdown>=2.6.8
  Downloading markdown-3.9-py3-none-any.whl (107 kB)
[K     |████████████████████████████████| 107 kB 6.7 MB/s eta 0:00:01
[?25hCollecting grpcio>=1.48.2
  Downloading grpcio-1.74.0-cp310-cp310-macosx_

In [3]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 1.3 MB/s eta 0:00:01
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.2.4
    Uninstalling pip-21.2.4:
      Successfully uninstalled pip-21.2.4
Successfully installed pip-25.2


**NETWORK**

In [None]:
import os 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import pandas as pd
import ast
import warnings 
import numpy as np


# ==========================
# Dataset
# ==========================
class EmbeddingDataset(Dataset):
    def __init__(self, tokens, embeddings) -> None:
        super().__init__()
        self.tokens = tokens
        self.embeddings = torch.tensor(np.array(embeddings), dtype=torch.float32)

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        return self.tokens[idx], self.embeddings[idx]


# ==========================
# Blocks
# ==========================
class BilinearGLUBlock(nn.Module):
    def __init__(self, dim: int, drop: float = 0.1) -> None:
        super().__init__()
        self.bilinear = nn.Bilinear(dim, dim, dim)
        self.glu = nn.GLU(dim=-1)
        self.norm = nn.LayerNorm(dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = self.bilinear(
            x.to(self.bilinear.weight.dtype), 
            x.to(self.bilinear.weight.dtype)
        )
        h = self.glu(torch.cat([h, h], dim=-1))
        h = self.drop(h)
        return self.norm(h + x.to(h.dtype))


class ProjectionHead(nn.Module):

    def __init__(self, in_dim: int, hidden_dim: int, out_dim: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, out_dim)
        )

    def forward(self, x):
        return self.net(x)


# ==========================
# Encoders
# ==========================
class ForwardEncoder(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int):
        super().__init__()
        self.fc = nn.Linear(input_dim, hidden_dim)
        self.block = BilinearGLUBlock(hidden_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = F.silu(self.fc(x))
        return self.block(h)


class BackwardEncoder(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int):
        super().__init__()
        self.fc = nn.Linear(input_dim, hidden_dim)
        self.block = BilinearGLUBlock(hidden_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = torch.flip(x, dims=[0])
        h = F.silu(self.fc(h))
        return self.block(h)


class BidirectionalEncoder(nn.Module):
    def __init__(self, input_dim: int = 1536, hidden_dim: int = 512, out_dim: int = 256):
        super().__init__()
        self.forward_encoder = ForwardEncoder(input_dim, hidden_dim)
        self.backward_encoder = BackwardEncoder(input_dim, hidden_dim)
        self.combine = nn.Linear(hidden_dim * 2, out_dim)
        self.norm = nn.LayerNorm(out_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        fwd = self.forward_encoder(x)
        bwd = self.backward_encoder(x)
        combined = torch.cat([fwd, bwd], dim=-1)
        out = self.combine(combined)
        return self.norm(out)


# ==========================
# Model
# ==========================
class LinguaGEN(nn.Module):
    def __init__(self, input_dim=1536, hidden_dim=512, out_dim=256):
        super().__init__()
        self.encoder = BidirectionalEncoder(input_dim, hidden_dim, out_dim)
        self.projection_head = ProjectionHead(out_dim, hidden_dim, out_dim)
        self.baseline_proj = nn.Linear(input_dim, out_dim)

    def forward(self, x: torch.Tensor):
        pred = self.encoder(x)
        pred = self.projection_head(pred)
        pred = F.normalize(pred, dim=-1)

        baseline = self.baseline_proj(x)
        baseline = F.normalize(baseline, dim=-1)

        return pred, baseline


# ==========================
# Loss
# ==========================
class CosineSimilarityContrastiveLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pred, target):
        cos_sim = F.cosine_similarity(pred, target, dim=-1)
        loss = 1 - cos_sim.mean()
        return loss, cos_sim.mean()


# ==========================
# Training
# ==========================
# ==========================
# Training
# ==========================
def split_dataset(tokens, embeddings, batch_size=4):
    dataset = EmbeddingDataset(tokens, embeddings)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        num_workers=0, pin_memory=False
    )
    val_loader = DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False,
        num_workers=0, pin_memory=False
    )
    return train_loader, val_loader


def train_model(model, train_loader, val_loader, device, epochs=2, lr=1e-4, accumulation_steps=4):
    criterion = CosineSimilarityContrastiveLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
    writer = SummaryWriter()

    os.makedirs("checkpoints", exist_ok=True)

    model.to(device)
    best_val_cos = -1.0

    scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))

    for epoch in range(epochs):
        # ---- Train ----
        model.train()
        train_loss, train_cos = 0, 0

        batch_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", ncols=120)
        optimizer.zero_grad()

        for step, (_, embeddings) in enumerate(batch_bar):
            embeddings = embeddings.to(device)

            with torch.autocast(device_type=device.type, dtype=torch.float32):
                pred, baseline = model(embeddings)
                loss, cos_sim = criterion(pred, baseline)
                loss = loss / accumulation_steps  

            if device.type == "cuda":
                scaler.scale(loss).backward()
            else:
                loss.backward()

            if (step + 1) % accumulation_steps == 0:
                if device.type == "cuda":
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                optimizer.zero_grad()

            train_loss += loss.item() * accumulation_steps
            train_cos += cos_sim.item()

            if step % 10 == 0:  
                batch_bar.set_postfix({
                    "loss": f"{loss.item() * accumulation_steps:.4f}",
                    "cos": f"{cos_sim.item():.4f}"
                })

        scheduler.step()

        # ---- Validation ----
        model.eval()
        val_loss, val_cos = 0, 0
        with torch.no_grad():
            for _, embeddings in val_loader:
                embeddings = embeddings.to(device)
                with torch.autocast(device_type=device.type, dtype=torch.float16 if device.type != "cpu" else torch.bfloat16):
                    pred, baseline = model(embeddings)
                    loss, cos_sim = criterion(pred, baseline)
                val_loss += loss.item()
                val_cos += cos_sim.item()

        # ---- Averages ----
        train_loss /= len(train_loader)
        train_cos /= len(train_loader)
        val_loss /= len(val_loader)
        val_cos /= len(val_loader)
        torch.mps.empty_cache()

        # ---- Logs ----
        writer.add_scalar("Loss/Train", train_loss, epoch)
        writer.add_scalar("Loss/Val", val_loss, epoch)
        writer.add_scalar("CosSim/Train", train_cos, epoch)
        writer.add_scalar("CosSim/Val", val_cos, epoch)

        # ---- Save checkpoints ----
        ckpt_path = f"checkpoints/epoch_{epoch+1}.pth"
        torch.save(model.state_dict(), ckpt_path)

        if val_cos > best_val_cos:
            best_val_cos = val_cos
            torch.save(model.state_dict(), "checkpoints/best_model.pth")

        print(f"\nEpoch {epoch+1}/{epochs} "
              f"- TrainLoss: {train_loss:.4f}, ValLoss: {val_loss:.4f}, "
              f"TrainCos: {train_cos:.4f}, ValCos: {val_cos:.4f}")


def main() -> None:
    
    data = pd.read_csv("/Users/mykhailolapshyn/Desktop/Generative Embedding Network/utils/output/token_embeddings.csv")

    tokens = data["token"].to_list()
   
    embeddings = [ast.literal_eval(e) for e in data["embedding"].to_list()]

    train_dataloader, val_dataloader = split_dataset(tokens, embeddings)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = LinguaGEN()
    epochs = 2
    lr = 1e-4
    train_model(model=model,
                train_loader=train_dataloader,
                val_loader=val_dataloader,
                device=device,
                epochs=epochs,
                lr=lr)


if __name__ == "__main__":
    warnings.filterwarnings('ignore')
    main()

Epoch 1/2: 100%|███████████████████████████████████████████| 1018/1018 [16:04<00:00,  1.06it/s, loss=0.0097, cos=0.9903]



Epoch 1/2 - TrainLoss: 0.0456, ValLoss: 0.0089, TrainCos: 0.9544, ValCos: 0.9911


Epoch 2/2: 100%|███████████████████████████████████████████| 1018/1018 [15:00<00:00,  1.13it/s, loss=0.0045, cos=0.9955]



Epoch 2/2 - TrainLoss: 0.0081, ValLoss: 0.0045, TrainCos: 0.9919, ValCos: 0.9955


**INFERENCE**

In [None]:
import torch 
import numpy as np
import pandas as pd 
import ast 
import os
from openai import OpenAI

client = OpenAI(api_key=os.environ.get("API_KEY"))

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

model = LinguaGEN()
model.load_state_dict(torch.load(
    "/Users/mykhailolapshyn/Desktop/Generative Embedding Network/utils/checkpoints/epoch_2.pth",
    map_location=device
))
model.to(device)
model.eval()

df = pd.read_csv("/Users/mykhailolapshyn/Desktop/Generative Embedding Network/utils/output/token_embeddings.csv")

def get_embedding_from_csv(word, df):
    row = df[df["token"] == word]
    if len(row) == 0:
        return None
    emb = np.array(ast.literal_eval(row["embedding"].values[0]), dtype=np.float32)
    return torch.tensor(emb, dtype=torch.float32)


def get_embedding_openai(word):
    resp = client.embeddings.create(model="text-embedding-3-small", input=word)
    emb = np.array(resp.data[0].embedding, dtype=np.float32)
    return torch.tensor(emb, dtype=torch.float32)

# =========================
# Comparison Ukraine vs Japan
# =========================
word1 = "Ukraine"
word2 = "Japan"

emb1 = get_embedding_from_csv(word1, df)
if emb1 is None:
    emb1 = get_embedding_openai(word1)
emb1 = emb1.unsqueeze(0).to(device)

# Для Japan беремо з CSV
emb2 = get_embedding_from_csv(word2, df)
if emb2 is None:
    emb2 = get_embedding_openai(word2)
emb2 = emb2.unsqueeze(0).to(device)

with torch.no_grad():
    gen1 = model(emb1)[0]
    gen2 = model(emb2)[0]

cos = torch.nn.functional.cosine_similarity(gen1, gen2).item()
print(f"Cosine Similarity between {word1} and {word2}: {cos:.4f}")




Cosine Similarity between Ukraine and Japan: 0.9987


**FINISH** :)