In [105]:
from importlib.metadata import metadata

import nltk
import requests
import numpy as np
from tqdm import tqdm
from typing import Dict, List, Tuple, Optional, Set
import sys
from pathlib import Path
from nltk.corpus import wordnet as wn
from deep_translator import GoogleTranslator
from typing import Dict, List
import time
from pathlib import Path
import csv

class MultilingualEmbeddingExtractor:
    def __init__(self, embeddings_path: str = Path("../data/numberbatch-19.08.txt")):
        self.embeddings_path = Path(embeddings_path)

        self.target_langs = ['de', 'fr', 'es', 'it', 'pt', 'nl', 'pl', 'ru', 
                        'ja', 'ko', 'zh-CN', 'ar', 'hi', 'tr', 'sv']

    def get_translations(self, word: str, source_lang: str = 'en', 
                        target_langs: List[str] = None) -> Dict[str, str]:
        
        if target_langs is None:
            target_langs = self.target_langs[:10]
        
        translations = {source_lang: word}
        
        for lang in target_langs:
            try:
                translator = GoogleTranslator(source=source_lang, target=lang)
                result = translator.translate(word)
                if result:
                    translations[lang] = result.lower()
                time.sleep(0.3)
            except Exception as e:
                print(f"Error translating to {lang}: {e}")
                continue
        
        return translations
    

        

    def load_embeddings_for_words(self, translations: Dict[str, str]) -> Dict[Tuple[str, str], np.ndarray]:
        embeddings = {}
        translations_lower = {lang: word.lower() for lang, word in translations.items()}

        if not self.embeddings_path.exists():
            raise FileNotFoundError(f"Embeddings file not found: {self.embeddings_path}")

        with open(self.embeddings_path, 'r', encoding='utf-8') as f:
            next(f)

            for line in tqdm(f, desc="Searching embeddings", file=sys.stdout):
                parts = line.rstrip().split(' ')
                if len(parts) < 2:
                    continue

                entry = parts[0]
                entry_parts = entry.split('/')

                if len(entry_parts) >= 4:
                    target_lang = entry_parts[2]
                    target_word = entry_parts[3].lower()

                    if target_lang in translations_lower and translations_lower[target_lang] == target_word:
                        vector = np.array([float(x) for x in parts[1:]], dtype=np.float32)
                        embeddings[(target_lang, translations[target_lang])] = vector

                        if len(embeddings) == len(translations):
                            break

        return embeddings

    def get_multilingual_embeddings(self, word: str, source_lang: str = "en") -> Dict[Tuple[str, str], np.ndarray]:
        translations = self.get_translations(word, source_lang)
        if not translations:
            return {}

        print(f"Found translations for '{word}': {translations}")
        embeddings = self.load_embeddings_for_words(translations)

        return embeddings

    def save_to_csv(self, word, embeddings):
        with open(Path(f"../data/embeddings/{word}.csv"), 'w', newline='') as file:
            for line in embeddings.items():
                mdata, emb = line[0], line [1]
                writer = csv.writer(file, delimiter=";")
                row = (*mdata, *[num for num in emb])
                writer.writerow(row)
        
        
def analyze_embedding_consistency(embeddings: Dict[Tuple[str, str], np.ndarray]) -> Dict[str, float]:
    if len(embeddings) < 2:
        return {}

    vectors = list(embeddings.values())
    mean_vector = np.mean(vectors, axis=0)

    consistencies = {}
    for (lang, word), vector in embeddings.items():
        cosine_sim = np.dot(vector, mean_vector) / (np.linalg.norm(vector) * np.linalg.norm(mean_vector))
        consistencies[f"{lang}:{word}"] = float(cosine_sim)

    return consistencies

    


In [124]:
word = "cat"

extractor = MultilingualEmbeddingExtractor()
translations = extractor.get_translations(word)
embeddings = extractor.load_embeddings_for_words(translations)
extractor.save_to_csv(word, embeddings)


Searching embeddings: 7758151it [02:21, 54806.95it/s]
('de', 'idee')
('en', 'idea')
('es', 'idea')
('fr', 'idée')
('it', 'idea')
('ja', 'アイデア')
('ko', '아이디어')
('nl', 'idee')
('pl', 'pomysł')
('pt', 'ideia')
('ru', 'идея')


In [116]:
import os

data = dict()
for file in os.listdir("../data/embeddings"):
    with open(Path("../data/embeddings") / Path(file), "r") as f:
        reader = csv.reader(f)
        data[file.removesuffix('.csv')] = dict()
        for row in reader:
            parsed = row[0].split(';')
            
            data[file.removesuffix('.csv')][(parsed[0], parsed[1])] = np.array(parsed[2:], dtype=np.float32)
            
           

#print(data['cow'])
#print([x.shape for x in vectors])
mean_v_1 = np.mean(list(data['sword'].values()), axis=0)
mean_v_2 = np.mean(list(data['sheep'].values()), axis=0)
cosine_sim = np.dot(mean_v_1, mean_v_2) / (np.linalg.norm(mean_v_1) * np.linalg.norm(mean_v_2))
print(cosine_sim)


0.01663858


In [23]:
import nltk
import os
from nltk.corpus import wordnet as wn
from collections import defaultdict

def get_supersense(word, pos='n'):
    """
    Holt den Supersense für ein Wort
    pos: 'n' für Nomen, 'v' für Verben, etc.
    """
    synsets = wn.synsets(word, pos=pos)
    if synsets:
        # Nimm das erste (häufigste) Synset
        return [x.lexname() for x in synsets]
        
        #return lexname
    return None



while True:
    word = input("Konzept suchen:")
    print(f"{word}: {get_supersense(word)}")

stone: ['noun.object', 'noun.artifact', 'noun.substance', 'noun.substance', 'noun.quantity', 'noun.plant', 'noun.person', 'noun.person', 'noun.person', 'noun.person', 'noun.person', 'noun.person', 'noun.attribute']
stein: ['noun.artifact', 'noun.person']
snake: ['noun.animal', 'noun.person', 'noun.object', 'noun.object', 'noun.artifact']
bill: ['noun.communication', 'noun.communication', 'noun.possession', 'noun.act', 'noun.communication', 'noun.communication', 'noun.communication', 'noun.artifact', 'noun.artifact', 'noun.animal']
horse: ['noun.animal', 'noun.artifact', 'noun.group', 'noun.artifact', 'noun.artifact']
guitar: ['noun.artifact']
baby: ['noun.person', 'noun.person', 'noun.person', 'noun.person', 'noun.person', 'noun.animal', 'noun.act']
star: ['noun.object', 'noun.person', 'noun.object', 'noun.person', 'noun.shape', 'noun.person', 'noun.communication', 'noun.cognition']
tiger: ['noun.person', 'noun.animal']


KeyboardInterrupt: Interrupted by user