In [None]:

import os
import json
from sentence_transformers import SentenceTransformer

In [6]:
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

In [2]:

def generate_embeddings_from_folder(folder_path, output_json="embeddings.json"):
    model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    data = []

    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".txt"):
            node_id = filename
            with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
                text = f.read().strip()
                if not text:
                    continue  # Skip empty files
                embedding = model.encode(text).tolist()
                data.append({
                    "node_id": node_id,
                    "text": text,
                    "embedding": embedding
                })

    with open(output_json, "w", encoding="utf-8") as out_f:
        json.dump(data, out_f, indent=2, ensure_ascii=False)

    print(f"✅ Embedded {len(data)} texts and saved to {output_json}")


In [3]:


generate_embeddings_from_folder("mistici_hoti_nebuni", "mistici_hoti_nebuni_node_embeddings.json")


✅ Embedded 103 texts and saved to mistici_hoti_nebuni_nodes_embeddings.json


In [4]:

wildcard = "Diane Rothenberg [NOT] feeling left out of a Paradise of Poets"

In [7]:

wildcard_embedding = model.encode(wildcard, convert_to_tensor=True)

In [8]:

import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def build_multidigraph_from_embeddings(json_path, wildcard_embedding, threshold_1=0.50, threshold_2=0.75):
    # Load node data
    with open(json_path, "r", encoding="utf-8") as f:
        nodes = json.load(f)

    G = nx.MultiDiGraph()

    # Prepare embeddings as array
    node_ids = [node["node_id"] for node in nodes]
    embeddings = np.array([node["embedding"] for node in nodes])
    wildcard_emb = np.array(wildcard_embedding).reshape(1, -1)

    # Compute cosine similarity matrix
    sim_matrix = cosine_similarity(embeddings)
    wildcard_sims = cosine_similarity(embeddings, wildcard_emb).flatten()

    # Add nodes
    for i, node in enumerate(nodes):
        G.add_node(node["node_id"], text=node["text"], embedding=node["embedding"], wildcard_sim=wildcard_sims[i])

    # --- Type 1 Edges: Similar pairs above threshold_1 ---
    for i in range(len(nodes)):
        for j in range(len(nodes)):
            if i != j and sim_matrix[i][j] >= threshold_1:
                # Decide edge source: more similar to wildcard
                source = node_ids[i] if wildcard_sims[i] >= wildcard_sims[j] else node_ids[j]
                target = node_ids[j] if source == node_ids[i] else node_ids[i]
                G.add_edge(source, target, weight=1, type="embedding_sim")

    # --- Type 2 Edges: Both nodes must be similar to wildcard ---
    wildcard_above = [i for i, sim in enumerate(wildcard_sims) if sim >= threshold_2]
    for i in wildcard_above:
        for j in wildcard_above:
            if i < j:
                source = node_ids[i] if wildcard_sims[i] >= wildcard_sims[j] else node_ids[j]
                target = node_ids[j] if source == node_ids[i] else node_ids[i]
                G.add_edge(source, target, weight=2, type="wildcard_cluster")

    print(f"✅ Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
    return G


In [10]:

wildcard_embedding = model.encode(wildcard, convert_to_numpy=True)

In [11]:


jerry_mistici_hoti_nebuni_plus_wildcard_graph = build_multidigraph_from_embeddings("mistici_hoti_nebuni_node_embeddings.json", wildcard_embedding=wildcard_embedding)

✅ Graph created with 103 nodes and 1102 edges.


In [14]:

jerry_mistici_hoti_nebuni_plus_wildcard_graph_1 = \
build_multidigraph_from_embeddings("mistici_hoti_nebuni_node_embeddings.json", wildcard_embedding=wildcard_embedding, threshold_1=0.50, threshold_2=0.50)

✅ Graph created with 103 nodes and 1117 edges.


In [15]:

import pickle

In [16]:

with open('margento_jerry_mistici_hoti_nebuni_plus_wildcard_graph.gpickle', 'wb') as f:
    pickle.dump(jerry_mistici_hoti_nebuni_plus_wildcard_graph_1, f, pickle.HIGHEST_PROTOCOL)