In [None]:
import csv
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Set

def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return the resulting object."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [None]:
def collect_entity_names(nodes: List[Dict]) -> Set[str]:
    """Gather the distinct ``name`` values of every node whose ``type`` is ``"Entity"``.

    Nodes with a different (or missing) ``type`` are ignored. An Entity node
    without a ``name`` key raises ``KeyError``.
    """
    names: Set[str] = set()
    for node in nodes:
        if node.get("type") == "Entity":
            names.add(node["name"])
    return names

def extract_time_slot(raw_ts: str) -> str:
    """Reduce an ISO-format timestamp string to its calendar date (``YYYY-MM-DD``).

    Returns ``"unknown"`` for a falsy input, and the input unchanged when it
    cannot be parsed by ``datetime.fromisoformat``.
    """
    if raw_ts:
        try:
            parsed = datetime.fromisoformat(raw_ts)
        except ValueError:
            # Not an ISO timestamp: hand the raw value back untouched.
            return raw_ts
        return parsed.date().isoformat()
    return "unknown"
    
def detect_participants(content: str, entity_names: Set[str]) -> Set[str]:
    """Return the entity names that occur in ``content``.

    Matching is a plain, case-sensitive substring test, so a name that is a
    prefix of a longer word also matches.

    Args:
        content: Message text to scan. ``None`` or an empty string yields an
            empty result instead of raising.
        entity_names: Candidate names. Empty names are skipped, because the
            empty string is a substring of every text and would otherwise be
            reported as a participant of every message.

    Returns:
        The subset of ``entity_names`` found in ``content``.
    """
    if not content:
        return set()
    return {name for name in entity_names if name and name in content}

In [None]:
# Load the raw graph and its schema from JSON.
# NOTE: `data` is rebound to a pandas DataFrame by a later cell (pd.read_csv),
# so cells that call data.get(...) must run before that one.
data = load_json('../data/MC3_graph.json')
schema = load_json('../data/MC3_schema.json')

In [None]:
# Display the loaded graph object.
# NOTE(review): this renders the whole structure; a summary (keys, lengths) is usually enough.
data

In [None]:
# Keys of schema['schema']['nodes'], i.e. the node types declared in the schema file.
nodes_type = schema['schema']['nodes'].keys()

In [None]:
# Display the node types read from the schema.
nodes_type

In [None]:
# Pull the node list out of the graph dict (empty list if the key is absent)
# and collect the unique names of the nodes typed "Entity".
# NOTE: requires `data` to still be the JSON dict, not the DataFrame assigned later.
nodes = data.get("nodes", [])
entity_names = collect_entity_names(nodes)

In [None]:
# Display the set of entity names.
entity_names

In [None]:
############
import pandas as pd

In [None]:
# Load the parsed CSV export.
# NOTE: this rebinds `data`, which earlier cells used for the JSON graph dict.
data = pd.read_csv("../data/MC3_data_parsed.csv")

In [None]:
# Rows whose edge_id equals "Event_Communication_2".
data[data["edge_id"]=="Event_Communication_2"]

In [None]:
# Rows whose edge_id equals "Event_Communication_1".
data[data["edge_id"]=="Event_Communication_1"]

In [None]:
# Distinct values of the edge_name_description column.
(data["edge_name_description"]).unique()

In [None]:
# Keep the distinct descriptions as a plain list; the similarity cells index into it by position.
sentences = list(data["edge_name_description"].unique())


In [None]:
# Display the list of distinct descriptions.
sentences

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 2.1 Vectorize the sentences with TF-IDF
vectorizer = TfidfVectorizer(stop_words="english")  # "english" is the only built-in stop-word list name; pass an explicit word list for other languages (e.g. Spanish)
tfidf = vectorizer.fit_transform(sentences)

# 2.2 Cosine-similarity matrix between every pair of sentences
sim_matrix = cosine_similarity(tfidf)

# 2.3 Find the most similar pairs
#  - skip the diagonal (sim(self)=1) via k=1
#  - "unpack" the upper triangle of the matrix so each pair appears once
idxs, jdxs = np.triu_indices_from(sim_matrix, k=1)
pairs = list(zip(idxs, jdxs, sim_matrix[idxs, jdxs]))

# 2.4 Sort by score (descending) and show the top 10
# NOTE: the loop variables i, j, score remain defined at notebook level after this cell.
top10 = sorted(pairs, key=lambda x: x[2], reverse=True)[:10]
for i, j, score in top10:
    print(f"{score:.3f} →\n 1) {sentences[i]}\n 2) {sentences[j]}\n")


In [None]:
!pip install scikit-learn


In [None]:
# Show the full text of each cell in DataFrame output instead of truncating long strings.
pd.set_option('display.max_colwidth', None)


In [None]:
# Report whether any description ends with the truncation ellipsis.
# This checks the whole list instead of `sentences[i]`: `i` was only defined
# as a leftover loop variable from another cell, so the old check failed on a
# fresh kernel and otherwise inspected an arbitrary sentence.
print(any(str(s).endswith("…") for s in sentences))

In [None]:
import re

# 1) Very basic tokenization
def tokenize(s):
    """Return the set of distinct lowercase tokens (runs of word characters) in ``s``."""
    lowered = s.lower()
    return {match.group(0) for match in re.finditer(r'\w+', lowered)}

# Rebuild the list of distinct descriptions and tokenize each one.
sentences = list(data["edge_name_description"].unique())
token_sets = [tokenize(s) for s in sentences]

# 2) Build the Jaccard similarity for every unordered pair (i < j)
n = len(sentences)
scores = []
for i in range(n):
    for j in range(i+1, n):
        A, B = token_sets[i], token_sets[j]
        # Skip pairs where either sentence has no tokens; this also keeps the union non-empty.
        if not A or not B:
            continue
        jacc = len(A & B) / len(A | B)
        scores.append((i, j, jacc))

# 3) Top-10 most similar pairs
top10 = sorted(scores, key=lambda x: x[2], reverse=True)[:10]
for i, j, sc in top10:
    print(f"{sc:.3f} →\n 1) {sentences[i]}\n 2) {sentences[j]}\n")


In [None]:
# Same ranking as the top-10 listing, extended to the 20 best-scoring pairs in `scores`.
top20 = sorted(scores, key=lambda x: x[2], reverse=True)[:20]

for i, j, sc in top20:
    print(f"{sc:.3f} →\n 1) {sentences[i]}\n 2) {sentences[j]}\n")

In [None]:
import json
from pathlib import Path
from typing import List, Dict, Any

def load_data(path: str) -> List[Dict[str,Any]]:
    """
    Load a ``.json`` file or a ``.jsonl``/``.ndjson`` file (one JSON object per line).

    The suffix is matched case-insensitively. For ``.json`` the parsed document
    is returned as-is; for the line-delimited formats blank lines are skipped
    and a list with one parsed object per remaining line is returned.

    Raises:
        ValueError: if the file suffix is none of the supported ones.
    """
    p = Path(path)
    suffix = p.suffix.lower()

    if suffix == ".json":
        # Standard JSON document
        with p.open("r", encoding="utf-8") as f:
            return json.load(f)

    if suffix in (".jsonl", ".ndjson"):
        # JSONL/NDJSON: one JSON object per non-blank line
        with p.open("r", encoding="utf-8") as f:
            stripped_lines = (raw.strip() for raw in f)
            return [json.loads(text) for text in stripped_lines if text]

    raise ValueError(f"Formato no soportado: {p.suffix}")

if __name__ == "__main__":
    # 1) Path to the JSON or JSONL file
    archivo = '../data/MC3_graph.json'

    # 2) Load the records
    registros = load_data(archivo)

    # 3) Extract the full sentences.
    #    The graph file's root is a dict (keys such as 'nodes' and 'edges'), so
    #    iterating `registros` directly would walk its string keys and turn the
    #    `in` test below into a substring check. Flatten the list-valued entries
    #    of a dict root first, and only inspect items that are dicts themselves.
    if isinstance(registros, dict):
        candidatos = [
            item
            for value in registros.values()
            if isinstance(value, list)
            for item in value
        ]
    else:
        candidatos = registros
    sentences = [
        rec["edge_name_description"]
        for rec in candidatos
        if isinstance(rec, dict) and "edge_name_description" in rec
    ]

    # 4) Eyeball the first few to check they no longer end in the truncation character “…”
    for s in sentences[:5]:
        print(s)


In [None]:
# Display the sentences extracted from the JSON records.
sentences

In [None]:
import json
from pathlib import Path

def load_json(path):
    """Open ``path`` as UTF-8 text and return the JSON document it contains."""
    with open(path, mode='r', encoding='utf-8') as source:
        parsed = json.load(source)
    return parsed

if __name__ == "__main__":
    # Load the graph file; `graph` stays defined at notebook level for later cells.
    archivo = Path("../data/MC3_graph.json")
    graph = load_json(archivo)

    # 1) What type of object is it?
    print(type(graph))           # dict, list, etc.

    # 2) If it is a dict, which keys does it have?
    if isinstance(graph, dict):
        print("Claves en graph:", graph.keys())

    # 3) Inspect a small sample
    #    Adjust to whichever key actually holds the list of edges
    sample = None
    if isinstance(graph, dict) and "edges" in graph:
        sample = graph["edges"][:3]
    elif isinstance(graph, dict) and "links" in graph:
        sample = graph["links"][:3]
    elif isinstance(graph, list):
        sample = graph[:3]
    print("Muestra de registros:", sample)


In [None]:
# Display the object returned by load_data.
# NOTE(review): this renders the whole loaded structure; consider a summary instead.
registros

In [None]:
# 1) Extract the text of every edge whose sub_type is "Communication"
# NOTE(review): a later cell reads the Communication events from graph["nodes"];
# confirm whether any edge actually carries this sub_type.
communications = [
    edge["content"]
    for edge in graph.get("edges", [])
    if edge.get("sub_type") == "Communication" and "content" in edge
]

# 2) Print them, numbered from 001 (or return / reuse them as needed)
for idx, text in enumerate(communications, 1):
    print(f"{idx:03d}: {text}")


In [None]:
# The graph root has no "data" key (its keys are 'directed', 'multigraph',
# 'graph', 'nodes', 'edges'), and the Communication events are stored as
# nodes, so walk graph["nodes"] instead of graph["data"]["links"].
# Nodes without a "content" field are skipped rather than raising KeyError.
for node in graph.get("nodes", []):
    if node.get("sub_type") == "Communication" and "content" in node:
        print(node["content"])

In [None]:
import json
from pathlib import Path
from typing import Any, Dict, List, Union

def load_json(path: Union[str, Path]) -> Union[Dict[str, Any], List[Any]]:
    """Load a standard JSON file and return the parsed dict or list.

    Args:
        path: Location of the file, as a ``str`` or a ``Path``. Strings are
            accepted because other cells in this notebook call ``load_json``
            with plain string paths, and this definition replaces theirs once
            the cell has run.

    Returns:
        The parsed JSON document.
    """
    with Path(path).open("r", encoding="utf-8") as f:
        return json.load(f)

def main():
    """Load the graph file, report its structure and print Communication texts found among its edges.

    Edge records are taken from ``graph["edges"]`` when present, otherwise from
    ``graph["graph"]["edges"]`` when that nested dict exists; if neither is
    found, nothing is filtered.
    """
    graph = load_json(Path("../data/MC3_graph.json"))
    root_is_dict = isinstance(graph, dict)

    # 1) Describe the root object
    print("Tipo de graph:", type(graph))
    if root_is_dict:
        print("Claves en root:", list(graph.keys()))
    print()

    # 2) Report which of the usual container keys exist at the root
    if root_is_dict:
        for candidate in ("edges", "links", "graph"):
            if candidate in graph:
                print(f"Encontré lista '{candidate}' con {len(graph[candidate])} elementos")
    print()

    # 3) Locate the edge records: top level first, then nested under "graph"
    records = []
    if root_is_dict:
        if "edges" in graph:
            records = graph["edges"]
        else:
            nested = graph.get("graph")
            if isinstance(nested, dict) and "edges" in nested:
                records = nested["edges"]

    print("Número total de registros que vamos a filtrar:", len(records))

    # 4) Keep the content of the Communication records
    communications = []
    for record in records:
        if record.get("sub_type") == "Communication" and "content" in record:
            communications.append(record["content"])

    print("Comunications encontradas:", len(communications))
    for idx, text in enumerate(communications, start=1):
        print(f"{idx:03d}: {text}")


if __name__ == "__main__":
    main()



In [3]:
import json
from pathlib import Path
from typing import Any, Dict, List, Union

def load_json(path: Union[str, Path]) -> Union[Dict[str, Any], List[Any]]:
    """Load a standard JSON file and return it as a dict or list.

    Args:
        path: Location of the file, as a ``str`` or a ``Path``. Strings are
            accepted because other cells in this notebook call ``load_json``
            with plain string paths, and this definition replaces theirs once
            the cell has run.

    Returns:
        The parsed JSON document.
    """
    with Path(path).open("r", encoding="utf-8") as f:
        return json.load(f)

# Runs at cell level (not inside a main() function), so `graph`,
# `communications` and `monitorings` stay defined for the following cells.
# 1) Path to the .json file
archivo = Path("../data/MC3_graph.json")
graph = load_json(archivo)

# 2) Quick structure check
print("Claves raíz:", list(graph.keys()))
print("→ #nodes:", len(graph.get("nodes", [])))
print("→ #edges:", len(graph.get("edges", [])))
print()

# 3) Read from `nodes`, since that is where the events are stored
eventos = graph.get("nodes", [])

# 4) Collect the text of the Communication and Monitoring events
communications = [
    ev["content"]
    for ev in eventos
    # keep only Communication nodes that actually carry a "content" field
    if ev.get("sub_type") == "Communication" and "content" in ev
]
monitorings = [
    ev["findings"]
    for ev in eventos
    # keep only Monitoring nodes that actually carry a "findings" field
    if ev.get("sub_type") == "Monitoring" and "findings" in ev
]

# 5) Show the results, numbered from 001
print(f"Encontré {len(communications)} comunicaciones:")
for idx, txt in enumerate(communications, 1):
    print(f"{idx:03d}: {txt}")
print()
print(f"Encontré {len(monitorings)} informes de Monitoring:")
for idx, txt in enumerate(monitorings, 1):
    print(f"{idx:03d}: {txt}")


Claves raíz: ['directed', 'multigraph', 'graph', 'nodes', 'edges']
→ #nodes: 1159
→ #edges: 3226

Encontré 584 comunicaciones:
001: Hey The Intern, it's The Lookout! Just spotted a pod of dolphins near the eastern point this morning. They were so playful! If you're free this weekend, the migratory birds are starting to arrive too. Let me know if you want to join for some birdwatching!
002: Hey The Lookout, The Intern here! I'd absolutely love to join you for birdwatching this weekend! Those dolphins sound amazing. What time were you thinking? I'll bring my new binoculars that Mrs. Money helped me pick out.
003: Sam, it's Kelly! Let's meet at Sunrise Point at 7 AM for birdwatching. Bring your new binoculars and some water. I've heard there might be some rare shorebirds passing through this weekend. Can't wait!
004: Mrs. Money, it's The Intern. Just checking in to see what tasks you need help with today. Also, I'll be birdwatching with The Lookout this weekend. Should I reschedule if you

In [4]:
# Display the list of Communication texts.
communications

["Hey The Intern, it's The Lookout! Just spotted a pod of dolphins near the eastern point this morning. They were so playful! If you're free this weekend, the migratory birds are starting to arrive too. Let me know if you want to join for some birdwatching!",
 "Hey The Lookout, The Intern here! I'd absolutely love to join you for birdwatching this weekend! Those dolphins sound amazing. What time were you thinking? I'll bring my new binoculars that Mrs. Money helped me pick out.",
 "Sam, it's Kelly! Let's meet at Sunrise Point at 7 AM for birdwatching. Bring your new binoculars and some water. I've heard there might be some rare shorebirds passing through this weekend. Can't wait!",
 "Mrs. Money, it's The Intern. Just checking in to see what tasks you need help with today. Also, I'll be birdwatching with The Lookout this weekend. Should I reschedule if you need me for anything?",
 "Boss, it's Mrs. Money. I've reviewed our operational funding for the upcoming projects. Need to discuss al

In [5]:
# Number of Communication texts collected.
len(communications)

584

In [6]:
# Install sentence-transformers by invoking pip through the interpreter that
# runs this kernel (sys.executable), instead of a bare `!pip install`.
import sys
!{sys.executable} -m pip install sentence-transformers --quiet


In [7]:
from sentence_transformers import SentenceTransformer, util

# Embed every communication and compute the pairwise cosine-similarity matrix.
model      = SentenceTransformer('all-MiniLM-L6-v2')
emb        = model.encode(communications, convert_to_tensor=True)
sim_matrix = util.cos_sim(emb, emb)

import torch
n = sim_matrix.size(0)
sim_np = sim_matrix.cpu().numpy()
# Upper triangle without the diagonal: every unordered pair exactly once.
idxs, jdxs = torch.triu_indices(n, n, offset=1)
# Convert the index tensors to plain Python ints once, so `scores` does not
# hold one 0-d tensor per index and downstream list indexing uses ints.
scores = [(i, j, sim_np[i, j]) for i, j in zip(idxs.tolist(), jdxs.tolist())]
# Ten highest-scoring pairs. A score of 1.000 between two texts typically means
# the same message appears more than once in `communications`.
top10  = sorted(scores, key=lambda x: x[2], reverse=True)[:10]
for i, j, sc in top10:
    print(f"{sc:.3f}: {communications[i]} ↔ {communications[j]}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

1.000: Davis here to V. Miesel Shipping. Crew reallocation from Remora to Neptune underway as requested. Will maintain confidentiality regarding permit #CR-7844. Security team arrives 0600 tomorrow. Meeting Nadia tonight to finalize documentation. ↔ Davis here to V. Miesel Shipping. Crew reallocation from Remora to Neptune underway as requested. Will maintain confidentiality regarding permit #CR-7844. Security team arrives 0600 tomorrow. Meeting Nadia tonight to finalize documentation.
1.000: Boss, The Accountant here. Conservation vessels deploying underwater microphones around Nemo Reef. Meeting V. Miesel management at 10AM tomorrow. The Middleman confirms our 0500 meeting with enhanced encryption. Southwest clearance urgent after council pressure increased. ↔ Boss, The Accountant here. Conservation vessels deploying underwater microphones around Nemo Reef. Meeting V. Miesel management at 10AM tomorrow. The Middleman confirms our 0500 meeting with enhanced encryption. Southwest clear

In [15]:
import pandas as pd
pd.set_option('display.max_columns', None)       # Show every column
pd.set_option('display.max_colwidth', None)      # No limit on column width
pd.set_option('display.width', None)             # Do not fix the total display width
pd.set_option('display.max_rows', None)          # Show every row

In [10]:
# Re-rank `scores` and keep the 20 best pairs.
# NOTE: the name `top10` is kept because the DataFrame cell reads it, but it now holds 20 entries.
top10  = sorted(scores, key=lambda x: x[2], reverse=True)[:20]
for i, j, sc in top10:
    print(f"{sc:.3f}: {communications[i]} ↔ {communications[j]}")

1.000: Davis here to V. Miesel Shipping. Crew reallocation from Remora to Neptune underway as requested. Will maintain confidentiality regarding permit #CR-7844. Security team arrives 0600 tomorrow. Meeting Nadia tonight to finalize documentation. ↔ Davis here to V. Miesel Shipping. Crew reallocation from Remora to Neptune underway as requested. Will maintain confidentiality regarding permit #CR-7844. Security team arrives 0600 tomorrow. Meeting Nadia tonight to finalize documentation.
1.000: Boss, The Accountant here. Conservation vessels deploying underwater microphones around Nemo Reef. Meeting V. Miesel management at 10AM tomorrow. The Middleman confirms our 0500 meeting with enhanced encryption. Southwest clearance urgent after council pressure increased. ↔ Boss, The Accountant here. Conservation vessels deploying underwater microphones around Nemo Reef. Meeting V. Miesel management at 10AM tomorrow. The Middleman confirms our 0500 meeting with enhanced encryption. Southwest clear

In [13]:
# Tabulate the ranked pairs: one row per pair with its score and both message texts.
# Relies on `top10` holding (i, j, score) tuples whose i/j index into `communications`.
df_similitudes = pd.DataFrame({
    'score': [sc for i, j, sc in top10],
    'mensaje_1': [communications[i] for i, j, sc in top10],
    'mensaje_2': [communications[j] for i, j, sc in top10]
})



In [16]:
# Display the table of most similar message pairs.
df_similitudes

Unnamed: 0,score,mensaje_1,mensaje_2
0,1.0,Davis here to V. Miesel Shipping. Crew reallocation from Remora to Neptune underway as requested. Will maintain confidentiality regarding permit #CR-7844. Security team arrives 0600 tomorrow. Meeting Nadia tonight to finalize documentation.,Davis here to V. Miesel Shipping. Crew reallocation from Remora to Neptune underway as requested. Will maintain confidentiality regarding permit #CR-7844. Security team arrives 0600 tomorrow. Meeting Nadia tonight to finalize documentation.
1,1.0,"Boss, The Accountant here. Conservation vessels deploying underwater microphones around Nemo Reef. Meeting V. Miesel management at 10AM tomorrow. The Middleman confirms our 0500 meeting with enhanced encryption. Southwest clearance urgent after council pressure increased.","Boss, The Accountant here. Conservation vessels deploying underwater microphones around Nemo Reef. Meeting V. Miesel management at 10AM tomorrow. The Middleman confirms our 0500 meeting with enhanced encryption. Southwest clearance urgent after council pressure increased."
2,1.0,"Rodriguez, Davis here. Maintain current position with Mako at Nemo Reef. Security team arriving at 0600. Continue operating under permit #CR-7844 and keep tourism appearance. I'll update after meeting with Nadia tonight.","Rodriguez, Davis here. Maintain current position with Mako at Nemo Reef. Security team arriving at 0600. Continue operating under permit #CR-7844 and keep tourism appearance. I'll update after meeting with Nadia tonight."
3,1.0,"Mrs. Money, this is Middleman. I've redirected Council's attention to other areas. Meeting with Boss tonight at 9PM to discuss new focus. Have you secured those alternative funding channels you mentioned? Need status update.","Mrs. Money, this is Middleman. I've redirected Council's attention to other areas. Meeting with Boss tonight at 9PM to discuss new focus. Have you secured those alternative funding channels you mentioned? Need status update."
4,0.958649,"Mrs. Money, The Intern reporting. Jensen from City Council signed the Nemo Reef permit yesterday. They're creating a corridor specifically for Mako vessel. Checking if this overlaps with V. Miesel shipping lanes as requested. Will have full report tomorrow.","Boss, Mrs. Money here. Jensen from City Council approved the Nemo Reef permit. The Intern confirms special corridor creation for Mako vessel. This could interfere with our V. Miesel shipping arrangements. Recommend reassessing tonight's strategy at 9PM meeting."
5,0.951462,"Himark Harbor, Mako here. Safely docked at your location. Crew performing routine maintenance after yesterday's trip. Will require expedited departure clearance once our special permits arrive. Captain Davis expects documentation shortly from management.","Captain Davis, Mako here. We're fully docked at Himark Harbor with maintenance underway. The crew is discussing yesterday's trip while preparing for quick departure once permits arrive. Harbor confirms our expedited clearance is approved. The Boss seems confident this time."
6,0.947648,"Elise, Liam here. Conservation vessels found nothing at Nemo Reef. Council meeting tomorrow with Sentinel - I'll deflect concerns. V. Miesel appears to be expanding operations in that area. Will bring encryption tech to our 0500 meeting.","Boss, The Accountant here. Conservation vessels found nothing concerning at Nemo Reef according to The Middleman. He's attending council meeting with Sentinel tomorrow and will deflect. Bringing encryption tech to our 0500 meeting. V. Miesel definitely expanding in target area."
7,0.939498,"This is Paackland Harbor to Oceanus City Council. Reporting increased tourism vessel activity near protected areas, specifically Nemo Reef. Have issued standard reminders about conservation regulations. Recommend additional signage or patrol consideration for upcoming tourism season.","Liam, this is Oceanus City Council. We've received reports from Paackland Harbor about increased tourism vessels near Nemo Reef. Please coordinate with harbor masters to enhance conservation patrols and consider temporary additional signage for protected areas."
8,0.939296,Himark Harbor to Neptune. Your berth has been reserved for tomorrow's Nemo Reef operations. We're logging unusual activity with expedited permits for multiple vessels. Please confirm your estimated arrival time and equipment requirements.,"Himark Harbor to Neptune. Your berth for Nemo Reef operations is confirmed. Given increased activity, we're extending docking hours. Please provide arrival time and equipment specifications for our log. Several vessels already departing with expedited permits."
9,0.934575,"Davis, Mako here. Preparing for departure from Himark Harbor. Executive client has arrived and joined the Sailors team. Will maintain minimal radio communication as instructed. Transponder active. Awaiting your final clearance.","Davis, Mako here. We've cleared Himark Harbor with the executive client aboard. Transponder active but maintaining radio silence as instructed. Sailors team is ready for operations. Position is secure. Awaiting further directives."
