In [1]:
import csv
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Set

def load_json(file_path):
    """
    Load a JSON file and return its content as a Python dictionary.

    :param file_path: Path to the JSON file.
    :return: Dictionary containing the JSON data.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

## Reading data

In [2]:
data = load_json('../data/MC3_graph.json')
schema = load_json('../data/MC3_schema.json')

In [3]:
nodes_type = schema['schema']['nodes'].keys()

## Parsing data for PAOHVIS

In [5]:
def collect_entity_names(nodes: List[Dict]) -> Set[str]:
    """Return a set with the *unique* names of all entities."""
    return {n["name"] for n in nodes if n.get("type") == "Entity"}

def extract_time_slot(raw_ts: str) -> str:
    """Convert full timestamp to a coarse *time‑slot* accepted by PAOHVis.

    Currently we keep the **date** part (`YYYY‑MM‑DD`). If `raw_ts` is null or
    malformed, we return "unknown" so the record still loads.
    """
    if not raw_ts:
        return "unknown"
    try:
        return datetime.fromisoformat(raw_ts).date().isoformat()
    except ValueError:
        return raw_ts
    
def detect_participants(content: str, entity_names: Set[str]) -> Set[str]:
    """Return the subset of `entity_names` that occur inside `content`."""
    participants = {name for name in entity_names if name in content}
    return participants

In [14]:
nodes = data.get("nodes", [])
edges = data.get("edges", [])
entity_names = collect_entity_names(nodes)

In [None]:
# Create a CSV file with the required columns
output_file = '../data/MC3_data_parsed.csv'
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['edge_id', 'node_name', 'time_slot', 'edge_name_description', 'group_name', 'role']
    writer = csv.writer(csvfile)
    writer.writerow(fieldnames)
    for event in nodes:
            if event.get("type") == "Event" and event.get("sub_type") == "Communication":
                edge_id = event.get("id")
                time_slot = extract_time_slot(event.get("timestamp"))
                content = event.get("content", "")
                # A short description—trim content to first 60 chars
                edge_desc = content[:60] + ("…" if len(content) > 60 else "")

                participants = detect_participants(content, entity_names)
                if not participants:
                    # Fallback: unknown sender/receiver – still output a single row
                    writer.writerow([edge_id, "_unknown_", time_slot, edge_desc, "", ""])
                else:
                    for name in participants:
                        writer.writerow([edge_id, name, time_slot, edge_desc, "", ""])
print(f"Data parsed and saved to {output_file}")

In [17]:
edges

[{'id': '2',
  'is_inferred': True,
  'source': 'Sam',
  'target': 'Relationship_Suspicious_217'},
 {'id': '3',
  'type': 'sent',
  'is_inferred': False,
  'source': 'Sam',
  'target': 'Event_Communication_370'},
 {'id': '5',
  'is_inferred': True,
  'source': 'Sam',
  'target': 'Event_Assessment_600'},
 {'id': '3013',
  'is_inferred': True,
  'source': 'Sam',
  'target': 'Relationship_Colleagues_430'},
 {'is_inferred': True, 'source': 'Sam', 'target': 'Relationship_Friends_272'},
 {'is_inferred': True,
  'source': 'Sam',
  'target': 'Relationship_Colleagues_215'},
 {'is_inferred': True,
  'source': 'Sam',
  'target': 'Relationship_Colleagues_431'},
 {'id': '8',
  'type': 'sent',
  'is_inferred': False,
  'source': 'Kelly',
  'target': 'Event_Communication_3'},
 {'id': '10',
  'type': 'sent',
  'is_inferred': False,
  'source': 'Kelly',
  'target': 'Event_Communication_443'},
 {'is_inferred': True,
  'source': 'Kelly',
  'target': 'Relationship_Friends_272'},
 {'is_inferred': True,
  '

## Question 3

1. Expanding upon your prior visual analytics, determine who is using pseudonyms to communicate, and what these pseudonyms are.
2. Describe how your visualization makes it easier to identify common entities in the data.
3. How does your understanding of activities change given your understanding of pseudonyms

In [27]:
from openai import OpenAI
import numpy as np
import pandas as pd

In [32]:
import openai
openai.__version__

'1.87.0'

In [13]:
def cosine(u: np.ndarray, v: np.ndarray) -> float:
    """Cosine similarity for 1‑D row vectors"""
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def json_stream(path):
    """Yield JSON objects (one per line or list) from path."""
    with open(path, "r", encoding="utf-8") as fh:
        first_char = fh.read(1)
        fh.seek(0)
        if first_char == "[":  # entire file is a JSON array
            data = json.load(fh)
            for obj in data:
                yield obj
        else:  # assume JSON Lines
            for line in fh:
                yield json.loads(line)

In [18]:
def get_participants_communications(event_id: str, nodes, edges):
    sent_edge = next((e for e in edges if e.get('type') == 'sent' and e.get('target') == event_id), None)
    # Find the 'received' edge where source is the event
    received_edge = next((e for e in edges if e.get('type') == 'received' and e.get('source') == event_id), None)

    if not sent_edge or not received_edge:
        return None, None

    source_entity_id = sent_edge['source']
    target_entity_id = received_edge['target']

    source_node = next((n for n in nodes if n.get('id') == source_entity_id), None)
    target_node = next((n for n in nodes if n.get('id') == target_entity_id), None)

    return source_node, target_node


In [19]:
print("[+] Loading graph …")
messages = []
for event in nodes:
    if event.get("type") == "Event" and event.get("sub_type") == "Communication": # We concentrate on the communications
        event_id = event.get("id")
        time_slot = event.get("timestamp")
        content = event.get("content", "")
        participants = get_participants_communications(event_id, nodes, edges)
        messages.append(
            {
                "event_id": event_id,
                "datetime": time_slot,
                "content": content,
                "source": participants[0],
                "target": participants[1] 
            }
        )
        

[+] Loading graph …


In [21]:
len(messages)

584

In [33]:
messages_df = pd.DataFrame(messages)

In [35]:
messages_df['word_count'] = messages_df['content'].apply(lambda x: len(x.split()))

In [36]:
messages_df

Unnamed: 0,event_id,datetime,content,source,target,word_count
0,Event_Communication_1,2040-10-01 08:09:00,"Hey The Intern, it's The Lookout! Just spotted...","{'type': 'Entity', 'label': 'The Lookout', 'na...","{'type': 'Entity', 'label': 'The Intern', 'nam...",46
1,Event_Communication_2,2040-10-01 08:10:00,"Hey The Lookout, The Intern here! I'd absolute...","{'type': 'Entity', 'label': 'The Intern', 'nam...","{'type': 'Entity', 'label': 'The Lookout', 'na...",37
2,Event_Communication_3,2040-10-01 08:13:00,"Sam, it's Kelly! Let's meet at Sunrise Point a...","{'type': 'Entity', 'label': 'Kelly', 'name': '...","{'type': 'Entity', 'label': 'Sam', 'name': 'Sa...",34
3,Event_Communication_5,2040-10-01 08:16:00,"Mrs. Money, it's The Intern. Just checking in ...","{'type': 'Entity', 'label': 'The Intern', 'nam...","{'type': 'Entity', 'label': 'Mrs. Money', 'nam...",35
4,Event_Communication_6,2040-10-01 08:19:00,"Boss, it's Mrs. Money. I've reviewed our opera...","{'type': 'Entity', 'label': 'Mrs. Money', 'nam...","{'type': 'Entity', 'label': 'Boss', 'name': 'B...",31
...,...,...,...,...,...,...
579,Event_Communication_994,2040-10-14 12:50:00,"Green Guardians, Oceanus City Council here. Yo...","{'type': 'Entity', 'label': 'Oceanus City Coun...","{'type': 'Entity', 'label': 'Green Guardians',...",36
580,Event_Communication_997,2040-10-14 12:51:00,"EcoVigil, Green Guardians HQ here. Your video ...","{'type': 'Entity', 'label': 'Green Guardians',...","{'type': 'Entity', 'label': 'EcoVigil', 'name'...",39
581,Event_Communication_999,2040-10-14 13:31:00,Defender to Mako. Be advised that conservation...,"{'type': 'Entity', 'label': 'Defender', 'name'...","{'type': 'Entity', 'label': 'Mako', 'name': 'M...",34
582,Event_Communication_1001,2040-10-14 13:34:00,"Knowles, Mako here. Proceed to southern dock a...","{'type': 'Entity', 'label': 'Knowles', 'name':...","{'type': 'Entity', 'label': 'Davis', 'name': '...",37


In [38]:
mean_word_count = messages_df['word_count'].mean()
print(f"Mean word count in messages: {mean_word_count:.2f}")

Mean word count in messages: 35.53


## Generating Embeddings using Local model

In [39]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

# 2. Sentence-Transformers & friends
%pip install -U sentence-transformers scikit-learn tqdm


Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting torch
  Downloading https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp312-cp312-linux_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp312-cp312-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp312-cp312-linux_x86_64.whl.metadata (6.6 kB)
Collecting filelock (from torch)
  Using cached https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting networkx (from torch)
  Using cached https://download.pytorch.org/whl/networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting jinja2 (from torch)
  Using cached https://download.pytorch.org/whl/Jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting fsspec (from torch)
  Using cached https://download.pytorch.org/whl/fsspec-2024.6.1-py3-none-any.whl.metadata

In [None]:
import torch
print(torch.cuda.is_available())

True
