In [None]:

# Synthetic log simulator for #GraphPoem (A4, B1) - streamlined and defensive
# This script regenerates synthetic logs and outputs of #graphpoem @ dhsi23.

import json, random, math, os
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

random.seed(42)
np.random.seed(42)

# Anchors as extracted from 
# https://www.taylorfrancis.com/chapters/edit/10.4324/9781003320838-3/dynamical-systems-interplatform-intermediality-chris-tanasescu?context=ubx&refId=a352b9ec-5706-43ce-84c4-827290868465 
anchors = [
    (("7:09","7:27"), ("8:04","8:55")),
    (("17:36","18:43"), (None, None)),
    (("1:08:04","1:08:18"), ("1:08:24","1:09:00")),
    (("1:09:45","1:10:36"), ("1:11:07","1:12:16")),
]

def mmss_to_seconds(s):
    if s is None: return None
    parts = s.split(':')
    if len(parts) == 2:
        return int(parts[0]) * 60 + int(parts[1])
    elif len(parts) == 3:
        return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
    else:
        return int(parts[0])

event_windows = []
trail_windows = []
for ev, tr in anchors:
    ev_start = mmss_to_seconds(ev[0]); ev_end = mmss_to_seconds(ev[1])
    event_windows.append((ev_start, ev_end))
    if tr[0] is None:
        trail_windows.append(None)
    else:
        trail_windows.append((mmss_to_seconds(tr[0]), mmss_to_seconds(tr[1])))

# Horizon
max_time = 0

for s,e in event_windows:
    if e and e>max_time: max_time = e
for tr in trail_windows:
    if tr and tr[1] and tr[1]>max_time: max_time = tr[1]
        
HORIZON = max_time + 600
DT = 10
time_points = np.arange(0, HORIZON+DT, DT)

# Users
twitter_users = [f"tw_user_{i+1}" for i in range(11)]
fb_overlap = twitter_users[:10]
fb_only_count = 37 - len(fb_overlap)
fb_users = fb_overlap + [f"fb_user_{i+1}" for i in range(fb_only_count)]
jh_contributors = [f"jh_user_{i+1}" for i in range(101)]
twitter_dormant = [f"tw_dorm_{i+1}" for i in range(200)]

# Poems
# num_poems = 2312
import os

POEM_FOLDER = "poems_#dhsi25"

poems = sorted([f for f in os.listdir(POEM_FOLDER) if f.endswith(".txt")])
NUM_POEMS = len(poems)

placeholder_map = {
    0: {
        "before": "afrikaans_emoji_in_the_scriptorium_slippers_trans_inggs.txt",
        "during": "eva_tizzani_kakeche_from_spanish_trans_david_brunson_poem_1.txt",
        "after": "tan_chee_lay_iv_altering_constituencies_from_chinese_trans_teng_qian_xi_poem_4.txt"
    },
    1: {
        "before": "vítězslav_nezval_a_man_composing_a_selfportrait_out_of_objects_from_czech_trans_stephan_delbos_and_tereza_novická_poem_1.txt",
        "during": "martinus_nijhoff_trans_james_s_holmes_awater_excerpt.txt",
        "after": "w_h_auden_brussels_in_winter.txt"
    },
    2: {
        "before": "anonymous_libra_from_old_trans_gnaomi_siemens_poem_7.txt",
        "during": "margento_after-ovid's-elegy.txt",
        "after": "place_serban-foarta_trans-margento_place-de-la-concorde_paris-&-france-&-bucharest-&-romania.txt"
    },
    3: {
        "before": "place_flavia-teoc_trans-margento_constantinople_byzantium-&-turkey.txt",
        "during": "hayashi_amari_from_scent_of_nanako_from_japanese_trans_jon_holt_poem_1.txt",
        "after": "constantine_p_cavafy_returning_home_from_greece_from_greek_trans_george_economou_poem_2.txt"
    }
}


anchor_positions = np.linspace(0, NUM_POEMS-1, 4)
event_indices = []

for anchor in anchor_positions:
    center_idx = int(anchor)
    before_idx = max(0, center_idx - 2)
    during_idx = center_idx
    after_idx = min(NUM_POEMS-1, center_idx + 2)
    event_indices.append((before_idx, during_idx, after_idx))

# Override poems with your real event poems
for idx_tuple, mapping in zip(event_indices, placeholder_map.values()):
    b, d, a = idx_tuple
    poems[b] = mapping["before"]
    poems[d] = mapping["during"]
    poems[a] = mapping["after"]

In [None]:


poem_seq_path = "margento_#graphpoem_dhsi23_poem_sequence.json"

with open(poem_seq_path, "w") as f:
    json.dump({
        "poems": poems,
        "event_indices": event_indices,
        "event_poems": placeholder_map
    }, f, indent=2)


In [3]:


# Bot schedule
bot_interval = 180
bot_jitter = 5
bot_times = []
t = 0

while t <= HORIZON:
    jitter = random.randint(-bot_jitter, bot_jitter)
    tt = max(0, t + jitter)
    bot_times.append(tt)
    t += bot_interval
    
bot_times = sorted(list(set(bot_times)))

# WE WILL CONTINUE DEVELOPING THE BOT ONCE WE ARE DONE W| JUPYTERHUB SINCE THE BOT TWEETS THE POEMS PROVIDED BY THE CODE THERE

# helpers
def in_any_event(t):
    for s,e in event_windows:
        if s is not None and e is not None and s <= t <= e:
            return True
    return False
    
def in_any_trail(t):
    for tr in trail_windows:
        if tr and tr[0] is not None and tr[1] is not None and tr[0] <= t <= tr[1]:
            return True
    return False
    
def which_event(t):
    for idx,(s,e) in enumerate(event_windows):
        if s is not None and e is not None and s <= t <= e:
            return idx+1
    return None


# Jupyter logs: map poems to times denser near events (A4)
poem_times = {}
base_spacing = HORIZON / len(poems)

for i, pname in enumerate(poems):
    base_t = int(i * base_spacing)
    # find nearest event center
    mindist = 1e9; nearest_ev = None
    for s,e in event_windows:
        center = (s+e)/2.0
        d = abs(base_t - center)
        if d < mindist:
            mindist = d; nearest_ev = (s,e,center)
    if mindist < 0.15 * HORIZON:
        center = int(nearest_ev[2])
        tloc = int(center + random.randint(-30,30) + random.randint(-int(0.05*HORIZON), int(0.05*HORIZON))//10)
    else:
        tloc = base_t + random.randint(-30,30)
    poem_times[pname] = max(0, min(HORIZON, int(tloc)))

jupyter_rows = []
file_id = 0

for pname in poems:
    tloc = poem_times[pname]

    # Set number of contributors based on event/trail/normal regions
    if in_any_event(tloc):
        n_contrib = random.randint(3, 8)
    elif in_any_trail(tloc):
        n_contrib = random.randint(2, 5)
    else:
        n_contrib = random.randint(1, 3)

    # Choose distinct contributors
    chosen_contributors = random.sample(
        jh_contributors, 
        k=min(len(jh_contributors), n_contrib)
    )

    # Create one "save" event per contributor
    for user in chosen_contributors:
        file_id += 1
        jupyter_rows.append({
            "time": int(tloc) + random.randint(0, 40),
            "user": user,
            "action": "save",
            "file": pname,
            "event": which_event(tloc)
        })

df_jh = pd.DataFrame(jupyter_rows)
jh_path = "margento_#graphpoem_dhsi23_synthetic_jupyter_log.csv"
df_jh.sort_values(by="time").to_csv(jh_path, index=False)

# Twitter generation
twitter_rows = []
tweet_records = []
tweet_id = 0

for tt in bot_times:
    tweet_id += 1
    # Ensure we have saved poems before picking
    saved_poems_sorted_by_time = sorted(jupyter_rows, key=lambda r: r["time"])
    # When the bot tweets at time tt, choose the *latest* poem saved before tt
    available_poems = [r["file"] for r in saved_poems_sorted_by_time if r["time"] <= tt]
    if available_poems:
        poem_name = available_poems[-1]  # most recent poem
    else:
        poem_name = random.choice(poems) # fallback
    media_flag = True
    tweet_meta = {"tweet_id": f"t{tweet_id:05d}", "time": int(tt), "poem": poem_name, "media": media_flag}
    tweet_records.append(tweet_meta)
    base_likers = random.sample(twitter_users, k=random.randint(1,3))
    base_rts = random.sample(twitter_users, k=random.choice([0,1]))
    base_dorm_likes = random.sample(twitter_dormant, k=random.randint(0,2))
    if in_any_event(tt) or in_any_trail(tt):
        if in_any_event(tt):
            extra = random.randint(4,12)
            extra_rts = random.randint(2,6)
        else:
            extra = random.randint(2,6)
            extra_rts = random.randint(1,3)
        extra_likers = random.sample(twitter_dormant, k=min(len(twitter_dormant), extra))
    else:
        extra_likers = []
        extra_rts = 0
    twitter_rows.append({"time": int(tt), "user":"bot_twitter", "action":"tweet", "object_id": tweet_meta["tweet_id"], "poem": poem_name, "media": media_flag, "event": which_event(tt)})
    for u in base_likers:
        twitter_rows.append({"time": int(tt)+random.randint(0,8), "user":u, "action":"like", "object_id": tweet_meta["tweet_id"], "poem": poem_name, "media": media_flag, "event": which_event(tt)})
    for u in base_dorm_likes:
        twitter_rows.append({"time": int(tt)+random.randint(1,30), "user":u, "action":"like", "object_id": tweet_meta["tweet_id"], "poem": poem_name, "media": media_flag, "event": which_event(tt)})
    for u in base_rts:
        twitter_rows.append({"time": int(tt)+random.randint(0,20), "user":u, "action":"retweet", "object_id": tweet_meta["tweet_id"], "poem": poem_name, "media": media_flag, "event": which_event(tt)})
    for u in extra_likers:
        twitter_rows.append({"time": int(tt)+random.randint(0,40), "user":u, "action":"like", "object_id": tweet_meta["tweet_id"], "poem": poem_name, "media": media_flag, "event": which_event(tt)})
    for _ in range(extra_rts):
        u = random.choice(twitter_users + twitter_dormant)
        twitter_rows.append({"time": int(tt)+random.randint(0,60), "user":u, "action":"retweet", "object_id": tweet_meta["tweet_id"], "poem": poem_name, "media": media_flag, "event": which_event(tt)})

df_tw = pd.DataFrame(twitter_rows)
tw_path = "margento_#graphpoem_dhsi23_synthetic_twitter_log.csv"
df_tw.sort_values(by="time").to_csv(tw_path, index=False)


# Facebook live simulation
facebook_rows = []

for tt in range(0, HORIZON+1, DT):
    if in_any_event(tt) or in_any_trail(tt) or (tt in bot_times):
        if in_any_event(tt):
            num_actions = random.randint(3,12)
        elif in_any_trail(tt):
            num_actions = random.randint(1,6)
        else:
            num_actions = random.randint(0,2)
        for _ in range(num_actions):
            actor = random.choice(fb_users + twitter_users[:5])
            action = random.choices(["like_live","share_live","comment_live"], weights=[0.8,0.15,0.05])[0]
            facebook_rows.append({"time": int(tt)+random.randint(0,30), "user": actor, "action": action, "object":"livestream", "event": which_event(tt)})

df_fb = pd.DataFrame(facebook_rows)
fb_path = "margento_#graphpoem_dhsi23_synthetic_facebook_log.csv"
df_fb.sort_values(by="time").to_csv(fb_path, index=False)

with open("margento_#graphpoem_dhsi23_facebook_post_event_summary.json", "w") as f:
    json.dump({"viewers_2weeks":11500,"likes":244,"shares":3,"comments":11}, f, indent=2)


In [9]:

# Event descriptors
import random

event_descriptors = []

for ei, ((s, e), tr) in enumerate(zip(event_windows, trail_windows), start=1):
    # Twitter slice
    tw_slice = df_tw[(df_tw["time"] >= s) & (df_tw["time"] <= e)]
    num_tw_likes = int((tw_slice["action"] == "like").sum())
    num_tw_rts = int((tw_slice["action"] == "retweet").sum())
    
    # Facebook slice
    fb_slice = df_fb[(df_fb["time"] >= s) & (df_fb["time"] <= e)]
    num_fb_likes = int((fb_slice["action"] == "like_live").sum())
    num_fb_shares = int((fb_slice["action"] == "share_live").sum())
    
    # JupyterHub slice
    jh_slice = df_jh[(df_jh["time"] >= s) & (df_jh["time"] <= e)]
    num_jh_saves = len(jh_slice)
    contributors = sorted(list(set(jh_slice["user"].tolist())))
    
    # All poems in the window
    poems_in_window = jh_slice["file"].tolist()
    
    # Only the latest file per unique time
    poems_in_window_selected = (
        jh_slice.sort_values("time")  # ensure sorted by time
                .groupby("time", as_index=False)  # group by time
                .last()  # take last file for each time
                ["file"]
                .tolist()
    )
    
    # Build descriptor
    descriptor = {
        "event_index": ei,
        "event_window": (s, e),
        "trail_window": tr,
        "num_tw_likes": num_tw_likes,
        "num_tw_rts": num_tw_rts,
        "num_fb_likes": num_fb_likes,
        "num_fb_shares": num_fb_shares,
        "num_jh_saves": num_jh_saves,
        "contributors": contributors,
        "poems_in_window": poems_in_window,
        "poems_in_window_selected": poems_in_window_selected,
        "highlighted_poems_before_during_after": placeholder_map[ei-1],
        "semantic_tags": ["intermedia","dissonance"] if random.random() < 0.5 else ["collage","sampling"],
        "media_density": float(min(1.0, (num_tw_likes + num_tw_rts)/10.0))
    }
    
    event_descriptors.append(descriptor)

ed_path = "margento_#graphpoem_dhsi23_event_descriptors.json"
with open(ed_path, "w") as f:
    json.dump(event_descriptors, f, indent=2)

# Simple descriptor encoder and beta_R approximator (resource bounded deterministic approx)
def simple_descriptor_encoder(descriptor, salt=0):
    s = json.dumps(descriptor, sort_keys=True)
    h = abs(hash(s + str(salt)))
    return h % (10**6)

def halting_bits_resource_bounded_simple(indices, R):
    M_R = max(2, 100 + R*10)
    K_R = max(1, min(M_R-1, int(0.1 * M_R)))
    bits = {}
    for idx in indices:
        bits[idx] = 1 if (idx % M_R) < K_R else 0
    return bits

descriptor_indices = [simple_descriptor_encoder(d, salt=0) for d in event_descriptors]
for di, d in zip(descriptor_indices, event_descriptors):
    d['descriptor_index'] = int(di)


In [55]:
with open(ed_path, "w") as f:
    json.dump(event_descriptors, f, indent=2)

In [10]:

def make_beta_R(time_vec, event_descriptors, R, slot_width=30.0):
    indices = [d['descriptor_index'] for d in event_descriptors]
    bits_map = halting_bits_resource_bounded_simple(indices, R)
    beta = np.zeros_like(time_vec, dtype=float)
    for d in event_descriptors:
        k = d['descriptor_index']
        bit = bits_map.get(k, 0)
        if bit == 1:
            s,e = d['event_window']
            center = (s+e)/2.0
            kernel = np.exp(-0.5 * ((time_vec - center)/slot_width)**2)
            beta += kernel
    if beta.max() > 0:
        beta = beta / beta.max()
    return beta

time_vec = np.array(list(time_points))
beta_1 = make_beta_R(time_vec, event_descriptors, R=1)
beta_10 = make_beta_R(time_vec, event_descriptors, R=10)
np.save("margento_#graphpoem_dhsi23_beta_R_1.npy", beta_1)
np.save("margento_#graphpoem_dhsi23_beta_R_10.npy", beta_10)

# Save files manifest
manifest = {
    "twitter_log": tw_path,
    "jupyter_log": jh_path,
    "facebook_log": fb_path,
    "poem_sequence": poem_seq_path,
    "event_descriptors": ed_path,
    "beta_R1": "margento_#graphpoem_dhsi23_beta_R_1.npy",
    "beta_R10": "margento_#graphpoem_dhsi23_beta_R_10.npy"
}
with open("margento_#graphpoem_dhsi23_synthetic_manifest.json", "w") as f:
    json.dump(manifest, f, indent=2)


In [7]:

# Plot a small timeline
plt.figure(figsize=(10,4))
tweet_times = np.array([r['time'] for r in tweet_records])
plt.plot(tweet_times/60.0, np.ones_like(tweet_times)*0.2, '|', label='bot tweets', color='tab:blue')
tw_like_times = df_tw[df_tw['action']=='like']['time'].values
tw_counts, _ = np.histogram(tw_like_times, bins=time_points)
# plt.plot(time_points[:-1]/60.0, tw_counts/np.max(tw_counts+1)*0.9, label='twitter likes (norm)', color='tab:orange')
plt.plot(time_points[:-1]/60.0, tw_counts/np.max(tw_counts+1)*0.9, label='twitter likes (norm)', color='black')
jh_times = df_jh['time'].values
jh_counts, _ = np.histogram(jh_times, bins=time_points)
plt.plot(time_points[:-1]/60.0, jh_counts/np.max(jh_counts+1)*0.7, label='jupyter saves (norm)', color='tab:green')
if not df_fb.empty:
    fb_times = df_fb['time'].values
    fb_counts, _ = np.histogram(fb_times, bins=time_points)
    plt.plot(time_points[:-1]/60.0, fb_counts/np.max(fb_counts+1)*0.6, label='fb actions (norm)', color='tab:red')
#plt.plot(time_vec/60.0, beta_1*0.95, label='beta_R=1', color='k', linestyle='--')
plt.plot(time_vec/60.0, beta_1*0.95, label='beta_R=1', color='yellow', linestyle='--')
# plt.plot(time_vec/60.0, beta_10*0.6, label='beta_R=10', color='k', linestyle=':')
plt.plot(time_vec/60.0, beta_10*0.6, label='beta_R=10', color='yellow', linestyle=':')
for (s,e) in event_windows:
    plt.axvspan(s/60.0, e/60.0, color='gray', alpha=0.12)
for tr in trail_windows:
    if tr:
        plt.axvspan(tr[0]/60.0, tr[1]/60.0, color='gray', alpha=0.06)
plt.xlabel('time (minutes)'); plt.ylabel('activity / beta'); plt.legend(loc='upper right', bbox_to_anchor=(1.3,1.05))
plt.tight_layout()
plot_path = "margento_#graphpoem_dhsi23_synthetic_timeline_1.png"
plt.savefig(plot_path, dpi=150, bbox_inches='tight')
plt.close()


In [5]:
print("Generated synthetic logs and artifacts. Manifest:", "margento_#graphpoem_dhsi23_synthetic_manifest.json")
print(json.dumps(manifest, indent=2))
manifest

Generated synthetic logs and artifacts. Manifest: margento_#graphpoem_dhsi23_synthetic_manifest.json
{
  "twitter_log": "margento_#graphpoem_dhsi23_synthetic_twitter_log.csv",
  "jupyter_log": "margento_#graphpoem_dhsi23_synthetic_jupyter_log.csv",
  "facebook_log": "margento_#graphpoem_dhsi23_synthetic_facebook_log.csv",
  "poem_sequence": "margento_#graphpoem_dhsi23_poem_sequence.json",
  "event_descriptors": "margento_#graphpoem_dhsi23_event_descriptors.json",
  "beta_R1": "margento_#graphpoem_dhsi23_beta_R_1.npy",
  "beta_R10": "margento_#graphpoem_dhsi23_beta_R_10.npy"
}


{'twitter_log': 'margento_#graphpoem_dhsi23_synthetic_twitter_log.csv',
 'jupyter_log': 'margento_#graphpoem_dhsi23_synthetic_jupyter_log.csv',
 'facebook_log': 'margento_#graphpoem_dhsi23_synthetic_facebook_log.csv',
 'poem_sequence': 'margento_#graphpoem_dhsi23_poem_sequence.json',
 'event_descriptors': 'margento_#graphpoem_dhsi23_event_descriptors.json',
 'beta_R1': 'margento_#graphpoem_dhsi23_beta_R_1.npy',
 'beta_R10': 'margento_#graphpoem_dhsi23_beta_R_10.npy'}

In [None]:

# BUILDING ADJACENCY MATRICES


In [12]:


# import numpy as np
# import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# -----------------------------------------------------------
# 1. BUILD NODE LIST
# -----------------------------------------------------------

node_index = {n: i for i, n in enumerate(nodes)}

all_users = sorted(
    set(df_tw["user"]).union(df_fb["user"]).union(df_jh["user"])
)

all_poems = sorted(set(poems))

nodes = all_users + all_poems

# Add livestream node
if "livestream" not in nodes:
    nodes.append("livestream")
    
node_index = {n: i for i, n in enumerate(nodes)}

# -----------------------------------------------------------
# 2. SOCIAL INTERACTION ADJACENCY (Twitter + Facebook)
# -----------------------------------------------------------

A_social = np.zeros((len(nodes), len(nodes)), dtype=float)

# Twitter likes/retweets: user -> tweet/poem
for _, row in df_tw.iterrows():
    u = row["user"]
    if u == "bot_twitter":
        continue
    f = row["poem"]
    if u in node_index and f in node_index:
        A_social[node_index[u], node_index[f]] += 1

# Facebook likes/shares
for _, row in df_fb.iterrows():
    u = row["user"]
    if row["action"] in {"like_live", "share_live"}:
        if u in node_index:
            A_social[node_index[u], node_index["livestream"]] = 1

# -----------------------------------------------------------
# 3. TEMPORAL COACTIVITY ADJACENCY
# -----------------------------------------------------------

A_temp = np.zeros((len(nodes), len(nodes)), dtype=float)
WINDOW = 30  # seconds

def add_temporal_edges(df):
    rows = df.sort_values("time")
    for _, r in rows.iterrows():
        u = r["user"]
        t = r["time"]
        # find all actions within +/- WINDOW
        mask = (rows["time"] >= t - WINDOW) & (rows["time"] <= t + WINDOW)
        block = rows[mask]
        for _, r2 in block.iterrows():
            u2 = r2["user"]
            if u != u2 and u in node_index and u2 in node_index:
                A_temp[node_index[u], node_index[u2]] += 1

# Add from all platforms
add_temporal_edges(df_tw)
add_temporal_edges(df_fb)
add_temporal_edges(df_jh)

In [13]:

np.save("margento_graphpoem_dhsi23_adjacency_matrix_before_semantics_and_form.npy", A_temp)

In [None]:

# -----------------------------------------------------------
# 4. SEMANTIC and SONIC SIMILARITY ADJACENCY
# -----------------------------------------------------------


In [None]:

import os
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import json


In [15]:


# HELPERS

import re, unicodedata, math
import numpy as np
from collections import Counter


# Download Unicode Scripts.txt
SCRIPTS_URL = "https://www.unicode.org/Public/UCD/latest/ucd/Scripts.txt"


# ALL SCRIPTS 'UNDER THE SUN' [IN UNICODE, THAT IS]

import regex
import urllib.request


def get_all_scripts() -> set[str]:
    """
    Fetch the official Unicode script names from Scripts.txt.
    """
    with urllib.request.urlopen(SCRIPTS_URL) as f:
        lines = f.read().decode("utf-8").splitlines()

    scripts = set()
    for line in lines:
        if line.strip() and not line.startswith("#"):
            # Example line: "0041..005A; Latin # L&  [26] LATIN CAPITAL LETTER A..Z"
            parts = line.split(";")
            if len(parts) >= 2:
                script = parts[1].strip().split()[0]
                scripts.add(script)
    return scripts

UNICODE_SCRIPTS = sorted(get_all_scripts())

def char_script(ch):
    import regex
    
    if not ch or len(ch) != 1:
        return "INVALID"

    for script in UNICODE_SCRIPTS:
        try:
            # Use the script name exactly as Unicode defines it
            if regex.match(rf"\p{{Script={script}}}", ch):
                return script  # return it as-is
        except regex.error:
            continue  # skip invalid/unrecognized scripts

    return "UNKNOWN"


from collections import Counter

def word_script(word: str) -> str:
    """
    Return the dominant script of a word (based on majority of alphabetic chars).
    """
    scripts = Counter(char_script(ch) for ch in word if ch.isalpha())
    return scripts.most_common(1)[0][0] if scripts else "OTHER"


def get_unicode_name(ch):
    try:
        return unicodedata.name(ch)
    except ValueError:
        return None


import unicodedata

# Latin/Cyrillic/Greek/Devanagari vowels (extendable)
_vowel_re_latin = re.compile(r"[aeiouy\u00E0-\u00FF]+", re.IGNORECASE)
_vowel_re_cyrillic = re.compile(r"[аеёиоуыэюя]+", re.IGNORECASE)  # basic Russian vowels
_vowel_re_greek = re.compile(r"[αεηιουωάέήίόύώ]", re.IGNORECASE)   # modern Greek vowels
_vowel_re_devanagari = re.compile(r"[अआइईउऊएऐओऔऋॠॡॢॣ]", re.IGNORECASE)


def approx_syllables_word(word: str) -> int:
    if not word:
        return 0
    w = unicodedata.normalize("NFC", word)
    script = word_script(w)

    if script == "LATIN":
        groups = _vowel_re_latin.findall(w)
        count = len(groups)
        if w.lower().endswith("e") and count > 1:  # silent 'e'
            count -= 1
        return max(1, count)

    if script == "CYRILLIC":
        groups = _vowel_re_cyrillic.findall(w)
        return max(1, len(groups))

    if script == "GREEK":
        groups = _vowel_re_greek.findall(w)
        return max(1, len(groups))

    if script == "DEVANAGARI":
        groups = _vowel_re_devanagari.findall(w)
        return max(1, len(groups))

    if script in ("HIRAGANA", "KATAKANA"):
        kana_chars = [ch for ch in w if '\u3040' <= ch <= '\u30FF']
        return max(1, len(kana_chars))

    if script == "HANGUL":
        return len([ch for ch in w if '\uAC00' <= ch <= '\uD7A3'])

    if script == "CJK":
        chars = [ch for ch in w if '\u4E00' <= ch <= '\u9FFF']
        return max(1, len(chars))

    if script == "THAI":
        return max(1, len([ch for ch in w if ch.strip()]))

    # Fallback
    groups = _vowel_re_latin.findall(w)
    return max(1, len(groups) if groups else len(w))


def extract_phonological_clusters(word: str):
    clusters = set()
    w = unicodedata.normalize("NFC", word.lower())
    script = word_script(w)

    if script in ("LATIN", "GREEK", "CYRILLIC"):
        consonant_matches = re.findall(r'[^aeiouy]+', w)
        for c in consonant_matches:
            for i in range(len(c)):
                for j in range(i+1, len(c)+1):
                    clusters.add(c[i:j])
        vowel_matches = re.findall(r'[aeiouy]+', w)
        for v in vowel_matches:
            for i in range(len(v)):
                for j in range(i+1, len(v)+1):
                    clusters.add(v[i:j])
        for k in range(2, 5):
            if len(w) >= k:
                clusters.add(w[-k:])

    elif script in ("ARABIC", "HEBREW"):
        consonant_runs = re.findall(r'[^aeiou]+', w)
        for c in consonant_runs:
            for i in range(len(c)):
                for j in range(i+1, len(c)+1):
                    clusters.add(c[i:j])
        for k in range(2, 5):
            if len(w) >= k:
                clusters.add(w[-k:])

    elif script == "DEVANAGARI":
        groups = _vowel_re_devanagari.findall(w)
        for g in groups:
            clusters.add(g)
        for k in range(2, 5):
            if len(w) >= k:
                clusters.add(w[-k:])

    elif script in ("HIRAGANA", "KATAKANA", "HANGUL", "CJK"):
        chars = list(w)
        clusters.update(chars)
        for i in range(len(chars)-1):
            clusters.add(chars[i] + chars[i+1])

    else:
        for i in range(len(w)):
            for j in range(i+1, min(i+4, len(w))+1):
                clusters.add(w[i:j])

    return clusters


_word_re = re.compile(r"\w+", re.UNICODE)

def tokenize_text(text):
    tokens = []
    for m in _word_re.finditer(text):
        tok = m.group(0)
        tokens.append(tok)
    return tokens

_fricatives = set(list("fvsz") + ["sh","zh","th"])
_plosives = set(list("pbtdkg"))

def phonetic_density(tokens):
    latin_tokens = [t for t in tokens if char_script(t[0]) == "LATIN"]
    joined = " ".join(latin_tokens).lower()
    letters = re.sub(r'[^a-z]', '', joined)
    if not letters:
        return 0.0, 0.0, 0.0
    fric_count = sum(joined.count(f) for f in ["f","v","s","z","sh","zh","th"])
    plos_count = sum(joined.count(p) for p in ["p","b","t","d","k","g"])
    vowel_count = sum(1 for c in letters if c in "aeiouy")
    total = len(letters)
    return fric_count/total, plos_count/total, vowel_count/(total+1e-9)

import re
import nltk
from nltk.corpus import words, stopwords, wordnet as wn
from nltk.corpus.reader import WordListCorpusReader
import numpy as np

# Download necessary NLTK resources
nltk.download('words')
nltk.download('stopwords')
nltk.download('omw')
nltk.download('omw-1.4')

# Load English words and stopwords
english_words = set(words.words())
stop_words = set(stopwords.words('english'))

# Define common prefixes and suffixes
PL_PREFIXES = {"re", "un", "in", "dis", "pre", "sub"}
PL_SUFFIXES = {"ing", "ed", "er", "ly", "es", "ful"}

def is_plausible_fragment(fragment):
    """Check if fragment is a plausible English word, prefix/suffix, or foreign fragment."""
    fragment = fragment.lower()
    if not fragment:
        return False
    if fragment in english_words:
        return True
    if fragment in PL_PREFIXES or fragment in PL_SUFFIXES:
        return True
    # Check if fragment exists in WordNet for any language
    for lang in wn.langs():
        if wn.synsets(fragment, lang=lang):
            return True
    # Fallback: accept fragments that are at least 2 characters long
    if len(fragment) > 1:
        return True
    return False

def extract_audio_features_from_stanza(stanza, expected_feet_per_line=(5,6), foot_syllables=(2,3)):
    lines = [ln.strip() for ln in stanza.strip().split("\n") if ln.strip()]
    n_lines = max(1, len(lines))
    tokens = tokenize_text(stanza)
    syll_counts_tokens = [approx_syllables_word(t) for t in tokens]
    total_syllables = sum(syll_counts_tokens)
    n_words = len(tokens) if tokens else 1
    syllable_density = total_syllables / n_words if n_words else 0.0
    target_feet = np.mean(expected_feet_per_line)
    avg_syll_per_line = total_syllables / max(1, len(lines))
    avg_foot_syll = np.mean(foot_syllables)
    tempo = avg_syll_per_line / avg_foot_syll
    sylls_per_line = [sum(approx_syllables_word(t) for t in tokenize_text(ln)) for ln in lines]
    pacing_variance = float(np.var(sylls_per_line)) if sylls_per_line else 0.0
    fric_density, plos_density, vowel_ratio = phonetic_density(tokens)
    vocal_smoothness = float(vowel_ratio)

    # --- [word-splitting] enjambment detection (needed in this specific case; if you need to process enjambments in general see https://github.com/Margento/Computationally_Assembled_Belgian_Poetry_Anthology ---
    enjambments = 0
    enjambed_positions = set()

    for ln_idx, ln in enumerate(lines):
        # 1. End-of-line split (including ellipses)
        end_match = re.search(r'(\w+(?:\.\.\.)?)-/?(\w*)$', ln)
        if end_match:
            left, right = end_match.groups()
            if is_plausible_fragment(left) and (not right or is_plausible_fragment(right)):
                enjambments += 1
                enjambed_positions.add(end_match.start())

        # 2. Start-of-line split
        if ln_idx > 0:
            start_match = re.match(r'^(\w*)-/(\w+)', ln)
            if start_match:
                left, right = start_match.groups()
                if (not left or is_plausible_fragment(left)) and is_plausible_fragment(right):
                    enjambments += 1
                    enjambed_positions.add(start_match.start())

        # 3. Multi-word or foreign-word consideration (fallback)
        for match in re.finditer(r'(\S+)/(\S+)', ln):
            left, right = match.groups()
            if is_plausible_fragment(left) and is_plausible_fragment(right):
                enjambments += 1
                enjambed_positions.add(match.start())

    # Count pause marks excluding those part of valid enjambments
    pause_marks = 0
    for m in re.finditer(r'[,;:\-\—\(\)]', stanza):
        if m.start() not in enjambed_positions:
            pause_marks += 1

    silence_ratio = pause_marks / (total_syllables + 1e-9)
    caesura = sum(1 for ln in lines if "," in ln or ";" in ln or "—" in ln)

    enjambments_norm = enjambments / n_lines
    caesura_norm = caesura / n_lines

    audio = {
        "syllable_density": float(syllable_density),
        "tempo": float(tempo),
        "pacing_variance": float(pacing_variance),
        "fricative_density": float(fric_density),
        "plosive_density": float(plos_density),
        "vocal_smoothness": float(vocal_smoothness),
        "silence_ratio": float(silence_ratio),
        "total_syllables": int(total_syllables),
        "sylls_per_line": sylls_per_line,
        "enjambments": float(enjambments_norm),
        "caesura": float(caesura_norm),
        "n_words": n_words
    }
    return audio


[nltk_data] Downloading package words to /Users/Margento/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Margento/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw to /Users/Margento/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/Margento/nltk_data...


In [None]:

# --------------------------------------------------------------
# 0. Load multilingual embedding model (fast + robust)
# --------------------------------------------------------------
# Good models: "sentence-transformers/distiluse-base-multilingual-cased-v2"
# or for higher quality: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v2")


In [None]:

# FIRST AN EXPERIMENT ON EVENT 3 & THE WINDOW PRECEDING IT (BEFORE RUNNING THIS ON THE ENTIRE #DHSI 2023 EVENT)


In [18]:


from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import networkx as nx
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# import matplotlib.pyplot as plt

# ---------- User parameters ----------
# POEM_FOLDER = "...r"   # change
# MANIFEST = "..."
# POEM_SEQ = "..."
# EVENT_DESC = "....json"
# TWITTER_CSV = "... synthetic_twitter_log.csv"
# JUPYTER_CSV = "... synthetic_jupyter_log.csv"
# FACEBOOK_CSV = "... synthetic_facebook_log.csv"

# neighborhood and window params
# k_neigh = 10            # include ±k poems around event poems
pre_multiplier_short = 1.0
pre_multiplier_long = 2.0

# DONE ABOVE
# semantic/phon weights
# alpha = 0.6
# beta  = 0.4

# ---------- load files ----------
# with open(POEM_SEQ, "r") as f:
   # poem_seq_obj = json.load(f)
# poems = poem_seq_obj["poem_sequence"]  # list of filenames

# with open(EVENT_DESC, "r") as f:
    # event_desc = json.load(f)

# NO NEED FOR THIS IF YOU STILL HAVE THEM RUNNING IN-MEMORY
# df_tw = pd.read_csv(TWITTER_CSV)
# df_jh = pd.read_csv(JUPYTER_CSV)
# df_fb = pd.read_csv(FACEBOOK_CSV)

# ---------- select event 3 (3rd literally) ----------
# event indices in event_desc are 1-based when built earlier
event_i = 3   # third singularity
ev = event_descriptors[event_i-1]   # descriptor for event 3
s,e = ev["event_window"]
event_length = e - s


In [None]:

# ev # SANITY CHECK

In [25]:

k_neigh = 1

In [26]:

# poems that were saved in the event window (selected)
poems_event_selected = ev.get("poems_in_window_selected", ev.get("poems_in_window", []))
# print("Event 3 poems (selected):", poems_event_selected)

# ---------- build poem subset: include ±k around indices ----------
# find indices of these poems in poems_all
indices = [poems.index(p) for p in poems_event_selected if p in poems]
# if none found, fall back to the 'poems_in_window' list indices
if not indices:
    indices = []
    for p in ev.get("poems_in_window", []):
        if p in poems:
            indices.append(poems.index(p))
# expand by k_neigh
subset_idx = set()
for idx in indices:
    for j in range(max(0, idx-k_neigh), min(len(poems), idx+k_neigh+1)):
        subset_idx.add(j)
        
subset_idx = sorted(list(subset_idx))
poem_subset = [poems[i] for i in subset_idx]
print(f"Selected {len(poem_subset)} poems for subset experiment")

Selected 45 poems for subset experiment


In [27]:

# ---------- load poem texts for subset ----------
texts = []
valid_poems = []
for p in poem_subset:
    pth = os.path.join(POEM_FOLDER, p)
    try:
        with open(pth, "r", encoding="utf-8") as f:
            texts.append(f.read())
            valid_poems.append(p)
    except Exception as ex:
        print("couldn't read", p, ex)

In [28]:

# ---------- compute multilingual embeddings (SBERT) ----------
emb = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

In [30]:
# ---------- compute phonological features ----------

import numpy as np
from sklearn.preprocessing import StandardScaler

# Scalar keys returned by extract_audio_features_from_stanza
SCALAR_KEYS = [
    "syllable_density",
    "tempo",
    "pacing_variance",
    "fricative_density",
    "plosive_density",
    "vocal_smoothness",
    "silence_ratio",
    "enjambments",
    "caesura",
    "total_syllables",
    "n_words"
]

phon_feats = []

for txt in texts:
    feats = extract_audio_features_from_stanza(txt)

    # Extract scalar features
    vec = [float(feats[k]) for k in SCALAR_KEYS]

    # Handle list-valued feature "sylls_per_line"
    syll_list = feats.get("sylls_per_line", [])
    if isinstance(syll_list, list) and len(syll_list) > 0:
        syll_mean = float(np.mean(syll_list))
        syll_var  = float(np.var(syll_list))
    else:
        syll_mean = 0.0
        syll_var  = 0.0

    # Append to feature vector
    vec.extend([syll_mean, syll_var])

    phon_feats.append(vec)

# Convert to numpy array
phon_feats = np.array(phon_feats, dtype=float)

# Normalize the features
phon_feats_norm = StandardScaler().fit_transform(phon_feats)


In [31]:

np.save("margento_graphpoem_dhsi23_singularity_3_phonological_feats.npy", phon_feats)

In [33]:

alpha = 0.6
beta = 0.4

In [34]:

# ---------- pairwise similarities ----------
S_sem = cosine_similarity(emb)            # shape (M,M)
S_phon = cosine_similarity(phon_feats_norm)
S_comb = alpha * S_sem + beta * S_phon
S_comb = np.clip(S_comb, 0.0, 1.0)

In [35]:

# ---------- build subset poem adjacency (sparse) ----------
M = len(valid_poems)
# optionally threshold or keep top-k per row
top_k = 8
rows=[]; cols=[]; data=[]
for i in range(M):
    neigh = np.argsort(-S_comb[i])[:top_k+1]
    for j in neigh:
        if i==j: continue
        rows.append(i); cols.append(j); data.append(float(S_comb[i,j]))
A_sem_subset = csr_matrix((data,(rows,cols)), shape=(M,M))

In [126]:

np.save("margento_graphpoem_dhsi23_adjacency_matrix_singularity_3_poems.npy", A_sem_subset)

In [36]:

# ---------- Map social/temporal adjacency restricted to relevant users and windows ----------
# define pre-windows
pre_short = (int(s - pre_multiplier_short*event_length), s)
pre_long  = (int(s - pre_multiplier_long*event_length), s)

In [None]:

# ANALYZING THE SEGMENT PREVIOUS TO THE EVENT THE WAY WE ANALYZED EVENTS


In [37]:

# import random

previous_descriptors = []

for k, prev in enumerate([pre_short, pre_long]):
    # Twitter slice
    tw_slice = df_tw[(df_tw["time"] >= prev[0]) & (df_tw["time"] <= s)]
    num_tw_likes = int((tw_slice["action"] == "like").sum())
    num_tw_rts = int((tw_slice["action"] == "retweet").sum())
    
    # Facebook slice
    fb_slice = df_fb[(df_fb["time"] >= prev[0]) & (df_fb["time"] <= s)]
    num_fb_likes = int((fb_slice["action"] == "like_live").sum())
    num_fb_shares = int((fb_slice["action"] == "share_live").sum())
    
    # JupyterHub slice
    jh_slice = df_jh[(df_jh["time"] >= prev[0]) & (df_jh["time"] <= s)]
    num_jh_saves = len(jh_slice)
    contributors = sorted(list(set(jh_slice["user"].tolist())))
    
    # All poems in the window
    poems_in_window = jh_slice["file"].tolist()
    
    # Only the latest file per unique time
    poems_in_window_selected = (
        jh_slice.sort_values("time")  # ensure sorted by time
                .groupby("time", as_index=False)  # group by time
                .last()  # take last file for each time
                ["file"]
                .tolist()
    )
    
    # Build descriptor
    descriptor = {
        "prev_index": k,
        "prev_window": (prev[0], s),
        # "trail_window": tr,
        "num_tw_likes": num_tw_likes,
        "num_tw_rts": num_tw_rts,
        "num_fb_likes": num_fb_likes,
        "num_fb_shares": num_fb_shares,
        "num_jh_saves": num_jh_saves,
        "contributors": contributors,
        "poems_in_window": poems_in_window,
        "poems_in_window_selected": poems_in_window_selected,
        # "highlighted_poems_before_during_after": placeholder_map[ei-1],
        "semantic_tags": ["intermedia","dissonance"] if random.random() < 0.5 else ["collage","sampling"],
        "media_density": float(min(1.0, (num_tw_likes + num_tw_rts)/10.0))
    }
    
    previous_descriptors.append(descriptor)


In [40]:
# previous_descriptors[1] # sanity check

In [54]:

prev_path = "margento_#graphpoem_dhsi23_windows_before_event3_descriptors.json"

with open(prev_path, "w") as f:
    json.dump(previous_descriptors, f, indent=2)

In [42]:

# poems that were saved in the event window (selected)
prev_0_poems_selected = previous_descriptors[0].get("poems_in_window_selected", previous_descriptors[0].get("poems_in_window", []))
# print("Short window before event 3 poems (selected):", prev_0_poems_selected)

# ---------- build poem subset: include ±k around indices ----------
# find indices of these poems in poems_all
indices = [poems.index(p) for p in prev_0_poems_selected if p in poems]
# if none found, fall back to the 'poems_in_window' list indices
if not indices:
    indices = []
    for p in previous_descriptors[0].get("poems_in_window", []):
        if p in poems:
            indices.append(poems.index(p))
# expand by k_neigh
subset_idx = set()
for idx in indices:
    for j in range(max(0, idx-k_neigh), min(len(poems), idx+k_neigh+1)):
        subset_idx.add(j)
        
subset_idx = sorted(list(subset_idx))
poem_subset = [poems[i] for i in subset_idx]
print(f"Selected {len(poem_subset)} poems for subset experiment")

# ---------- load poem texts for subset ----------
texts_prev = []
valid_poems_prev = []

for p in poem_subset:
    pth = os.path.join(POEM_FOLDER, p)
    try:
        with open(pth, "r", encoding="utf-8") as f:
            texts_prev.append(f.read())
            valid_poems_prev.append(p)
    except Exception as ex:
        print("couldn't read", p, ex)

# ---------- embeddings ----------
emb_prev = model.encode(texts_prev, convert_to_numpy=True, normalize_embeddings=True)

# ---------- compute phonological features ----------

SCALAR_KEYS = [
    "syllable_density",
    "tempo",
    "pacing_variance",
    "fricative_density",
    "plosive_density",
    "vocal_smoothness",
    "silence_ratio",
    "enjambments",
    "caesura",
    "total_syllables",
    "n_words"
]

phon_feats_prev = []

for txt in texts_prev:
    feats = extract_audio_features_from_stanza(txt)

    # Extract scalar features
    vec = [float(feats[k]) for k in SCALAR_KEYS]

    # Handle list-valued feature "sylls_per_line"
    syll_list = feats.get("sylls_per_line", [])
    if isinstance(syll_list, list) and len(syll_list) > 0:
        syll_mean = float(np.mean(syll_list))
        syll_var  = float(np.var(syll_list))
    else:
        syll_mean = 0.0
        syll_var  = 0.0

    # Append to feature vector
    vec.extend([syll_mean, syll_var])

    phon_feats_prev.append(vec)

# Convert to numpy array
phon_feats_prev = np.array(phon_feats_prev, dtype=float)

# Normalize the features
phon_feats_norm_prev = StandardScaler().fit_transform(phon_feats_prev)

np.save("margento_graphpoem_dhsi23_wondow_previous_to_singularity_3_phonological_feats.npy", phon_feats_prev)

# ---------- pairwise similarities ----------
S_sem_prev = cosine_similarity(emb_prev)            # shape (M_prev,M_prev)
S_phon_prev = cosine_similarity(phon_feats_norm_prev)
S_comb_prev = alpha * S_sem_prev + beta * S_phon_prev
S_comb_prev = np.clip(S_comb_prev, 0.0, 1.0)

# ---------- build subset poem adjacency (sparse) for window previous to event 3 ----------
M_prev = len(valid_poems_prev)

# optionally threshold or keep top-k per row
top_k = 8
rows=[]; cols=[]; data=[]

for i in range(M_prev):
    neigh = np.argsort(-S_comb_prev[i])[:top_k+1]
    for j in neigh:
        if i==j: continue
        rows.append(i); cols.append(j); data.append(float(S_comb_prev[i,j]))

A_sem_subset_prev = csr_matrix((data,(rows,cols)), shape=(M_prev,M_prev))

Selected 42 poems for subset experiment


In [108]:
np.save("margento_graphpoem_dhsi23_semantic_n_sonic_matrix_window_previous_to_singularity_3.npy", A_sem_subset_prev)

In [43]:

intersection_prev_n_event = set(prev_0_poems_selected).intersection(set(poems_event_selected))

In [44]:
len(intersection_prev_n_event)

1

In [None]:

# ONLY ONE POEM SHARED BY THE TWO CHUNKS (PREVIOUS WINDOW AND EVENT)!!!!!!!!!!
# THERE ARE OTHER POEMS SHARED BY THE TWO CHUNKS IN GENERAL
# BUT THIS IS THE ONLY ONE IN THE INTERSECTION OF THE SELECTED POEMS
# THAT IS, POEMS FOR EACH CHUNK OUTPUTTED BY (THE LIVE INTERACTIVE CODING ON) JUPYTERHUB

In [67]:

len(prev_0_poems_selected)

15

In [68]:
len(poems_event_selected)

15

In [45]:
intersection_prev_n_event

{'place_rachel_blau_duplessis_draft_65_that_japanese_language_tao.txt'}

In [64]:

intersection_prev_n_event_valid = set(valid_poems_prev).intersection(set(valid_poems))

In [65]:
len(intersection_prev_n_event_valid)

9

In [66]:
intersection_prev_n_event_valid

{'place_oliver_goldsmith_france.txt',
 'place_rachel_blau_duplessis_draft_65_that_japanese_language_tao.txt',
 'place_rachel_by_matthew_arnold_paris_provence_french_riviera.txt',
 'qiu_jin_pusaman_a_message_for_a_female_friend_from_chinese_trans_yilin_wang_poem_1.txt',
 'qiu_jin_reflections_from_chinese_trans_yilin_wang_poem_3.txt',
 'sagawa_chika_dark_song_from_japanese_trans_sawako_nakayasu_poem_2.txt',
 'sagawa_chika_ocean_angel_from_japanese_trans_sawako_nakayasu_poem_3.txt',
 'satu-taskinen_bio.txt',
 'satu-taskinen_interview.txt'}

In [None]:
# THE ABOVE ARE THE INTERSECTION BETWEEN THE EXPANDED SETS OF POEMS PER WINDOW 
# (INCL. POEMS WITHIN THE K_NEIGH WINDOW IN THE CORPUS AROUND EACH POEM IN THE WINDOW) 
# WHEREAS THE UNIQUE ONE PREVIOUSLY IDENTIFIED IS THE INTERSECTION OF THOSE OUTPUTTED BY JUPYTERHUB
# THAT IS, THE VALUES OF THE "poems_in_window_selected" KEYS IN BOTH

In [48]:

intersection_prev_n_event_users = set(previous_descriptors[0]["contributors"]).intersection(set(ev["contributors"]))

In [49]:
len(intersection_prev_n_event_users)

46

In [52]:

len(set(previous_descriptors[0]["contributors"]))

60

In [53]:
len(set(ev["contributors"]))

83

In [None]:

# LET'S ANALYZE THAT SINGLE SHARED POEM AND HOW ITS TOPOLOGICAL FEATURES SHIFT AS IT TRANSITIONS FROM PRE-WINDOW TO EVENT PER SE


In [58]:


import scipy.sparse as sp

# ------------------------------------------------------------
# 1. IDENTIFY THE SHARED POEM--WE ALREADY DID THAT, NOW WE ARE GETTING ITS INDICES IN BOTH WORLDS
# ------------------------------------------------------------

# shared_poems = list(set(valid_poems_prev) & set(valid_poems))
# if len(shared_poems) != 1:
    # print("Warning: expected exactly one shared poem, found:", shared_poems)

# p_shared = shared_poems[0]
# print("Shared poem:", p_shared)
p_shared = list(intersection_prev_n_event)[0]

# indices inside each window
i_prev = valid_poems_prev.index(p_shared)
i_evt  = valid_poems.index(p_shared)


In [59]:
print(i_prev, i_evt)

19 13


In [60]:

# SEMANTIC SHIFT
emb_prev_vec = emb_prev[i_prev]
emb_evt_vec  = emb[i_evt]

semantic_shift = float(np.linalg.norm(emb_prev_vec - emb_evt_vec))
print("Semantic shift ‖emb_prev - emb_evt‖:", semantic_shift)

# SONIC SHIFT
phon_prev_vec = phon_feats_prev[i_prev]
phon_evt_vec  = phon_feats[i_evt]

phonological_shift = float(np.linalg.norm(phon_prev_vec - phon_evt_vec))
print("Phonological shift ‖phon_prev - phon_evt‖:", phonological_shift)

# NEIGHBORHOOD SHIFT
def top_neighbors(A, i, k=8):
    row = A[i].toarray().flatten()
    idx = np.argsort(-row)
    idx = [j for j in idx if j != i][:k]
    return idx

N_prev = top_neighbors(A_sem_subset_prev, i_prev)
N_evt  = top_neighbors(A_sem_subset, i_evt)

# convert neighbor indices to poem names
N_prev_names = [valid_poems_prev[j] for j in N_prev]
N_evt_names  = [valid_poems[j] for j in N_evt]

overlap = len(set(N_prev_names) & set(N_evt_names))
print("Neighbor overlap:", overlap)
print("Neighbors PRE:", N_prev_names)
print("Neighbors EVENT:", N_evt_names)


Semantic shift ‖emb_prev - emb_evt‖: 0.0
Phonological shift ‖phon_prev - phon_evt‖: 0.0
Neighbor overlap: 1
Neighbors PRE: ['osip_mandelstam_rome_from_russian_trans_john_high_and_matvei_yankelevich_poem_1.txt', 'place_lupe-gomez_trans-erin-moure_secret-energy_hermida-galicia-&-germany.txt', 'place_rachel_by_matthew_arnold_paris_provence_french_riviera.txt', 'osip_mandelstam_meganom_from_russian_trans_alistair_noon_poem_1.txt', 'paulo_leminski_reportedly_from_portuguese_trans_elisa_wouk_almino_poem_4.txt', 'place_oliver_goldsmith_france.txt', 'place_margento_tower_mask_cannes_france.txt', 'osip_mandelstam_january_9_1937_from_russian_trans_john_high_and_matvei_yankelevich_poem_2.txt']
Neighbors EVENT: ['pierre_peuchmaurd_glimmers_from_french_trans_ec_belli_poem_1.txt', 'saksiri_meesomsueb_dogs_in_the_lead_from_thai_trans_noh_anothai_poem_3.txt', 'qiu_jin_reflections_from_chinese_trans_yilin_wang_poem_3.txt', 'phu_recalling_love_scenes_by_pleasant_river_from_thai_trans_noh_anothai_poem_2.

In [61]:

set(N_prev_names) & set(N_evt_names)

{'place_rachel_by_matthew_arnold_paris_provence_french_riviera.txt'}

In [71]:

# ALL NEIGHBORS
def neighbors(A, i):
    row = A[i].toarray().flatten()
    idx = np.argsort(-row)
    # idx = [j for j in idx if j != i][:k]
    idx = [j for j in idx if j != i]
    return idx

N_prev = neighbors(A_sem_subset_prev, i_prev)
N_evt  = neighbors(A_sem_subset, i_evt)

# convert neighbor indices to poem names
N_prev_names = [valid_poems_prev[j] for j in N_prev]
N_evt_names  = [valid_poems[j] for j in N_evt]

overlap = len(set(N_prev_names) & set(N_evt_names))
print("Neighbor overlap:", overlap)
print("Neighbors PRE:", N_prev_names)
print("Neighbors EVENT:", N_evt_names)

Neighbor overlap: 8
Neighbors PRE: ['osip_mandelstam_rome_from_russian_trans_john_high_and_matvei_yankelevich_poem_1.txt', 'place_lupe-gomez_trans-erin-moure_secret-energy_hermida-galicia-&-germany.txt', 'place_rachel_by_matthew_arnold_paris_provence_french_riviera.txt', 'osip_mandelstam_meganom_from_russian_trans_alistair_noon_poem_1.txt', 'paulo_leminski_reportedly_from_portuguese_trans_elisa_wouk_almino_poem_4.txt', 'place_oliver_goldsmith_france.txt', 'place_margento_tower_mask_cannes_france.txt', 'osip_mandelstam_january_9_1937_from_russian_trans_john_high_and_matvei_yankelevich_poem_2.txt', 'qiu_jin_to_drink_from_chinese_trans_yilin_wang_poem_5.txt', 'rosa_chávez_i_braid_my_hair_so_it_rests_from_spanish_trans_gabriela_ramirezchavez_poem_4.txt', 'rosa_chávez_in_our_palm_lines_our_tenderness_is_written_from_spanish_trans_gabriela_ramirezchavez_poem_6.txt', 'rosa_chávez_may_my_heart_bloom_when_it_stops_pumping_red_ink_may_it_burst_into_small_thorns_from_spanish_trans_gabriela_ramire

In [72]:
overlap = set(N_prev_names) & set(N_evt_names)

In [73]:
overlap

{'place_oliver_goldsmith_france.txt',
 'place_rachel_by_matthew_arnold_paris_provence_french_riviera.txt',
 'qiu_jin_pusaman_a_message_for_a_female_friend_from_chinese_trans_yilin_wang_poem_1.txt',
 'qiu_jin_reflections_from_chinese_trans_yilin_wang_poem_3.txt',
 'sagawa_chika_dark_song_from_japanese_trans_sawako_nakayasu_poem_2.txt',
 'sagawa_chika_ocean_angel_from_japanese_trans_sawako_nakayasu_poem_3.txt',
 'satu-taskinen_bio.txt',
 'satu-taskinen_interview.txt'}

In [114]:

with open('margento_graphpoem_dhsi23_valid_poems_window_prev_to_singualrity_3.pkl', 'wb') as file:
    pickle.dump(valid_poems_prev, file)

with open('margento_graphpoem_dhsi23_valid_poems_singualrity_3.pkl', 'wb') as file0:
    pickle.dump(valid_poems, file0)

In [87]:
import networkx as nx
import numpy as np

# Convert your sparse matrices to networkx graphs
G_prev = nx.from_scipy_sparse_array(A_sem_subset_prev, create_using=nx.Graph)
G_evt  = nx.from_scipy_sparse_array(A_sem_subset, create_using=nx.Graph)

# ALREADY DONE BUT RUN AGAIN IF NEW SESSION
# p = p_shared
# i_prev = valid_poems_prev.index(p)
# i_evt  = valid_poems.index(p)

# Mapping from index to poem name for printing
mapping_prev = {i:valid_poems_prev[i] for i in range(len(valid_poems_prev))}
mapping_evt  = {i:valid_poems[i]  for i in range(len(valid_poems))}

# -------------------------------
# 1. DEGREE CENTRALITY
# -------------------------------
deg_prev = G_prev.degree(i_prev, weight='weight')
deg_evt  = G_evt.degree(i_evt,  weight='weight')

print("Weighted degree PRE:", deg_prev)
print("Weighted degree EVENT:", deg_evt)
print("Δ weighted degree:", deg_evt - deg_prev)

# -------------------------------
# 2. CLOSENESS CENTRALITY
# -------------------------------

# Build a distance-weighted version of the graphs
G_prev_dist = G_prev.copy()
G_evt_dist  = G_evt.copy()

for u, v, d in G_prev_dist.edges(data=True):
    w = d.get('weight', 1e-9)
    d['weight'] = 1.0 / max(w, 1e-9)

for u, v, d in G_evt_dist.edges(data=True):
    w = d.get('weight', 1e-9)
    d['weight'] = 1.0 / max(w, 1e-9)

cl_prev = nx.closeness_centrality(G_prev_dist, i_prev, distance='weight')
cl_evt  = nx.closeness_centrality(G_evt_dist,  i_evt,  distance='weight')

print("Closeness PRE:", cl_prev)
print("Closeness EVENT:", cl_evt)
print("Δ closeness:", cl_evt - cl_prev)

# -------------------------------
# 3. BETWEENNESS CENTRALITY
# -------------------------------
# (run weighted betweenness)
bt_prev = nx.betweenness_centrality(G_prev, weight='weight')[i_prev]
bt_evt  = nx.betweenness_centrality(G_evt,  weight='weight')[i_evt]

print("\nBetweenness PRE:", bt_prev)
print("Betweenness EVENT:", bt_evt)
print("Δ betweenness:", bt_evt - bt_prev)

# -------------------------------
# 4. EIGENVECTOR CENTRALITY
# -------------------------------
eig_prev = nx.eigenvector_centrality_numpy(G_prev, weight='weight')[i_prev]
eig_evt  = nx.eigenvector_centrality_numpy(G_evt,  weight='weight')[i_evt]

print("\nEigenvector centrality PRE:", eig_prev)
print("Eigenvector centrality EVENT:", eig_evt)
print("Δ eigenvector:", eig_evt - eig_prev)

# -------------------------------
# 5. PAGERANK
# -------------------------------
pr_prev_dict = nx.pagerank(G_prev, weight='weight')
pr_evt_dict  = nx.pagerank(G_evt,  weight='weight')

pr_prev = pr_prev_dict.get(i_prev, 0.0)
pr_evt  = pr_evt_dict.get(i_evt, 0.0)

print("\nPageRank PRE:", pr_prev)
print("PageRank EVENT:", pr_evt)
print("Δ PageRank:", pr_evt - pr_prev)

# -------------------------------
# 6. LOCAL CLUSTERING COEFFICIENT
# -------------------------------
cc_prev = nx.clustering(G_prev, i_prev, weight='weight')
cc_evt  = nx.clustering(G_evt,  i_evt,  weight='weight')

print("\nClustering PRE:", cc_prev)
print("Clustering EVENT:", cc_evt)
print("Δ clustering:", cc_evt - cc_prev)

# -------------------------------
# 7. NEIGHBORHOOD GROWTH / SHRINK
# -------------------------------
N_prev = list(G_prev.neighbors(i_prev))
N_evt  = list(G_evt.neighbors(i_evt))

print("\nNum neighbors PRE:", len(N_prev))
print("Num neighbors EVENT:", len(N_evt))
print("Δ neighbors:", len(N_evt) - len(N_prev))

# -------------------------------
# 8. EDGE WEIGHT DISTRIBUTION
# -------------------------------
weights_prev = [G_prev[i_prev][nbr]['weight'] for nbr in N_prev]
weights_evt  = [G_evt[i_evt][nbr]['weight']  for nbr in N_evt]

print("\nMean edge weight PRE:", np.mean(weights_prev))
print("Mean edge weight EVENT:", np.mean(weights_evt))
print("Median PRE:", np.median(weights_prev))
print("Median EVENT:", np.median(weights_evt))
print("Δ mean weight:", np.mean(weights_evt) - np.mean(weights_prev))

# -------------------------------
# 9. COMMUNITY ROLE (Louvain)
# -------------------------------
try:
    import community as community_louvain

    com_prev = community_louvain.best_partition(G_prev, weight='weight')
    com_evt  = community_louvain.best_partition(G_evt,  weight='weight')

    print("\nCommunity PRE:", com_prev[i_prev])
    print("Community EVENT:", com_evt[i_evt])
    print("Community changed:", com_prev[i_prev] != com_evt[i_evt])
except:
    print("\nLouvain library not installed; skip community detection.")


Weighted degree PRE: 3.2140706944756348
Weighted degree EVENT: 4.323058051327843
Δ weighted degree: 1.1089873568522082
Closeness PRE: 0.17569417710631507
Closeness EVENT: 0.18217927159944716
Δ closeness: 0.0064850944931320875

Betweenness PRE: 0.10121951219512196
Betweenness EVENT: 0.007399577167019028
Δ betweenness: -0.09381993502810293

Eigenvector centrality PRE: 0.032616880487014256
Eigenvector centrality EVENT: 0.19246052431567587
Δ eigenvector: 0.15984364382866162

PageRank PRE: 0.019632008573655848
PageRank EVENT: 0.019913861636161517
Δ PageRank: 0.00028185306250566936

Clustering PRE: 0.21065843866034695
Clustering EVENT: 0.353299341125147
Δ clustering: 0.14264090246480005

Num neighbors PRE: 11
Num neighbors EVENT: 10
Δ neighbors: -1

Mean edge weight PRE: 0.2921882449523304
Mean edge weight EVENT: 0.4323058051327844
Median PRE: 0.30983276841097257
Median EVENT: 0.43125422227755356
Δ mean weight: 0.14011756018045396

Louvain library not installed; skip community detection.


In [91]:

from networkx.algorithms.community import louvain_communities


In [92]:


# --- detect communities ---
com_prev = louvain_communities(G_prev, weight="weight")
com_evt  = louvain_communities(G_evt,  weight="weight")

def communities_to_partition(communities):
    part = {}
    for cid, comm in enumerate(communities):
        for node in comm:
            part[node] = cid
    return part

part_prev = communities_to_partition(com_prev)
part_evt  = communities_to_partition(com_evt)

# --- community of the shared poem ---
c_prev = part_prev.get(i_prev, None)
c_evt  = part_evt.get(i_evt, None)

print("Shared poem community PRE:", c_prev)
print("Shared poem community EVENT:", c_evt)
print("Δ community (if changed):", c_evt != c_prev)


Shared poem community PRE: 1
Shared poem community EVENT: 2
Δ community (if changed): True


In [99]:
print(i_prev, i_evt)

19 13


In [96]:
len(com_prev)

3

In [97]:
len(com_evt)

3

In [98]:
part_prev

{0: 0,
 1: 0,
 2: 0,
 39: 0,
 40: 0,
 15: 0,
 17: 0,
 21: 0,
 22: 0,
 23: 0,
 3: 1,
 4: 1,
 5: 1,
 36: 1,
 10: 1,
 16: 1,
 18: 1,
 19: 1,
 20: 1,
 24: 1,
 25: 1,
 26: 1,
 28: 1,
 29: 1,
 6: 2,
 7: 2,
 8: 2,
 9: 2,
 11: 2,
 12: 2,
 13: 2,
 14: 2,
 27: 2,
 30: 2,
 31: 2,
 32: 2,
 33: 2,
 34: 2,
 35: 2,
 37: 2,
 38: 2,
 41: 2}

In [100]:
part_evt

{1: 0,
 35: 0,
 37: 0,
 38: 0,
 39: 0,
 40: 0,
 41: 0,
 15: 0,
 16: 0,
 22: 0,
 0: 1,
 2: 1,
 4: 1,
 5: 1,
 6: 1,
 7: 1,
 8: 1,
 21: 1,
 23: 1,
 24: 1,
 25: 1,
 26: 1,
 27: 1,
 28: 1,
 29: 1,
 30: 1,
 31: 1,
 33: 1,
 42: 1,
 44: 1,
 32: 2,
 34: 2,
 3: 2,
 36: 2,
 9: 2,
 10: 2,
 11: 2,
 43: 2,
 13: 2,
 14: 2,
 12: 2,
 17: 2,
 18: 2,
 19: 2,
 20: 2}

In [101]:

#          com_prev, com_evt are lists of sets of node indices (from louvain_communities)
#          mapping_prev, mapping_evt map node-index -> poem name (we created these earlier)
#          i_prev, i_evt are the index of the shared poem in each graph

def print_communities(communities, mapping, highlight_index=None, name=""):
    print(f"\n--- {name} communities (total {len(communities)}) ---")
    for cid, comm in enumerate(communities):
        # map numeric indices -> poem names (if mapping provided)
        members = [mapping.get(n, str(n)) for n in sorted(comm)]
        marker = ""
        if highlight_index is not None and highlight_index in comm:
            marker = "  <-- SHARED POEM HERE"
        print(f"community {cid:02d} ({len(members)} nodes): {members[:15]}{'' if len(members)<=15 else ' ...'}{marker}")
    print("-------------------------------------------------\n")


mapping_prev = {i: valid_poems_prev[i] for i in range(len(valid_poems_prev))}
mapping_evt  = {i: valid_poems[i] for i in range(len(valid_poems))}
print_communities(com_prev, mapping_prev, highlight_index=i_prev, name="PRE")
print_communities(com_evt,  mapping_evt,  highlight_index=i_evt,  name="EVENT")


--- PRE communities (total 3) ---
community 00 (10 nodes): ['ondrej-buddeus_chatroom-the-history-of-the-coca-cola-trademark-in-central-and-eastern-europe-from-1990-to-2011.txt', 'oriette_dangelo_caracas_in_your_absence_i_know_ill_find_my_name_from_spanish_trans_lupita_eydetucker_poem_1.txt', 'oriette_dangelo_criminal_syndrome_from_spanish_trans_lupita_eydetucker_poem_5.txt', 'place_lupe-gomez_trans-erin-moure_secret-energy_hermida-galicia-&-germany.txt', "place_martin-glaz-serup_roman-nights_rome-venice-colle-di-val-d'elsa-italy-&-nicaragua-&-russia-&-usa-&-amager-denmark-&-sarajevo-bosnia-&-costa-brava-spain-&-gothenburg-sweden-&-berlin-germany.txt", 'publishing_sphere_reader_erin_moure_reading_translation_multilingual_poetry.txt', 'publishing_sphere_reader_lionel_ruffel_contemporary_moment.txt', 'publishing_sphere_reader_nick_thurston_subcontract.txt', 'satu-taskinen_bio.txt', 'satu-taskinen_interview.txt']
community 01 (14 nodes): ['osip_mandelstam_january_9_1937_from_russian_trans

In [102]:


def get_community_of_node(communities, node_index):
    for cid, comm in enumerate(communities):
        if node_index in comm:
            return cid, sorted(list(comm))
    return None, []

cid_prev, members_prev = get_community_of_node(com_prev, i_prev)
cid_evt,  members_evt  = get_community_of_node(com_evt,  i_evt)
print("Shared poem community PRE:", cid_prev, [mapping_prev.get(n) for n in members_prev])
print("Shared poem community EVENT:", cid_evt, [mapping_evt.get(n) for n in members_evt])


Shared poem community PRE: 1 ['osip_mandelstam_january_9_1937_from_russian_trans_john_high_and_matvei_yankelevich_poem_2.txt', 'osip_mandelstam_meganom_from_russian_trans_alistair_noon_poem_1.txt', 'osip_mandelstam_rome_from_russian_trans_john_high_and_matvei_yankelevich_poem_1.txt', 'paulo_leminski_reportedly_from_portuguese_trans_elisa_wouk_almino_poem_4.txt', 'place_margento_tower_mask_cannes_france.txt', 'place_oliver_goldsmith_france.txt', 'place_rachel_blau_duplessis_draft_65_that_japanese_language_tao.txt', 'place_rachel_by_matthew_arnold_paris_provence_french_riviera.txt', 'qiu_jin_pusaman_a_message_for_a_female_friend_from_chinese_trans_yilin_wang_poem_1.txt', 'qiu_jin_reflections_from_chinese_trans_yilin_wang_poem_3.txt', 'qiu_jin_to_drink_from_chinese_trans_yilin_wang_poem_5.txt', 'rosa_chávez_in_our_palm_lines_our_tenderness_is_written_from_spanish_trans_gabriela_ramirezchavez_poem_6.txt', 'rosa_chávez_may_my_heart_bloom_when_it_stops_pumping_red_ink_may_it_burst_into_small

In [104]:

inters_communities_unique_poem = set([mapping_prev.get(n) for n in members_prev]).intersection(set([mapping_evt.get(n) for n in members_evt]))

In [105]:
inters_communities_unique_poem

{'place_oliver_goldsmith_france.txt',
 'place_rachel_blau_duplessis_draft_65_that_japanese_language_tao.txt',
 'place_rachel_by_matthew_arnold_paris_provence_french_riviera.txt',
 'qiu_jin_pusaman_a_message_for_a_female_friend_from_chinese_trans_yilin_wang_poem_1.txt',
 'qiu_jin_reflections_from_chinese_trans_yilin_wang_poem_3.txt'}

In [116]:

# -----------------------------------------------------
# IDENTIFY "IMPORTANT" NEIGHBORS OF THE UNIQUES SHARED POEM (that made it from prev window to the event/singularity one)
# in the EVENT network
# -----------------------------------------------------


# --- 2) Compute eigenvector centrality ---
eig_evt = nx.eigenvector_centrality_numpy(G_evt, weight='weight')
# eig_evt  = nx.eigenvector_centrality_numpy(G_evt,  weight='weight')[i_evt]
eig_evt_vec = np.array([eig_evt[n] for n in range(len(valid_poems))])

# --- 3) Get neighbors of the shared poem ---
neighbors = list(G_evt.neighbors(i_evt))

mapping_evt  = {i: valid_poems[i] for i in range(len(valid_poems))}
print("Eigenvector centrality (EVENT):", eig_evt[i_evt])
print("\n--- Direct neighbors in EVENT network ---")

records = []

for j in neighbors:
    w = G_evt[i_evt][j]['weight']
    c = eig_evt[j]
    influence = w * c
    records.append((j, valid_poems[j], w, c, influence))
    print(f"Neighbor idx {j:3d}: {valid_poems[j]}")
    print(f"   edge weight w_ij = {w:.4f}")
    print(f"   eigenvector c_j  = {c:.4f}")
    print(f"   influence = w_ij * c_j = {influence:.4f}")

# --- 4) Sort by influence (descending) ---
records_sorted = sorted(records, key=lambda x: x[4], reverse=True)

# Optional: return as a DataFrame
# try:
    # import pandas as pd
    # df_inf = pd.DataFrame(records_sorted, columns=["index","poem","weight","eig_centrality","influence"])
    # display(df_inf.head(20))
# except ImportError:
    # pass

Eigenvector centrality (EVENT): 0.19246052431567542

--- Direct neighbors in EVENT network ---
Neighbor idx  10: pierre_peuchmaurd_glimmers_from_french_trans_ec_belli_poem_1.txt
   edge weight w_ij = 0.5495
   eigenvector c_j  = 0.2955
   influence = w_ij * c_j = 0.1623
Neighbor idx   9: phu_recalling_love_scenes_by_pleasant_river_from_thai_trans_noh_anothai_poem_2.txt
   edge weight w_ij = 0.4520
   eigenvector c_j  = 0.3095
   influence = w_ij * c_j = 0.1399
Neighbor idx  11: pierre_peuchmaurd_the_foam_of_lions_from_french_trans_ec_belli_poem_2.txt
   edge weight w_ij = 0.4345
   eigenvector c_j  = 0.2955
   influence = w_ij * c_j = 0.1284
Neighbor idx  14: place_rachel_by_matthew_arnold_paris_provence_french_riviera.txt
   edge weight w_ij = 0.3915
   eigenvector c_j  = 0.2476
   influence = w_ij * c_j = 0.0970
Neighbor idx  20: qiu_jin_reflections_from_chinese_trans_yilin_wang_poem_3.txt
   edge weight w_ij = 0.4629
   eigenvector c_j  = 0.1939
   influence = w_ij * c_j = 0.0897
Ne

In [117]:

records_sorted

[(10,
  'pierre_peuchmaurd_glimmers_from_french_trans_ec_belli_poem_1.txt',
  0.5494834160567819,
  0.29545431439707576,
  0.1623472459636196),
 (9,
  'phu_recalling_love_scenes_by_pleasant_river_from_thai_trans_noh_anothai_poem_2.txt',
  0.45195930441337306,
  0.30952723187264153,
  0.1398937124141559),
 (32,
  'saksiri_meesomsueb_dogs_in_the_lead_from_thai_trans_noh_anothai_poem_3.txt',
  0.4780074949085418,
  0.2697973995102332,
  0.1289651790727256),
 (11,
  'pierre_peuchmaurd_the_foam_of_lions_from_french_trans_ec_belli_poem_2.txt',
  0.4344534651858002,
  0.29546166370931853,
  0.12836434362807503),
 (34,
  'saksiri_meesomsueb_sleight_from_thai_trans_noh_anothai_poem_1.txt',
  0.4280549793693069,
  0.25973901153304946,
  0.11118257722318366),
 (36,
  'sappho_31_trans_julia_dubnoff_trans_chris_childers_trans_anne_carson_walt_whitman_woman_waits_for_me.txt',
  0.42265278784838384,
  0.2597628094732154,
  0.10978947560318306),
 (14,
  'place_rachel_by_matthew_arnold_paris_provence_f

In [121]:

# -----------------------------------------------------
# PRE-EVENT NEIGHBOR ANALYSIS (the ties the poem had before singularity)
# -----------------------------------------------------


# Eigenvector centrality (pre-event)
eig_prev = nx.eigenvector_centrality_numpy(G_prev, weight='weight')
eig_prev_vec = np.array([eig_prev[n] for n in range(len(valid_poems_prev))])

# Neighbors of shared poem in the PRE network
neighbors_prev = list(G_prev.neighbors(i_prev))

print(f"\n=== PRE-EVENT neighbors for shared poem {valid_poems_prev[i_prev]} ===")
print("Eigenvector centrality (PRE):", eig_prev[i_prev])
print("\n--- Direct neighbors in PRE-EVENT network ---")

records_prev = []

for j in neighbors_prev:
    w = G_prev[i_prev][j]['weight']
    c = eig_prev_vec[j]
    influence = w * c
    records_prev.append((j, valid_poems_prev[j], w, c, influence))
    
    print(f"Neighbor idx {j:3d}: {valid_poems_prev[j]}")
    print(f"   edge weight w_ij = {w:.4f}")
    print(f"   eigenvector c_j  = {c:.4f}")
    print(f"   influence = w_ij * c_j = {influence:.4f}")

# Sort by influence
records_prev_sorted = sorted(records_prev, key=lambda x: x[4], reverse=True)

print("\n=== MOST INFLUENTIAL PRE-EVENT NEIGHBORS (ranked) ===")
for (j, name, w, c, infl) in records_prev_sorted:
    print(f"{name:60s} | w={w:.4f} | c={c:.4f} | influence={infl:.4f}")



=== PRE-EVENT neighbors for shared poem place_rachel_blau_duplessis_draft_65_that_japanese_language_tao.txt ===
Eigenvector centrality (PRE): 0.0326168804870135

--- Direct neighbors in PRE-EVENT network ---
Neighbor idx   5: osip_mandelstam_rome_from_russian_trans_john_high_and_matvei_yankelevich_poem_1.txt
   edge weight w_ij = 0.4607
   eigenvector c_j  = 0.0372
   influence = w_ij * c_j = 0.0171
Neighbor idx  15: place_lupe-gomez_trans-erin-moure_secret-energy_hermida-galicia-&-germany.txt
   edge weight w_ij = 0.4034
   eigenvector c_j  = 0.0884
   influence = w_ij * c_j = 0.0356
Neighbor idx  16: place_margento_tower_mask_cannes_france.txt
   edge weight w_ij = 0.2799
   eigenvector c_j  = 0.0316
   influence = w_ij * c_j = 0.0089
Neighbor idx  17: place_martin-glaz-serup_roman-nights_rome-venice-colle-di-val-d'elsa-italy-&-nicaragua-&-russia-&-usa-&-amager-denmark-&-sarajevo-bosnia-&-costa-brava-spain-&-gothenburg-sweden-&-berlin-germany.txt
   edge weight w_ij = 0.2611
   eige

In [94]:

# ---------- K-CORE INDEX ----------
core_prev = nx.core_number(G_prev).get(i_prev, 0)
core_evt  = nx.core_number(G_evt).get(i_evt, 0)

print("\n=== K-CORE INDEX ===")
print("Core PRE:", core_prev)
print("Core EVENT:", core_evt)
print("Δ core (event - pre):", core_evt - core_prev)

import numpy as np

# ---------- LOCAL ASSORTATIVITY ----------
def local_assortativity(G, node):
    """Compute degree assortativity restricted to the 1-hop ego network of `node`."""
    neighbors = list(G.neighbors(node))
    if len(neighbors) < 2:
        return 0.0  # undefined for nodes with <2 neighbors

    # Extract subgraph of node + neighbors
    ego = G.subgraph([node] + neighbors)
    
    # Node degrees in this subgraph
    deg = dict(ego.degree())

    # Collect degree pairs for all edges
    deg_pairs = []
    for u, v in ego.edges():
        deg_pairs.append((deg[u], deg[v]))

    if len(deg_pairs) < 2:
        return 0.0

    xs = np.array([d1 for d1, d2 in deg_pairs], dtype=float)
    ys = np.array([d2 for d1, d2 in deg_pairs], dtype=float)

    if xs.std() == 0 or ys.std() == 0:
        return 0.0

    return np.corrcoef(xs, ys)[0, 1]


loc_assort_prev = local_assortativity(G_prev, i_prev)
loc_assort_evt  = local_assortativity(G_evt,  i_evt)

print("\n=== LOCAL ASSORTATIVITY ===")
print("Local assortativity PRE:", loc_assort_prev)
print("Local assortativity EVENT:", loc_assort_evt)
print("Δ assortativity:", loc_assort_evt - loc_assort_prev)

import numpy as np
import networkx as nx
from scipy.sparse.linalg import eigsh
from scipy.sparse import csgraph

# ---------- SPECTRAL EMBEDDING (3D) ----------

def spectral_embedding_3d(G):
    # Weighted adjacency matrix
    A = nx.to_scipy_sparse_array(G, weight='weight', dtype=float)

    # Normalized Laplacian
    L = csgraph.laplacian(A, normed=True)

    # Eigenvectors
    eigenvals, eigenvecs = np.linalg.eigh(L.toarray())

    # Skip the first eigenvector (zero eigenvalue)
    vecs = eigenvecs[:, 1:4]
    return vecs


# Compute embeddings
spec_prev = spectral_embedding_3d(G_prev)
spec_evt  = spectral_embedding_3d(G_evt)

# Coordinates of the poem
coord_prev = spec_prev[i_prev]
coord_evt  = spec_evt[i_evt]

# Euclidean shift
spec_shift = np.linalg.norm(coord_prev - coord_evt)

print("\n=== SPECTRAL EMBEDDING SHIFT ===")
print("Coordinates PRE:", coord_prev)
print("Coordinates EVENT:", coord_evt)
print("L2 shift:", spec_shift)



=== K-CORE INDEX ===
Core PRE: 8
Core EVENT: 8
Δ core (event - pre): 0

=== LOCAL ASSORTATIVITY ===
Local assortativity PRE: -0.1432894420850021
Local assortativity EVENT: -0.2136156012179899
Δ assortativity: -0.07032615913298779

=== SPECTRAL EMBEDDING SHIFT ===
Coordinates PRE: [-0.13688314  0.0632467   0.02723614]
Coordinates EVENT: [-0.12686304  0.14096015  0.01220931]
L2 shift: 0.07978463356769269


In [79]:

# POEM-SPACE GLOBAL GEOMETRY CHANGES

# average pairwise semantic distance
mean_sem_dist_prev = float(1 - np.mean(cosine_similarity(emb_prev)))
mean_sem_dist_evt  = float(1 - np.mean(cosine_similarity(emb)))

print("Mean semantic cosine distance PRE:", mean_sem_dist_prev)
print("Mean semantic cosine distance EVENT:", mean_sem_dist_evt)

# density
def graph_density(A):
    P = A.shape[0]
    possible = P*(P-1)
    actual = A.count_nonzero()
    return actual / possible

# edge weights
def avg_edge_weight(A):
    if A.count_nonzero() == 0:
        return 0
    return float(A.data.mean())

print("Density PRE:", graph_density(A_sem_subset_prev))
print("Density EVENT:", graph_density(A_sem_subset))
print("Avg edge weight PRE:", avg_edge_weight(A_sem_subset_prev))
print("Avg edge weight EVENT:", avg_edge_weight(A_sem_subset))


Mean semantic cosine distance PRE: 0.7544712871313095
Mean semantic cosine distance EVENT: 0.8123135417699814
Density PRE: 0.1951219512195122
Density EVENT: 0.18181818181818182
Avg edge weight PRE: 0.44997312298129255
Avg edge weight EVENT: 0.4567532805084696


In [109]:

np.save("margento_graphpoem_dhsi23_embeddings_window_prev_to_singularity_3.npy", emb_prev)
np.save("margento_graphpoem_dhsi23_embeddings_singularity_3.npy", emb)

In [74]:

# ANALYZING THE USERS NOW AS THEY TRANSITION FROM PRE-WINDOW TO EVENT

import numpy as np
import scipy.sparse as sp

# ---- Helper: build users↔poems bipartite adjacency for a window ----
def build_user_poem(df_tw, df_jh, df_fb, poems_list, window):
    s, e = window

    # slice logs
    tw_slice = df_tw[(df_tw["time"] >= s) & (df_tw["time"] <= e)]
    jh_slice = df_jh[(df_jh["time"] >= s) & (df_jh["time"] <= e)]
    fb_slice = df_fb[(df_fb["time"] >= s) & (df_fb["time"] <= e)]

    # unify poem column for each df
    # TW + JH have "poem" or "file"
    def get_poem_column(df):
        if "poem" in df.columns:
            return "poem"
        if "file" in df.columns:
            return "file"
        return None

    col_tw = get_poem_column(tw_slice)
    col_jh = get_poem_column(jh_slice)
    col_fb = get_poem_column(fb_slice)

    # For FB: map to poems by nearest Twitter bot tweet (if no poem column)
    if col_fb is None:
        tweet_times = df_tw[["time","poem"]].sort_values("time")
        def map_time_to_poem(t):
            prev = tweet_times[tweet_times["time"] <= t]
            if len(prev) > 0:
                return prev.iloc[-1]["poem"]
            else:
                return None
        fb_slice = fb_slice.copy()
        fb_slice["poem"] = fb_slice["time"].apply(map_time_to_poem)
        col_fb = "poem"

    # Filter only the poems in our subset
    tw_slice = tw_slice[tw_slice[col_tw].isin(poems_list)]
    jh_slice = jh_slice[jh_slice[col_jh].isin(poems_list)]
    fb_slice = fb_slice[fb_slice[col_fb].isin(poems_list)]

    # collect users
    users = sorted(set(tw_slice["user"]) |
                   set(jh_slice["user"]) |
                   set(fb_slice["user"]))

    U = len(users)
    P = len(poems_list)
    uidx = {u:i for i,u in enumerate(users)}
    pidx = {p:i for i,p in enumerate(poems_list)}

    rows=[]; cols=[]; data=[]

    # define platform weights
    w_tw = 1.0
    w_jh = 1.2      # coding platform often has stronger weight
    w_fb = 0.7      # livestream tends to be supplementary

    # TW
    for _,r in tw_slice.iterrows():
        rows.append(uidx[r["user"]])
        cols.append(pidx[r[col_tw]])
        data.append(w_tw)

    # JH
    for _,r in jh_slice.iterrows():
        rows.append(uidx[r["user"]])
        cols.append(pidx[r[col_jh]])
        data.append(w_jh)

    # FB
    for _,r in fb_slice.iterrows():
        rows.append(uidx[r["user"]])
        cols.append(pidx[r[col_fb]])
        data.append(w_fb)

    A_up = sp.csr_matrix((data,(rows,cols)), shape=(U,P))

    return A_up, users


In [75]:

# USERS & POEMS IN PREV WINDOW
A_up_prev, users_prev = build_user_poem(df_tw, df_jh, df_fb, poems, pre_short) 

In [76]:

# USERS & POEMS IN EVENT WINDOW
A_up, users = build_user_poem(df_tw, df_jh, df_fb, poems, ev["event_window"])

In [112]:

np.save("margento_graphpoem_dhsi23_adjacency_matrix_window_prev_to_singularity_3.npy", A_up_prev)
np.save("margento_graphpoem_dhsi23_adjacency_matrix_singularity_3.npy", A_up_prev)

In [111]:
users[:2]

['fb_user_8', 'jh_user_101']

In [113]:
import pickle

with open("margento_graphpoem_dhsi23_users_window_prev_to_singularity_3.pkl", "wb") as f:
    pickle.dump(users_prev, f)

with open("margento_graphpoem_dhsi23_users_singularity_3.pkl", "wb") as fo:
    pickle.dump(users, fo)

In [77]:

deg_user_prev = np.array(A_up_prev.sum(axis=1)).flatten()
deg_user_evt  = np.array(A_up.sum(axis=1)).flatten()

print("Mean user activity PRE:", deg_user_prev.mean())
print("Mean user activity EVENT:", deg_user_evt.mean())
print("Median PRE:", np.median(deg_user_prev))
print("Median EVENT:", np.median(deg_user_evt))


Mean user activity PRE: 1.6999999999999997
Mean user activity EVENT: 2.7940476190476184
Median PRE: 1.2
Median EVENT: 2.4


In [78]:

def participation_entropy(deg):
    tot = deg.sum()
    if tot == 0:
        return 0
    p = deg / tot
    p = p[p>0]
    return float(-(p*np.log(p)).sum())

print("Entropy PRE:", participation_entropy(deg_user_prev))
print("Entropy EVENT:", participation_entropy(deg_user_evt))
print("Δ entropy (event - pre):", participation_entropy(deg_user_evt) - participation_entropy(deg_user_prev))


Entropy PRE: 4.004138318292434
Entropy EVENT: 4.245624796802582
Δ entropy (event - pre): 0.24148647851014804


In [80]:

# STRUCTURAL SIGNATURE OF THE WINDOWS

# top-20 strongest edges
def top_edges(A, k=20):
    coo = A.tocoo()
    edges = [(i,j,w) for i,j,w in zip(coo.row, coo.col, coo.data) if i<j]
    edges_sorted = sorted(edges, key=lambda x: -abs(x[2]))[:k]
    return set((i,j) for i,j,_ in edges_sorted)

overlap20 = len(top_edges(A_sem_subset_prev,20) & top_edges(A_sem_subset,20)) / 20

print("Top-20 edge overlap between windows:", overlap20)


Top-20 edge overlap between windows: 0.0


In [83]:


def edges(A, k=20):
    coo = A.tocoo()
    edges = [(i,j,w) for i,j,w in zip(coo.row, coo.col, coo.data) if i<j]
    edges_sorted = sorted(edges, key=lambda x: -abs(x[2]))[:k]
    return set((i,j) for i,j,_ in edges_sorted)

overlap_e = len(edges(A_sem_subset_prev) & edges(A_sem_subset)) / min(len(edges(A_sem_subset_prev)), len(edges(A_sem_subset)))

print("Edge overlap between windows:", overlap_e)

Edge overlap between windows: 0.0
