In [None]:
import os
import re
import networkx as nx
import math
import matplotlib.pyplot as plt


# Build a directed graph of artist pages: every ``<name>.txt`` file in
# ``artist_dir`` becomes a node, and an edge A -> B is added when A's text
# contains an internal ``[[...]]`` link whose normalised target equals B.
artist_dir = 'artists'
# os.listdir() returns entries in arbitrary order; sorting makes the node
# insertion order of G deterministic across runs and filesystems.
artist_files = sorted(f for f in os.listdir(artist_dir) if f.endswith('.txt'))
artist_names = [os.path.splitext(f)[0] for f in artist_files]
artist_set = set(artist_names)

G = nx.DiGraph()
G.add_nodes_from(artist_names)

# Pattern for full en.wikipedia.org article URLs. It is not used in this cell.
wiki_url_pattern = re.compile(r'https?://en\.wikipedia\.org/wiki/([^\s\]#]+)')

for filename in artist_files:
    artist = os.path.splitext(filename)[0]
    filepath = os.path.join(artist_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as f:
        wikitext = f.read()
    # NOTE(review): despite the "main body only" wording below, no trimming of
    # reference / external-link sections happens here -- the whole file is
    # scanned. Confirm whether trimming is still wanted.
    main_body = wikitext
    # Find all internal wiki links
    links = re.findall(r'\[\[(.*?)\]\]', main_body)
    for link in links:
        # "[[Target#Section|label]]" -> "Target": drop the display label and
        # the section fragment, otherwise section links never match a node.
        page_title = link.split('|')[0].split('#')[0]
        # Same normalisation as before ('/' and ' ' -> '_'); presumably this
        # mirrors how the file names were derived -- TODO confirm.
        target = page_title.strip().replace('/', '_').replace(' ', '_')
        # Only link to known artists, and never create self-loops.
        if target in artist_set and target != artist:
            G.add_edge(artist, target)

print(f"Directed graph (internal wiki links, main body only): {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

# Remove the two non-artist hub pages (rock_music, AllMusic) and every node
# that has no links at all.
# The hubs are removed FIRST: a node whose only links pointed at one of them
# has degree 0 afterwards, and filtering on degree before the removal would
# leave such nodes in the cleaned graph as isolated points.
G_clean = G.copy()
# remove_nodes_from silently ignores names that are not in the graph.
G_clean.remove_nodes_from(["rock_music", "AllMusic"])

# degree() on a DiGraph counts incoming plus outgoing edges.
active_nodes = [node for node in G_clean.nodes() if G_clean.degree(node) > 0]
G_clean = G_clean.subgraph(active_nodes).copy()


# Undirected view of the cleaned link graph.
G_undirected = G_clean.to_undirected()

# Map each artist (file name without extension) to the number of characters
# in its text file, decoded as UTF-8.
length_of_content = {}
for txt_name in artist_files:
    stem = os.path.splitext(txt_name)[0]
    with open(os.path.join(artist_dir, txt_name), 'r', encoding='utf-8') as handle:
        length_of_content[stem] = len(handle.read())


FileNotFoundError: [Errno 2] No such file or directory: '../../data/artists'

In [None]:

# ForceAtlas2 positions for the undirected link graph. The per-artist
# character counts are handed over as `node_size`, and the seed is fixed.
layout_options = {
    "node_size": length_of_content,
    "seed": 42,
    "max_iter": 1000,
    "gravity": 1.0,
    "jitter_tolerance": 1e-1,
    "scaling_ratio": 2.0,
}
pos = nx.forceatlas2_layout(G_undirected, **layout_options)

# Per-node visual attributes, both in G_undirected.nodes() order:
# colour follows the degree, marker size is the page length divided by 100
# (nodes without a recorded length fall back to 100 characters, i.e. size 1).
node_degrees = [degree for _node, degree in G_undirected.degree()]
node_sizes = [length_of_content.get(name, 100) / 100 for name in G_undirected.nodes()]

draw_style = dict(
    node_color=node_degrees,
    with_labels=True,
    font_size=8,
    node_size=node_sizes,
    alpha=0.5,
    edge_color='gray',
    linewidths=0.1,
)

plt.figure(figsize=(12, 12))
nx.draw(G_undirected, pos, **draw_style)
plt.title("Artist Wiki Link Network", fontsize=16)
plt.axis("off")
plt.show()
