In [None]:
import os
import re
import networkx as nx
import math
import matplotlib.pyplot as plt


# Directory scanned for artist pages; every "<name>.txt" file in it becomes a
# graph node called "<name>".
artist_dir = '../../data/artists'
# os.listdir() returns entries in arbitrary order. Sorting makes the node
# insertion order (and anything derived from it, e.g. layouts or iteration
# order in later analysis) reproducible across runs and machines.
artist_files = sorted(f for f in os.listdir(artist_dir) if f.endswith('.txt'))
artist_names = [os.path.splitext(f)[0] for f in artist_files]
# Set copy of the names for O(1) membership tests while resolving links.
artist_set = set(artist_names)

# Directed graph: one node per artist file; edges are added separately.
G = nx.DiGraph()
G.add_nodes_from(artist_names)

# Matches an English-Wikipedia article URL (http or https) and captures the
# page title: everything after "/wiki/" up to the first whitespace, "]" or "#".
# NOTE(review): nothing in the code visible here references this pattern (link
# extraction works on [[...]] wiki links instead) -- confirm whether it is used
# elsewhere before removing it.
wiki_url_pattern = re.compile(r'https?://en\.wikipedia\.org/wiki/([^\s\]#]+)')

# Level-2 section headings that mark the end of an article's main body.
_BODY_END_PATTERN = re.compile(
    r'^==\s*(?:References|External links)\s*==\s*$',
    re.IGNORECASE | re.MULTILINE,
)
# Innermost [[...]] links only: excluding brackets from the match means a link
# nested inside another one (e.g. inside an image caption) is still found
# instead of being swallowed by the outer link.
_WIKI_LINK_PATTERN = re.compile(r'\[\[([^\[\]\n]+)\]\]')

# Add an edge artist -> target for every internal wiki link in the artist's
# page body that points at another known artist.
for filename in artist_files:
    artist = os.path.splitext(filename)[0]
    filepath = os.path.join(artist_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as f:
        wikitext = f.read()
    # Only use the main body (before references/external links): cut the text
    # at the first such heading, or keep all of it when there is none.
    body_end = _BODY_END_PATTERN.search(wikitext)
    main_body = wikitext[:body_end.start()] if body_end else wikitext
    # Find all internal wiki links
    links = _WIKI_LINK_PATTERN.findall(main_body)
    for link in links:
        # "[[Title#Section|label]]" -> "Title": drop the display label and the
        # section anchor so links to a section still resolve to the page.
        title = link.split('|')[0].split('#')[0]
        # Normalise the title the same way the file names are assumed to be
        # built: "/" and spaces become underscores.
        target = title.strip().replace('/', '_').replace(' ', '_')
        # Ignore links to pages that are not in the artist set and self-links.
        if target in artist_set and target != artist:
            G.add_edge(artist, target)

print(f"Directed graph (internal wiki links, main body only): {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

# Remove the non-artist pages AllMusic and rock_music, then remove inactive
# (zero-degree) nodes.
#
# The order matters: the non-artist nodes must go first, because a node whose
# only links were to or from one of them becomes isolated once they are
# removed. Filtering on degree before that removal would leave such nodes in
# the cleaned graph.
G_clean = G.copy()
# remove_nodes_from silently ignores names that are not in the graph.
G_clean.remove_nodes_from(["rock_music", "AllMusic"])

# For a DiGraph, degree() is in-degree + out-degree, so this keeps every node
# that still has at least one incoming or outgoing link.
active_nodes = [node for node in G_clean.nodes() if G_clean.degree(node) > 0]
G_clean = G_clean.subgraph(active_nodes).copy()


# Undirected view of the cleaned graph: a link in either direction becomes a
# single undirected edge.
G_undirected = G_clean.to_undirected()