In [None]:
# Import necessary libraries
import os, re, time, urllib.parse, urllib.request, gzip, json
from collections import defaultdict, Counter

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from networkx.readwrite import json_graph
from networkx.algorithms.community import louvain_communities
from networkx.algorithms.community.quality import modularity
import seaborn as sns
import statistics
import csv

from sklearn.feature_extraction.text import TfidfVectorizer

from adjustText import adjust_text


In [None]:
# ==========  BUILD BIPARTITE GRAPH ==========

# Load data. Only the 'festival' and 'artist' columns are read below.
df = pd.read_csv("festival_data.csv")

G = nx.Graph()

# Every CSV row contributes one festival node, one artist node and the edge
# between them. Iterating the two columns directly avoids building a Series
# per row (as iterrows does) and keeps each column's own dtype.
#
# The node role is written under two keys:
#   - "bipartite": the key this cell has always written
#   - "type":      the key the analysis cells of this notebook filter on
#                  (d['type'] == 'festival_year' / 'artist')
# Without "type", those cells raise KeyError on a graph built here.
for festival_node, artist_node in zip(df["festival"], df["artist"]):
    G.add_node(festival_node, bipartite="festival_year", type="festival_year")
    G.add_node(artist_node, bipartite="artist", type="artist")
    G.add_edge(festival_node, artist_node)

# NOTE(review): later cells also read 'festival' and 'year' attributes from
# festival-year nodes. This loader does not set them because the CSV schema
# beyond 'festival' and 'artist' is not visible here - confirm and extend.

# Convert graph to JSON serializable structure
data = json_graph.node_link_data(G)

with open("festival_network.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("Saved 'festival_network.json'")

In [None]:
#Open Network from GitHub JSON

# TODO: placeholder value - replace with the raw URL of the network JSON.
# As written this is not a URL, so urlopen raises ValueError.
base_url = "missing the json"

# Read JSON directly from GitHub. The context manager closes the HTTP
# response even if reading fails; `response` still holds the raw bytes.
with urllib.request.urlopen(base_url) as http_response:
    response = http_response.read()
data = json.loads(response.decode("utf-8"))

# Convert to NetworkX graph
G = json_graph.node_link_graph(data)

### Basic analysis of the bipartite network

In [None]:
# Basic size statistics for the bipartite graph.

num_nodes, num_edges = G.number_of_nodes(), G.number_of_edges()

# Split the nodes by their 'type' attribute, keeping graph node order.
# A node without a 'type' attribute raises KeyError.
festival_nodes, artist_nodes = [], []
for node, attrs in G.nodes(data=True):
    node_type = attrs['type']
    if node_type == 'festival_year':
        festival_nodes.append(node)
    elif node_type == 'artist':
        artist_nodes.append(node)

# Report the four headline counts, one per line.
for label, value in (
    ("Total nodes:", num_nodes),
    ("Total edges:", num_edges),
    ("Festival-Year nodes:", len(festival_nodes)),
    ("Artist nodes:", len(artist_nodes)),
):
    print(label, value)


In [None]:
# Degree of every node on each side of the bipartite graph.
artist_degrees = list(map(G.degree, artist_nodes))
festival_degrees = list(map(G.degree, festival_nodes))

# Mean degree per side. Each mean is computed and printed before the next one,
# so an empty side raises ZeroDivisionError at the same point as before.
mean_lineup_size = sum(festival_degrees) / len(festival_degrees)
print("Average artists per festival:", mean_lineup_size)

mean_festivals_played = sum(artist_degrees) / len(artist_degrees)
print("Average festivals per artist:", mean_festivals_played)

In [None]:
# Materialise the connected components of G and report how many there are.
components = [component for component in nx.connected_components(G)]
component_count = len(components)
print("Number of connected components:", component_count)

In [None]:
# ========================
# SHORTEST PATH LENGTHS
# ========================

# Connectivity is tested once; it decides both whether the average shortest
# path length is defined and which graph the distribution is computed on.
# (The earlier stand-alone nx.shortest_path_length(G, source=festival_nodes[0])
# call was removed: its result was discarded, and it raised IndexError when
# there were no festival nodes.)
graph_is_connected = nx.is_connected(G)

if graph_is_connected:
    avg_path = nx.average_shortest_path_length(G)
    print("Average shortest path length:", avg_path)
    G_sub = G
else:
    print("Graph is not connected; average shortest path length is undefined.")
    print("Graph is not fully connected. Using largest connected component.")
    largest_component = max(nx.connected_components(G), key=len)
    G_sub = G.subgraph(largest_component)

# Calculate all shortest path lengths: {source: {target: distance}}
path_lengths = dict(nx.all_pairs_shortest_path_length(G_sub))

# Flatten the distances into a single list (excluding 0 self-distances).
# Every unordered pair appears twice, once from each endpoint.
all_lengths = [
    distance
    for source, targets in path_lengths.items()
    for target, distance in targets.items()
    if source != target
]

# Count occurrences of each path length
length_counts = Counter(all_lengths)

# ========================
# PLOT DISTRIBUTION
# ========================

# Plain sorted lists give matplotlib explicit, ordered x/height sequences.
observed_lengths = sorted(length_counts)
frequencies = [length_counts[length] for length in observed_lengths]

plt.figure()
plt.bar(observed_lengths, frequencies)
plt.xlabel("Shortest Path Length")
plt.ylabel("Frequency")
plt.title("Distribution of Shortest Path Lengths in Bipartite Network")
plt.show()

In [None]:
# ========================
# CENTRALITY MEASURES
# ========================

# DEGREE CENTRALITY:
# For artists -> how many festivals they play (ubiquity)
# For festivals -> size of lineup (scale)
# Note: this is the raw degree (an integer count), not a normalised score.
degree = dict(G.degree())

# BETWEENNESS CENTRALITY:
# Measures how often a node lies on shortest paths between others.
# This identifies BRIDGES:
# Artists = connect different festival clusters
# Festivals = connect different artist communities
betweenness = nx.betweenness_centrality(G, normalized=True)

# CLOSENESS CENTRALITY:
# How close a node is to all others.
# High value = structurally central and well-positioned.
closeness = nx.closeness_centrality(G)


# ========================
# SPLIT INTO ARTISTS & FESTIVALS
# ========================

artists = []
festivals = []

# The per-node attribute dict is bound to `attrs`, not `data`, so this loop no
# longer overwrites the node-link payload stored in `data` by the loading cell.
for node, attrs in G.nodes(data=True):
    entry = {
        "node": node,
        "degree": degree[node],
        "betweenness": betweenness[node],
        "closeness": closeness[node]
    }

    if attrs["type"] == "artist":
        artists.append(entry)
    else:
        # Every non-artist node is treated as a festival-year node and must
        # carry 'festival' and 'year' attributes (KeyError otherwise).
        entry["festival"] = attrs["festival"]
        entry["year"] = attrs["year"]
        festivals.append(entry)


# One row per node, one column per metric.
artist_df = pd.DataFrame(artists)
festival_df = pd.DataFrame(festivals)


# ========================
# SORT & DISPLAY TOP NODES
# ========================

def show_top(df, metric, title, n=10):
    """Print the `n` rows of `df` with the highest `metric` under an underlined title.

    Parameters:
        df: DataFrame holding a "node" column and a `metric` column.
        metric: name of the column to rank by, in descending order.
        title: heading printed above the table; the dashed underline
            has the same length as this string.
        n: number of rows to show (default 10).

    Returns:
        None. The heading and the two-column table ("node", `metric`)
        are written to stdout without the index.
    """
    underline = "-" * len(title)
    print("\n" + title)
    print(underline)

    ranked = df.sort_values(by=metric, ascending=False)
    top_rows = ranked.head(n)
    print(top_rows[["node", metric]].to_string(index=False))


# ========================
# ARTIST ANALYSIS
# ========================

print("\n================ ARTIST CENTRALITY ================")

# (metric, heading) pairs, reported in this order:
#   degree      -> most frequently booked artists across festivals
#   betweenness -> artists that bridge different festival ecosystems
#   closeness   -> artists most centrally embedded in the network
artist_rankings = [
    ("degree", "Top Artists by Degree (played most festivals)"),
    ("betweenness", "Top Artists by Betweenness (structural bridges)"),
    ("closeness", "Top Artists by Closeness (most network-central)"),
]
for metric, heading in artist_rankings:
    show_top(artist_df, metric, heading)


# ========================
# SUPERSTAR BRIDGING ARTISTS
# ========================

for banner_line in (
    "\n================ SUPERSTAR BRIDGING ARTISTS ================",
    "Artists who perform at the most festival-years, acting as structural glue between festivals.",
    "High degree here indicates artists most responsible for overlap and homogenisation.\n",
):
    print(banner_line)

# The 15 artists with the highest degree.
superstars = artist_df.sort_values(by="degree", ascending=False).head(15)
superstar_table = superstars[["node", "degree", "betweenness"]]
print(superstar_table.to_string(index=False))


# Optional: the superstar threshold is the mean degree of those top 15;
# artists strictly above it are listed.
threshold = superstars["degree"].mean()

print(f"\nArtists with degree above superstar threshold ({threshold:.2f} festival-years):")

above_threshold = artist_df["degree"] > threshold
bridging_elite = artist_df[above_threshold]
print(bridging_elite[["node", "degree"]].to_string(index=False))


# ========================
# FESTIVAL ANALYSIS
# ========================

print("\n================ FESTIVAL-YEAR CENTRALITY ================")

# (metric, heading) pairs, reported in this order:
#   degree      -> festivals with the largest lineups
#   betweenness -> festivals that connect different artist communities
#   closeness   -> festivals most central to the ecosystem
festival_rankings = [
    ("degree", "Festivals by Degree (largest lineups)"),
    ("betweenness", "Festivals by Betweenness (structural hubs)"),
    ("closeness", "Festivals by Closeness (most embedded)"),
]
for metric, heading in festival_rankings:
    show_top(festival_df, metric, heading)


### Projected Festival-Year network

In [None]:
# ========== PROJECT TO FESTIVAL-YEAR NETWORK ==========
festival_nodes = [n for n, d in G.nodes(data=True) if d['type'] == 'festival_year']
artist_nodes = [n for n, d in G.nodes(data=True) if d['type'] == 'artist']

F = nx.Graph()

# Add festival nodes, copying their attributes from the bipartite graph
for node in festival_nodes:
    F.add_node(node, **G.nodes[node])

# Fixed position of every festival node. Sorting each artist's festivals by
# this rank means an unordered pair {A, B} always produces the same dict key.
# Previously the key followed neighbour order, so (A, B) and (B, A) were
# counted separately and the second add_edge call overwrote the first weight,
# under-counting shared artists.
festival_rank = {node: position for position, node in enumerate(festival_nodes)}

shared_artist_count = defaultdict(int)

# Count shared artists
for artist in artist_nodes:
    # Only festival-year neighbours take part. In a proper bipartite graph that
    # is every neighbour; the filter stops other nodes from entering F.
    played = sorted(
        (nbr for nbr in G.neighbors(artist) if nbr in festival_rank),
        key=festival_rank.__getitem__,
    )
    for i in range(len(played)):
        for j in range(i + 1, len(played)):
            shared_artist_count[(played[i], played[j])] += 1

# Add edges with raw weight (number of artists the two festival-years share)
for (f1, f2), weight in shared_artist_count.items():
    F.add_edge(f1, f2, weight=weight)

# ========== NORMALISATION (JACCARD SIMILARITY) ==========
# Each festival's lineup set is built once, not once per incident edge.
lineups = {node: set(G.neighbors(node)) for node in festival_nodes}

for f1, f2 in F.edges():
    artists_f1 = lineups[f1]
    artists_f2 = lineups[f2]

    intersection = len(artists_f1 & artists_f2)
    union = len(artists_f1 | artists_f2)

    # Jaccard = |A & B| / |A | B|, defined as 0 for two empty lineups.
    jaccard = intersection / union if union != 0 else 0

    F[f1][f2]['jaccard'] = jaccard


# ========== SAVE PROJECTED NETWORK ==========
projection_data = json_graph.node_link_data(F)

with open("festival_similarity_network.json", "w", encoding="utf-8") as f:
    json.dump(projection_data, f, ensure_ascii=False, indent=2)

print("Saved festival_similarity_network.json")


In [None]:
import itertools
import pandas as pd

# ========================
# GROUP FESTIVAL-YEAR NODES BY FESTIVAL
# ========================

# {node: attribute dict} for every festival-year node.
# NOTE: this rebinds `festival_nodes` from a list (earlier cells) to a dict.
festival_nodes = {
    node: data
    for node, data in G.nodes(data=True)
    if data["type"] == "festival_year"
}

# {festival name: {year: node}}; a repeated (festival, year) keeps the last node.
festival_groups = {}

for node, data in festival_nodes.items():
    festival = data["festival"]
    year = data["year"]

    festival_groups.setdefault(festival, {})[year] = node


# ========================
# CALCULATE JACCARD OVER TIME
# ========================

# Column order of the result table. Fixing it up front keeps `overlap_df`
# well-formed when no festival has two or more years: an empty frame built
# without columns made sort_values("jaccard_overlap") raise KeyError.
OVERLAP_COLUMNS = [
    "festival",
    "year_1",
    "year_2",
    "jaccard_overlap",
    "shared_artists",
    "total_unique_artists",
]

overlap_results = []

for festival, years_dict in festival_groups.items():

    # Sort years chronologically
    years = sorted(years_dict.keys())

    # Compare all year-pairs for that festival (y1 < y2 in sorted order)
    for y1, y2 in itertools.combinations(years, 2):

        artists_1 = set(G.neighbors(years_dict[y1]))
        artists_2 = set(G.neighbors(years_dict[y2]))

        intersection = len(artists_1 & artists_2)
        union = len(artists_1 | artists_2)

        # Jaccard = |A & B| / |A | B|, defined as 0 for two empty lineups.
        jaccard = intersection / union if union != 0 else 0

        overlap_results.append({
            "festival": festival,
            "year_1": y1,
            "year_2": y2,
            "jaccard_overlap": jaccard,
            "shared_artists": intersection,
            "total_unique_artists": union
        })


# ========================
# TURN INTO DATAFRAME FOR DISPLAY
# ========================

overlap_df = pd.DataFrame(overlap_results, columns=OVERLAP_COLUMNS)


# ========================
# SHOW MOST & LEAST STABLE FESTIVALS
# ========================
# Each row is one (festival, year_1, year_2) pair, so a festival can appear
# several times in either table.

print("\n===== Most Stylistically Stable Festivals =====")
print(overlap_df.sort_values("jaccard_overlap", ascending=False)
      .head(10)
      .to_string(index=False))


print("\n===== Most Dramatically Changing Festivals =====")
print(overlap_df.sort_values("jaccard_overlap", ascending=True)
      .head(10)
      .to_string(index=False))


Compute per-festival temporal overlap metrics (e.g. Jaccard) across years.

Compute global network metrics on the projection: degree distribution, clustering, connected components, average path length, maybe community detection (clusters of similar festivals).

Optionally compute artist-node degree centrality in bipartite network to identify “superstar bridging artists.”

Document lineup sizes (number of artists per festival-year) — for normalization, comparisons, controlling biases.

In [None]:
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

# ========================
# RUN COMMUNITY DETECTION
# ========================

# Greedy modularity clustering of the festival projection, using the
# 'jaccard' edge attribute as the edge weight.
communities = greedy_modularity_communities(F, weight="jaccard")

print(f"Number of communities detected: {len(communities)}\n")

# ========================
# DISPLAY COMMUNITIES
# ========================

# Communities are numbered from 1; each member is listed by the 'festival'
# and 'year' attributes stored on its node in F.
for i, community in enumerate(communities, start=1):
    print(f"\nCommunity {i} (size {len(community)}):")
    for member in community:
        member_attrs = F.nodes[member]
        festival, year = member_attrs['festival'], member_attrs['year']
        print(f"  - {festival} {year}")


In [None]:
from networkx.algorithms.community.quality import modularity

# Score the detected partition of F with modularity, using the same
# 'jaccard' edge weight as the community detection.
mod = modularity(G=F, communities=communities, weight="jaccard")
print("Modularity score:", mod)
