In [None]:
# Import necessary libraries
import os, re, time, urllib.parse, urllib.request, gzip, json
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import networkx as nx
from networkx.readwrite import json_graph
from networkx.algorithms.community import louvain_communities
from networkx.algorithms.community.quality import modularity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

import community.community_louvain as community_louvain
import zipfile
import io
from itertools import combinations
import requests


## Analysing Communities
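To analyse communities, the bipartite graph is projected onto a graph that contains only the festival-year nodes. Two festival-years are connected when they share at least one neighbour, and the edge is weighted by the Jaccard similarity of their neighbour sets (shared neighbours divided by all neighbours of either node).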

The communities are then detected using the Louvain algorithm.

In [None]:
# Use the raw URL, not the "blob" link
url = "https://raw.githubusercontent.com/MittaHage/danish-music-festival-ecosystem/main/festival_network_attributes.json"

# Download and decode. The context manager closes the HTTP response even when
# reading fails, and the timeout keeps "Run All" from hanging on a stalled connection.
with urllib.request.urlopen(url, timeout=30) as http_response:
    response = http_response.read().decode("utf-8")

# Load JSON
data = json.loads(response)

# Convert to NetworkX graph (the edge list is read from the "links" key)
B = nx.node_link_graph(data, edges="links")

# Only the festival-year side of the bipartite graph is kept in the projection
festival_year_nodes = [n for n, d in B.nodes(data=True) if d.get("bipartite") == "festival_year"]
if not festival_year_nodes:
    raise ValueError("No nodes found with bipartite=='festival_year'")

# Neighbour set of every festival-year node, computed once for the pairwise comparison
nbrs = {u: set(B.neighbors(u)) for u in festival_year_nodes}

# Projected graph: same nodes (with their attributes), Jaccard-weighted edges
G = nx.Graph()
for u in festival_year_nodes:
    G.add_node(u, **B.nodes[u])

for u, v in combinations(festival_year_nodes, 2):
    inter = len(nbrs[u] & nbrs[v])
    if inter == 0:
        # No shared neighbour: no edge (also avoids 0/0 for two isolated nodes)
        continue
    union = len(nbrs[u] | nbrs[v])
    G.add_edge(u, v, weight=inter / union)

# Louvain with a fixed seed so the partition is reproducible between runs
communities = nx.algorithms.community.louvain_communities(G, weight="weight", resolution=1.0, seed=0)
# Largest community first, so community id 0 is the biggest one
communities = sorted(communities, key=len, reverse=True)
node_to_comm = {node: cid for cid, comm in enumerate(communities) for node in comm}

print("Nodes:", G.number_of_nodes(), "Edges:", G.number_of_edges(), "Communities:", len(communities))

### Visualizing the communities
To better understand the communities and where each festival falls within them, the plot below shows, for every festival and year, which community that festival-year was assigned to.

In [None]:
#Creating community plot
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from collections import defaultdict, Counter

# Display names for the plots; keys must match the node attribute d["festival"]
FESTIVAL_LABELS = {
    "roskilde": "Roskilde Festival",
    "smukfest": "Smukfest",
    "copenhell": "Copenhell",
    "copenhagen": "Copenhagen Jazz Festival",
    "groen": "Grøn Koncert",
    "jelling": "Jelling Musikfestival",
    "nibe": "Nibe Festival",
    "rock": "Rock Under Broen",
    "skive": "Skive Festival",
    "vig": "Vig Festival",
    "northside": "NorthSide",
    "tinderbox": "Tinderbox",
}

def plot_festival_timeline_top_communities(B, node_to_comm, top_k=7, festival_labels=None):
    """Scatter-plot festivals (rows) against years (x-axis), coloured by community.

    Only festival-year nodes of ``B`` that have a ``festival`` attribute and an
    integer-convertible ``year`` are considered, and only the ``top_k`` largest
    communities with more than one such node are drawn.

    Parameters
    ----------
    B : graph whose ``B.nodes[node]`` yields the attribute dict of a node.
    node_to_comm : dict mapping node -> community id.
    top_k : maximum number of communities to show.
    festival_labels : optional dict mapping the ``festival`` attribute to a
        display name; unmapped festivals are shown with their raw attribute.

    Returns
    -------
    None. Prints a message and returns early when nothing can be plotted.
    """
    festival_labels = festival_labels or {}

    comm_sizes = Counter()
    valid = []
    for node, cid in node_to_comm.items():
        if node not in B:
            continue
        d = B.nodes[node]
        if d.get("bipartite") != "festival_year":
            continue
        fest = d.get("festival")
        year = d.get("year")
        if fest is None or year is None:
            continue
        try:
            int(year)
        except (TypeError, ValueError, OverflowError):
            # The year cannot be placed on the x-axis, so skip the node. Only
            # conversion errors are caught; a bare except would also swallow
            # KeyboardInterrupt and hide unrelated bugs.
            continue
        valid.append(node)
        comm_sizes[cid] += 1

    # Rank communities by number of plottable nodes, ignoring singletons
    non_singletons = [(cid, sz) for cid, sz in comm_sizes.items() if sz > 1]
    non_singletons.sort(key=lambda x: x[1], reverse=True)
    top_comms = [cid for cid, _ in non_singletons[:top_k]]
    top_set = set(top_comms)

    # festival -> list of (year, community id)
    by_fest = defaultdict(list)
    for node in valid:
        cid = node_to_comm[node]
        if cid not in top_set:
            continue
        d = B.nodes[node]
        by_fest[d["festival"]].append((int(d["year"]), cid))

    if not by_fest:
        print("Nothing to plot: no nodes belong to the selected communities.")
        return

    for fest in by_fest:
        by_fest[fest].sort()

    # Sort by first year, and reverse order
    festivals_raw = sorted(by_fest.keys(), key=lambda f: by_fest[f][0][0])[::-1]
    festivals_display = [festival_labels.get(f, f) for f in festivals_raw]

    # Colour index follows the size ranking, not the raw community id
    cid_to_idx = {cid: i for i, cid in enumerate(top_comms)}
    cmap = plt.get_cmap("tab10" if len(top_comms) <= 10 else "tab20", len(top_comms))

    xs, ys, cols = [], [], []
    for row, fest in enumerate(festivals_raw):
        for year, cid in by_fest[fest]:
            xs.append(year)
            ys.append(row)
            cols.append(cmap(cid_to_idx[cid]))

    # Figure height grows with the number of festival rows
    fig_h = max(5, 0.55 * len(festivals_raw))
    plt.figure(figsize=(14, fig_h))
    plt.scatter(xs, ys, c=cols, s=120, alpha=0.95, edgecolors="black", linewidths=0.7)

    plt.yticks(range(len(festivals_raw)), festivals_display)
    plt.xlabel("Year")
    plt.grid(axis="x", alpha=0.25)
    plt.xlim(min(xs) - 1, max(xs) + 1)

    handles = []
    for cid in top_comms:
        i = cid_to_idx[cid]
        handles.append(
            Line2D([0], [0], marker="o", linestyle="",
                   markersize=9, markerfacecolor=cmap(i),
                   markeredgecolor="black", markeredgewidth=0.8,
                   # Legend shows 1-based community numbers
                   label=f"comm {cid+1} (n={comm_sizes[cid]})")
        )

    plt.legend(handles=handles, title="Top communities", loc="lower left", frameon=True)
    plt.tight_layout()
    plt.show()

# Draw the timeline for (at most) the seven largest non-singleton communities,
# using the display names from FESTIVAL_LABELS for the y-axis
plot_festival_timeline_top_communities(B, node_to_comm, top_k=7, festival_labels=FESTIVAL_LABELS)


In [None]:
# Invert node -> community into community -> set of member nodes
comm_to_nodes = defaultdict(set)
for node in node_to_comm:
    cid = node_to_comm[node]
    comm_to_nodes[cid].add(node)
communities_list = [members for members in comm_to_nodes.values()]

# Weighted modularity of that partition on the projected graph
mod = modularity(G, communities_list, weight="weight")
print("Overall modularity (partition on projected graph):", round(mod, 4))

def comm_internal_weight(G, S):
    """Sum the weights of edges whose two endpoints both lie in ``S``.

    Each undirected edge is counted once (only from its smaller endpoint);
    edges without a ``weight`` attribute count as 1.0.
    """
    return sum(
        (
            edge_attrs.get("weight", 1.0)
            for node in S
            for other, edge_attrs in G[node].items()
            if other in S and node < other
        ),
        0.0,
    )

def comm_cut_weight(G, S):
    """Sum the weights of edges that leave ``S`` (one endpoint inside, one outside).

    Edges without a ``weight`` attribute count as 1.0.
    """
    return sum(
        (
            edge_attrs.get("weight", 1.0)
            for node in S
            for other, edge_attrs in G[node].items()
            if other not in S
        ),
        0.0,
    )

def weighted_degree(G, u):
    """Return the sum of the weights of all edges incident to ``u`` (missing weight = 1.0)."""
    incident = G[u]
    return sum(edge_attrs.get("weight", 1.0) for edge_attrs in incident.values())

def comm_volume(G, S):
    """Return the volume of ``S``: the weighted degrees of its nodes added together."""
    volume = 0
    for member in S:
        volume += weighted_degree(G, member)
    return volume

def to_percent_dict(d):
    """Rescale the values of ``d`` to percentages of their sum.

    Returns an empty dict when the values sum to zero or less.
    """
    total = sum(d.values())
    if not total > 0:
        return {}
    percentages = {}
    for key, value in d.items():
        percentages[key] = 100.0 * value / total
    return percentages

# Per-community accumulators, keyed by community id
comm_genres = defaultdict(lambda: defaultdict(float))  # genre -> fractional artist count
comm_pop = defaultdict(list)                           # popularity of each neighbouring artist
comm_years = defaultdict(list)                         # years of the member festival-years
comm_fest = defaultdict(Counter)                       # festival -> number of member nodes

missing = 0

for fy_node, cid in node_to_comm.items():
    if fy_node not in B:
        missing += 1
        continue

    d = B.nodes[fy_node]
    y = d.get("year")
    if y is not None:
        try:
            comm_years[cid].append(int(y))
        except (TypeError, ValueError):
            # Non-numeric year: leave it out of the year span. Only conversion
            # errors are ignored so that unrelated failures still surface.
            pass

    f = d.get("festival")
    if f is not None:
        comm_fest[cid][f] += 1

    for a in B.neighbors(fy_node):
        if B.nodes[a].get("bipartite") != "artist":
            continue

        genres = B.nodes[a].get("genres") or []
        # Fractional counting: total contribution per artist = 1 split across genres
        if genres:
            w = 1.0 / len(genres)
            for g in genres:
                if g:  # keep original names, just skip empty
                    comm_genres[cid][g] += w

        # An artist appearing at several festival-years of a community is counted once per appearance
        pop = B.nodes[a].get("popularity")
        if isinstance(pop, (int, float)):
            comm_pop[cid].append(pop)

print("Skipped festival-year nodes not in B:", missing)

# Print descriptive blocks, largest community first
all_cids = sorted(comm_to_nodes.keys(), key=lambda c: len(comm_to_nodes[c]), reverse=True)

# Loop-invariant: the full node set, used to form each community's complement
all_nodes = set(G.nodes())

for cid in all_cids:
    S = comm_to_nodes[cid]

    n_nodes = len(S)
    w_int = comm_internal_weight(G, S)
    w_cut = comm_cut_weight(G, S)
    volS = comm_volume(G, S)
    volNotS = comm_volume(G, all_nodes - S)
    # Conductance = cut weight / smaller of the two volumes; undefined when that volume is 0
    smaller_vol = min(volS, volNotS)
    conductance = (w_cut / smaller_vol) if smaller_vol > 0 else None

    years = comm_years.get(cid, [])
    year_span = (min(years), max(years)) if years else None
    top_fests = comm_fest[cid].most_common(5)

    pops = comm_pop.get(cid, [])
    mean_pop = float(np.mean(pops)) if pops else None
    med_pop = float(np.median(pops)) if pops else None

    # ORIGINAL genre names, fractional, normalized to percentages
    raw_pct = to_percent_dict(comm_genres[cid])
    top_raw = sorted(raw_pct.items(), key=lambda kv: kv[1], reverse=True)[:12]

    # 1-based number, matching the "comm N" legend of the timeline plot and the
    # "Community N" headings of the sentiment overview
    print(f"\n=== Community {cid + 1} ===")
    print(f"festival-year nodes: {n_nodes}")
    print(f"year span: {year_span}" if year_span else "year span: n/a")
    print("top festivals:", top_fests)
    print(f"internal edge weight: {w_int:.3f} | cut weight: {w_cut:.3f}")
    print("conductance (lower=more separated):", round(conductance, 4) if conductance is not None else "n/a")
    print("artist popularity mean/median:", (round(mean_pop, 2), round(med_pop, 2)) if pops else "n/a")
    print("original genre % (fractional, top):", [(g, round(p, 2)) for g, p in top_raw])


In [None]:
import json
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations
from collections import defaultdict
from matplotlib.lines import Line2D

# -------------------------
# CONTROL FESTIVAL NAMES HERE (legend labels)
# keys must match node attribute: d["festival"]
# -------------------------
# Display names used in the legend; keys must match the node attribute d["festival"]
FESTIVAL_LABELS = {
    "roskilde": "Roskilde Festival",
    "smukfest": "Smukfest",
    "copenhell": "Copenhell",
    "copenhagen": "Copenhagen Jazz Festival",
    "groen": "Grøn Koncert",
    "jelling": "Jelling Musikfestival",
    "nibe": "Nibe Festival",
    "rock": "Rock Under Broen",
    "skive": "Skive Festival",
    "vig": "Vig Festival",
    "northside": "NorthSide",
    "tinderbox": "Tinderbox",
}

# -------------------------
# 1) Load bipartite graph
# -------------------------
# Prefer a local copy of the graph; on a fresh checkout (no local file) fall back
# to the copy published on GitHub so the cell also runs after "Restart & Run All".
LOCAL_GRAPH_PATH = "festival_network_attributes.json"
REMOTE_GRAPH_URL = "https://raw.githubusercontent.com/MittaHage/danish-music-festival-ecosystem/main/festival_network_attributes.json"

if os.path.exists(LOCAL_GRAPH_PATH):
    with open(LOCAL_GRAPH_PATH, "r", encoding="utf-8") as f:
        graph_data = json.load(f)
else:
    with urllib.request.urlopen(REMOTE_GRAPH_URL, timeout=30) as http_response:
        graph_data = json.loads(http_response.read().decode("utf-8"))

B = nx.node_link_graph(graph_data, edges="links")

festival_nodes = [n for n, d in B.nodes(data=True) if d.get("bipartite") == "festival_year"]
if not festival_nodes:
    # Fail early with a clear message instead of silently producing an empty projection
    raise ValueError("No nodes found with bipartite=='festival_year'")

# -------------------------
# 2) Project with Jaccard weights
# -------------------------
# Neighbour set of every festival-year node, computed once for the pairwise comparison
nbrs = {u: set(B.neighbors(u)) for u in festival_nodes}

G = nx.Graph()
for u in festival_nodes:
    G.add_node(u, **B.nodes[u])

for u, v in combinations(festival_nodes, 2):
    inter = len(nbrs[u] & nbrs[v])
    if inter == 0:
        # No shared neighbour: no edge
        continue
    union = len(nbrs[u] | nbrs[v])
    w = inter / union
    G.add_edge(u, v, weight=w)

print("Projected graph:", G.number_of_nodes(), "nodes,", G.number_of_edges(), "edges")

# -------------------------
# 3) Backbone: kNN edges per node
# -------------------------
def knn_backbone(G, k=6, weight="weight", union=True, min_weight=0.0):
    """Return a sparsified copy of ``G`` that keeps only each node's strongest edges.

    Every node selects its (at most) ``k`` heaviest incident edges whose weight is
    at least ``min_weight``.

    Parameters
    ----------
    G : networkx graph with (optionally) weighted edges.
    k : number of edges each node may select.
    weight : name of the edge attribute holding the weight (missing = 1.0).
    union : if True, keep an edge when *either* endpoint selected it; if False,
        keep it only when *both* endpoints selected it (mutual kNN).
    min_weight : edges lighter than this are never selected.

    Returns
    -------
    networkx.Graph with the kept edges (attributes copied from ``G``) and
    without the nodes that ended up with no edge.
    """
    # picks[u] = neighbours that u itself selected. The selections are kept
    # per node (directed) because the mutual mode has to know who picked whom;
    # collapsing them into undirected pairs first would make every kept edge look mutual.
    picks = defaultdict(set)
    for u in G.nodes():
        edges = []
        for v, d in G[u].items():
            w = d.get(weight, 1.0)
            if w >= min_weight:
                edges.append((w, u, v))
        # Heaviest first; ties are broken by the neighbour id through tuple ordering
        edges.sort(reverse=True)
        for _, _, v in edges[:k]:
            picks[u].add(v)

    H = nx.Graph()
    H.add_nodes_from(G.nodes(data=True))

    for u, selected in picks.items():
        for v in selected:
            # .get avoids inserting new keys into the defaultdict while iterating over it
            if union or u in picks.get(v, ()):
                H.add_edge(u, v, **G[u][v])

    isolates = [n for n in H.nodes() if H.degree(n) == 0]
    H.remove_nodes_from(isolates)
    return H

# Union kNN backbone: every node keeps its 6 strongest edges of weight >= 0.03
H = knn_backbone(G, k=6, min_weight=0.03, union=True)
print("Backbone:", H.number_of_nodes(), "nodes,", H.number_of_edges(), "edges")

# Keep only the largest connected component (optional; gives one clean figure).
# .copy() detaches the result from the subgraph view.
if len(H) > 0:
    cc = max(nx.connected_components(H), key=len)
    H = H.subgraph(cc).copy()

# -------------------------
# 4) Plot with legend (no labels on nodes)
# -------------------------
def plot_projected_backbone_with_legend(H, seed=42):
    """Draw ``H`` with a spring layout, nodes coloured by festival and explained in a legend.

    Node size grows with the node's weighted degree and edge width with the edge
    weight. Nodes carry no text labels; the legend names come from the
    module-level ``FESTIVAL_LABELS`` (falling back to the raw festival key).
    ``seed`` fixes the layout.
    """
    def rescale(value, lo, hi, base, span, flat):
        # Linear map of [lo, hi] onto [base, base + span]; `flat` when all values coincide
        if hi == lo:
            return flat
        return base + span * (value - lo) / (hi - lo)

    layout = nx.spring_layout(H, seed=seed, weight="weight")

    # One colour per festival, assigned in alphabetical order of the festival key
    festival_of = {n: H.nodes[n].get("festival", "unknown") for n in H.nodes()}
    festivals = sorted(set(festival_of.values()))
    palette = plt.get_cmap("tab20", max(1, len(festivals)))
    fest_color = {}
    for idx, fest in enumerate(festivals):
        fest_color[fest] = palette(idx)
    node_colors = [fest_color.get(festival_of[n]) for n in H.nodes()]

    # Node size from weighted degree (strength)
    strength = {}
    for n in H.nodes():
        strength[n] = sum(attrs.get("weight", 1.0) for attrs in H[n].values())
    strength_values = list(strength.values()) or [1.0]
    s_lo, s_hi = min(strength_values), max(strength_values)
    node_sizes = [rescale(strength[n], s_lo, s_hi, 60, 260, 90) for n in H.nodes()]

    # Edge width from edge weight
    edge_weights = [attrs.get("weight", 1.0) for _, _, attrs in H.edges(data=True)]
    weight_values = edge_weights or [1.0]
    w_lo, w_hi = min(weight_values), max(weight_values)
    edge_widths = [rescale(w, w_lo, w_hi, 0.4, 4.0, 1.0) for w in edge_weights]

    plt.figure(figsize=(12, 9))
    nx.draw_networkx_edges(H, layout, alpha=0.18, width=edge_widths)
    nx.draw_networkx_nodes(
        H, layout,
        node_color=node_colors,
        node_size=node_sizes,
        edgecolors="black",
        linewidths=0.3,
        alpha=0.95,
    )

    # One legend entry per festival, in the same order as the colours
    handles = [
        Line2D(
            [0], [0], marker='o', linestyle='',
            markerfacecolor=fest_color[fest], markeredgecolor='black',
            markersize=9, label=FESTIVAL_LABELS.get(fest, fest),
        )
        for fest in festivals
    ]

    # Legend sits outside the axes, to the right of the drawing
    plt.legend(
        handles=handles,
        title="Festival",
        loc="center left",
        bbox_to_anchor=(1.02, 0.5),
        frameon=True,
    )

    plt.axis("off")
    plt.tight_layout()
    plt.show()

# Render the backbone H with a fixed layout seed so the figure is reproducible
plot_projected_backbone_with_legend(H, seed=42)


In [None]:
# Map node -> community id (communities sorted largest first, so id 0 is the biggest)
comms = sorted(communities, key=len, reverse=True)
cid = {n: i for i, C in enumerate(comms) for n in C}
TOP_K = 6  # highlight top-K communities

# Modularity of the Louvain partition on G, shown in the figure title
M_louvain = modularity(G, comms, weight="weight")

# Node sizes based on degree, min-max scaled to [100, 2000]
# (the small epsilon avoids a division by zero when all degrees are equal)
nodes = list(G.nodes())
deg = np.array([G.degree(n) for n in nodes], float)
deg = (deg - deg.min()) / (deg.max() - deg.min() + 1e-9)
sizes = 100 + deg * (2000 - 100)
size_of = dict(zip(nodes, sizes))

# Layout (seeded so the figure is reproducible between runs)
pos = nx.forceatlas2_layout(G, max_iter=500, seed=42)

# Colors by community: top-K communities get a palette colour, the rest light grey
cmap = plt.get_cmap("tab10", TOP_K)
color_of = {n: (cmap(cid[n]) if cid[n] < TOP_K else (0.85, 0.85, 0.85, 0.7)) for n in nodes}

# Separate artists vs festivals
artist_nodes = [n for n in nodes if G.nodes[n].get("bipartite") == "artist"]
festival_nodes = [n for n in nodes if G.nodes[n].get("bipartite") == "festival_year"]

fig, ax = plt.subplots(figsize=(10, 8))

# Draw edges
nx.draw_networkx_edges(G, pos, alpha=0.25, width=0.4, ax=ax)

# Draw artists (circles) -- skipped when G holds no artist nodes
if artist_nodes:
    nx.draw_networkx_nodes(G, pos,
                           nodelist=artist_nodes,
                           node_size=[size_of[n] for n in artist_nodes],
                           node_color=[color_of[n] for n in artist_nodes],
                           node_shape="o", ax=ax)

# Draw festivals (squares)
if festival_nodes:
    nx.draw_networkx_nodes(G, pos,
                           nodelist=festival_nodes,
                           node_size=[size_of[n] for n in festival_nodes],
                           node_color=[color_of[n] for n in festival_nodes],
                           node_shape="s", ax=ax)

ax.set_title(f"ForceAtlas2 • Louvain Communities • M = {M_louvain:.3f}")
ax.axis("off")
plt.tight_layout()
plt.show()


## NLP - natural language processing
For the NLP analysis, the artists' Wikipedia pages are retrieved to obtain text material, which is then analysed with the tools taught throughout the course in order to further examine our research question.

The text analysis is performed on the available Wikipedia pages of the artists who have performed at any of the festivals examined in our research. Even though not every artist has a page, we were not able to find a better-suited website, and Wikipedia was the source where most artists were represented.

#### Retrieving the graphs
For the natural language processing, we retrieve the version of the graph in which the wikitext and the sentiment value have been added as node attributes.

In [None]:
# Raw URL to the zip file
zip_url = "https://raw.githubusercontent.com/MittaHage/danish-music-festival-ecosystem/main/festival_graph_newWiki.zip"

# Download zip into memory. The context manager closes the HTTP response and the
# timeout keeps "Run All" from hanging on a stalled connection.
with urllib.request.urlopen(zip_url, timeout=60) as http_response:
    response = http_response.read()

# Open zip from memory
with zipfile.ZipFile(io.BytesIO(response)) as z:
    print("Files inside zip:", z.namelist())  # check contents

    # Assuming the JSON file is inside the zip
    with z.open("festival_graph_newWiki.json") as f:
        data = json.load(f)

# Convert to NetworkX graph. The key holding the edge list is passed explicitly
# (detected from the file) because the default of node_link_graph differs
# between NetworkX versions.
edge_key = "links" if "links" in data else "edges"
GWiki = json_graph.node_link_graph(data, edges=edge_key)
print("Graph has", GWiki.number_of_nodes(), "nodes and", GWiki.number_of_edges(), "edges")


#### Cleaning the data
To conduct meaningful text analysis, it is necessary to preprocess and clean the textual data. When applying TF–IDF, the presence of multiple languages would distort the results, as terms cannot be consistently compared across linguistic boundaries. Therefore, Danish wikitexts are removed at this stage. Since sentiment values have already been calculated for the corresponding nodes, their wikitext entries are simply set to None, ensuring that the sentiment information is preserved while avoiding interference in the TF–IDF analysis.

In [None]:
def clean(txt):
    """Strip wiki markup from ``txt`` and return lower-case text of ASCII letters, digits and single spaces.

    Removes ``{{...}}`` templates, ``<ref>`` footnotes, ``== section ==`` headers,
    link/emphasis markup and URLs. Non-string input yields an empty string.
    """
    if not isinstance(txt, str):
        return ""
    txt = re.sub(r"\{\{.*?\}\}", " ", txt, flags=re.S)  # remove templates
    # Self-closing refs (<ref name="x" />) have no closing tag. They are removed
    # first; otherwise the paired pattern below would start at such a tag and
    # swallow all prose up to the next </ref>.
    txt = re.sub(r"<ref\b[^>]*/>", " ", txt)
    txt = re.sub(r"<ref\b[^>]*>.*?</ref>", " ", txt, flags=re.S)  # remove refs
    txt = re.sub(r"==.*?==", " ", txt)  # remove section headers
    txt = re.sub(r"\[\[|\]\]|\{|}|==|''+", " ", txt)  # link brackets, braces, bold/italic quotes
    txt = re.sub(r"http\S+", " ", txt)  # bare URLs
    txt = re.sub(r"[^a-zA-Z0-9\s]", " ", txt)  # anything that is not an ASCII letter, digit or whitespace
    return re.sub(r"\s+", " ", txt).strip().lower()

# ---------- pick top-K  communities ----------
# communities already computed: `communities`
comms_sorted = sorted(communities, key=len, reverse=True)
TOPK_COMMS = 6
top_comm_ids = list(range(min(TOPK_COMMS, len(comms_sorted))))
node2comm = {n: i for i, C in enumerate(comms_sorted) for n in C}

# -------------- Drop the Danish wikitexts --------------
# The TF-IDF documents are built from the wikitext stored in GWiki, so the
# Danish texts have to be blanked there. B is processed as well so that any
# wikitext it might carry is treated the same way.
# Only the text is set to None; the node and its other attributes are kept.
for graph in (GWiki, B):
    for node, attrs in graph.nodes(data=True):
        # Check if node has wikitext and is in Danish
        if attrs.get("wikitext") and attrs.get("wiki_language") == "da":
            attrs["wikitext"] = None
            print(f"Removed Danish wikitext for {node}")  # preview



### Calculating sentiment for communities
To calculate the sentiment of the identified communities, the sentiment values stored in the graph GWiki are averaged over the artists who performed at the festival-years belonging to each community.

In [None]:
# --------------- Create Sentiment Analysis ---------------
# Calculate the average artist sentiment for the 6 largest communities

# Step 1: Select the 6 largest communities
sorted_communities = sorted(communities, key=len, reverse=True)
top_communities = sorted_communities[:6]

community_info = []

for i, community in enumerate(top_communities):
    # Find festival-nodes in community (from G)
    festivals_in_comm = [n for n in community if G.nodes[n].get("bipartite") == "festival_year"]

    # Find all artists connected to those festivals in B (bipartite graph)
    artist_neighbors = set()
    for f in festivals_in_comm:
        artist_neighbors.update([
            nbr for nbr in B.neighbors(f)
            if B.nodes[nbr].get("bipartite") == "artist"
        ])

    # Collect the sentiment of each artist, preferring GWiki (the graph that
    # carries the sentiment attribute) and falling back to B
    scores = []
    for a in artist_neighbors:
        if a in GWiki and GWiki.nodes[a].get("sentiment") is not None:
            scores.append(GWiki.nodes[a]["sentiment"])
        elif a in B and B.nodes[a].get("sentiment") is not None:
            scores.append(B.nodes[a]["sentiment"])

    # Computed after the loop so that a community without artists gets None
    # instead of an undefined name or the value left over from the previous community
    avg_sentiment = sum(scores) / len(scores) if scores else None

    # Save info
    community_info.append({
        "index": i,
        "festivals": festivals_in_comm,
        "artists": list(artist_neighbors),
        "avg_sentiment": avg_sentiment,
        "size": len(community)
    })

# Print result
print("\n🎼 Community Sentiment Overview (via festivals in GWiki):")
for info in community_info:
    print(f"Community {info['index'] + 1}:")
    # `size` is the number of nodes of the community in the projected graph
    print(f"  Size (festival-year nodes): {info['size']}")
    print(f"  Festivals: {', '.join(map(str, info['festivals'])) if info['festivals'] else 'None'}")
    print(f"  Artists: {len(info['artists'])}")
    if info['avg_sentiment'] is not None:
        print(f"  Average Sentiment: {info['avg_sentiment']:.3f}")
    else:
        print("  No sentiment data")

### Calculate TF-IDF

In [None]:
# Collect, per top-K community, the cleaned wikitexts that will form its TF-IDF document
comm_docs  = defaultdict(list)   # comm_id -> [texts...]

# Pass 1: nodes of GWiki that are themselves members of a community
# (node2comm only contains the nodes the communities were computed on)
for n in GWiki.nodes():
    txt = GWiki.nodes[n].get("wikitext")
    if not txt: 
        continue
    txt = clean(txt)

    # add to community doc (if in top-K)
    cid = node2comm.get(n)
    if cid in top_comm_ids:
        comm_docs[cid].append(txt)

# ---------- collect the artists' texts per community ----------

# Pass 2: for every top-K community, add the wikitext of each artist that is
# linked in B to one of the community's festival-year nodes
for cid, community in enumerate(comms_sorted):
    if cid not in top_comm_ids:
        continue

    # festivals in this community
    festivals_in_comm = [n for n in community if G.nodes[n].get("bipartite") == "festival_year"]

    # artist neighbours from B
    artist_neighbors = set()
    for f in festivals_in_comm:
        if f not in B: 
            continue
        artist_neighbors.update([
            nbr for nbr in B.neighbors(f)
            if B.nodes[nbr].get("bipartite") == "artist"
        ])

    # collect wikitext from GWiki for those artists
    # NOTE(review): texts are taken regardless of wiki_language -- verify that the
    # Danish wikitexts have been blanked in GWiki before this cell runs
    for a in artist_neighbors:
        if a in GWiki:
            txt = GWiki.nodes[a].get("wikitext")
            if txt:
                comm_docs[cid].append(clean(txt))

# concatenate per community: comm_docs becomes a plain dict comm_id -> one long string
# (key order is the order in which communities first received a text)
comm_docs = {cid: " ".join(docs) for cid, docs in comm_docs.items()}

def show_top_tfidf(docs_dict, title, top_n=6):
    """Print the ``top_n`` highest-weighted TF–IDF terms of every document.

    Parameters
    ----------
    docs_dict : dict mapping a label to one text document.
    title : plural name of what the labels denote (e.g. "Communities"); its
        singular form is used in the per-document headings.
    top_n : number of terms to print per document.

    Returns
    -------
    None. Prints a notice and returns early when ``docs_dict`` is empty.
    """
    if not docs_dict:
        # The vectorizer cannot be fitted on zero documents
        print(f"No documents to analyse for {title.lower()}.")
        return

    # Singular heading: "Communities" -> "Community", "Festivals" -> "Festival"
    # (dropping only the last character would print "Communitie")
    if title.endswith("ies"):
        singular = title[:-3] + "y"
    elif title.endswith("s"):
        singular = title[:-1]
    else:
        singular = title

    print(f"\n🔹 Top TF–IDF words per {title.lower()}")
    print("=" * 60)

    # TF–IDF model
    vectorizer = TfidfVectorizer(
        stop_words='english', # removes commonly used words with little meaning, such as "the", "are" and "is"
        lowercase=True,
        max_features=5000,
        max_df=0.80,  # ignore terms occurring in more than 80% of the documents
        token_pattern=r"(?u)\b[a-zA-Z]{2,}\b"  # alphabetic tokens of at least two letters
    )

    labels = list(docs_dict.keys())
    texts = [docs_dict[l] for l in labels]
    X = vectorizer.fit_transform(texts)
    terms = vectorizer.get_feature_names_out()

    for i, label in enumerate(labels):
        row = X[i].toarray().flatten()
        # indices of the top_n largest weights, highest first
        top_idx = row.argsort()[-top_n:][::-1]
        top_terms = [(terms[j], round(row[j], 3)) for j in top_idx]
        print(f"\n{singular} {label}:")
        print("   " + ", ".join([f"{w} ({v})" for w, v in top_terms]))


# --- Run for the (up to) 6 largest communities ---
# Community ids are sorted explicitly (id 0 = largest community) rather than
# relying on dict insertion order, and shown 1-based like the other community
# numbers printed in this notebook.
comm_docs_top10 = {cid + 1: comm_docs[cid] for cid in sorted(comm_docs)[:6]}
show_top_tfidf(comm_docs_top10, "Communities", top_n=10)



### Create Wordclouds
Using the TF-IDF weights, word clouds are created to visualize the most meaningful words in the documents of each of the 6 biggest communities.

In [None]:
def plot_tfidf_wordclouds(docs_dict, title, top_n=100):
    """Draw one TF–IDF word cloud per document in a 2x3 grid (at most six documents).

    Parameters
    ----------
    docs_dict : dict mapping a label to one text document.
    title : plural name of what the labels denote (e.g. "Communities").
    top_n : number of highest-weighted terms fed into each cloud.

    Returns
    -------
    None. Prints a notice and returns early when ``docs_dict`` is empty.
    """
    if not docs_dict:
        # The vectorizer cannot be fitted on zero documents
        print(f"No documents to plot for {title.lower()}.")
        return

    # WordCloud is not imported anywhere else in the visible notebook, so it is
    # imported here to keep the function from failing with a NameError.
    from wordcloud import WordCloud

    # Singular heading: "Communities" -> "Community" (title[:-1] gave "Communitie")
    if title.endswith("ies"):
        singular = title[:-3] + "y"
    elif title.endswith("s"):
        singular = title[:-1]
    else:
        singular = title

    vectorizer = TfidfVectorizer(
        stop_words='english',
        lowercase=True,
        max_features=5000,
        max_df=0.80,  # ignore terms occurring in more than 80% of the documents
        token_pattern=r"(?u)\b[a-zA-Z]{2,}\b"  # alphabetic tokens of at least two letters
    )
    labels = list(docs_dict.keys())
    texts = [docs_dict[l] for l in labels]
    X = vectorizer.fit_transform(texts)
    terms = vectorizer.get_feature_names_out()

    # 2x3 grid
    fig, axes = plt.subplots(2, 3, figsize=(12, 8))
    axes = axes.flatten()  # flatten to 1D list

    for i, label in enumerate(labels[:6]):  # limit to 6 communities
        row = X[i].toarray().flatten()
        top_idx = row.argsort()[-top_n:]
        # Terms with zero weight do not occur in this document; leave them out
        freqs = {terms[j]: row[j] for j in top_idx if row[j] > 0}
        axes[i].set_title(f"{singular} {label}", fontsize=12)
        axes[i].axis("off")
        if not freqs:
            # Nothing to draw for this document; keep the empty titled panel
            continue
        wc = WordCloud(width=600, height=400, background_color="white").generate_from_frequencies(freqs)
        axes[i].imshow(wc, interpolation="bilinear")

    # Hide unused subplots if fewer than 6
    for j in range(len(labels), 6):
        axes[j].axis("off")

    plt.suptitle(f"TF–IDF Word Clouds — {title}", fontsize=16)
    plt.tight_layout()
    plt.show()


# --- Plot for top 6 communities ---
# Community ids are sorted explicitly (id 0 = largest community) rather than
# relying on dict insertion order, and shown 1-based like the other community
# numbers printed in this notebook.
comm_docs_top6 = {cid + 1: comm_docs[cid] for cid in sorted(comm_docs)[:6]}
plot_tfidf_wordclouds(comm_docs_top6, "Communities")

## Adding and finding the available Wikipedia pages
Before finding the Wikipedia pages, the network consisting of the artists and festivals is retrieved from GitHub.

In [None]:
RAW_URL = "https://raw.githubusercontent.com/MittaHage/danish-music-festival-ecosystem/main/festival_network_attributes.json"

# Fail fast on network problems: bounded wait, and an exception on HTTP error codes
response = requests.get(RAW_URL, timeout=30)
response.raise_for_status()
data = response.json()

# Build the graph. The edge key is given explicitly, as in the other cell that
# loads this same file, because the default of node_link_graph differs between
# NetworkX versions.
G = nx.node_link_graph(data, edges="links")   # converts JSON into a NetworkX graph

#### Creating the function for data retrieval
When searching for the Wikipedia pages of the artists, some considerations had to be made. Not all artist names are spelled exactly the same way in our data as on Wikipedia, so the simple wikitext retrieval code had to be adjusted.

An artist such as Gasolin is spelled Gasolin' on Wikipedia, and searching for just "gasolin" returns a redirect page. Therefore, whenever the search hits a redirect, the fetch_page function is run again on the title the redirect points to, which overcomes such small differences between our dataset and the titles on Wikipedia. To avoid ending up in an endless loop, a redirect is only followed once.

Furthermore, some artists have rather general names such as 'Engine' and 'Hair', so the page found may be one that merely points to Wiktionary. Such a page is not the one we need, and in that case the text is not added to the wikitext attribute in the graph.

However, if a Wikipedia page is found where none of the above is the case, the wikitext is returned.

In [None]:
# User-Agent for polite requests (sent as a header by fetch_page)
UA = "Mozilla/5.0 (student project)" 
# Local output directory; created if missing, exist_ok avoids an error on re-run
OUTDIR = "Assignment 2 data" 
os.makedirs(OUTDIR, exist_ok=True)


# Helper function to fetch page from a given language wiki
def fetch_page(title, lang="en", _redirect = False):
    """Fetch the wikitext of the Wikipedia page ``title`` from the ``lang`` edition.

    Parameters
    ----------
    title : page title to look up.
    lang : language code of the Wikipedia edition (e.g. "en", "da").
    _redirect : internal flag, True when this call is already following a
        redirect; at most one redirect is followed.

    Returns
    -------
    The wikitext as a string, or None when the page is missing, has no
    revision content, is an unresolved redirect, starts with a Wiktionary
    template, or the request fails (the error is printed, not raised).
    """
    baseurl = f"https://{lang}.wikipedia.org/w/api.php?"
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvprop": "content",
        "format": "json"
    }
    try:
        req = urllib.request.Request(baseurl + urllib.parse.urlencode(params), headers={"User-Agent": UA})
        # The timeout keeps a stalled connection from blocking the whole crawl
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode("utf-8"))
        page = next(iter(data["query"]["pages"].values()))
        if "missing" in page or "revisions" not in page:
            return None
        rev = page["revisions"][0]
        # The content sits either directly under "*" or under slots/main/"*"
        wikitext = rev.get("*") if "*" in rev else rev["slots"]["main"]["*"]

        # If wikitext starts with REDIRECT
        if wikitext.upper().startswith("#REDIRECT"):
            if _redirect:
                # Already followed one redirect: give up instead of returning
                # the redirect stub as if it were the page content
                return None
            # Find the redirect target inside [[...]]
            match = re.search(r"\[\[(.*?)\]\]", wikitext)
            if not match:
                return None
            # Drop a "|label" or "#section" part; only the page title is wanted
            redirect_target = match.group(1).split("|")[0].split("#")[0].strip()
            if not redirect_target:
                return None
            print(f"➡️ Redirected to: {redirect_target}")
            # Run fetch_page again on the redirect target. Hyphens are kept
            # as they are: "_" stands for a space in a title, so replacing
            # "-" would request a different page.
            return fetch_page(redirect_target.replace(" ", "_"), lang=lang, _redirect = True) # _redirect = True, to ensure a redirect only occurs once

        # If the page starts with a Wiktionary template, skip it
        if wikitext.lower().startswith("{{wiktionary"):
            return None

        return wikitext
    except Exception as e:
        # Best effort: one failing lookup must not abort the loop over all artists
        print(f"⚠️ Error fetching {title} ({lang}): {e}")
        return None

#### Searching for the wikipages
As mentioned above, some artists have rather ordinary names such as 'engine', for which the returned Wikipedia page would be about an actual engine (like the one in a car). To obtain the desired pages, the artist name is therefore first searched for with suffixes such as "(band)" and "(musician)" before the bare artist name is tried.

Furthermore since the festivals are danish and smaller danish artists might appear, if no english wikipedia is found, a search for a danish wikipedia is created.

If a wikipedia page is found, two values are added to their respective attributes in the graph, being the language of the wikipedia page, as well as the wikitext.

In [None]:
results = []

# Title suffixes tried for each Wikipedia edition, in this order; the bare name comes last
en_suffixes = ["_(musician)", "_(band)", "_(singer)", "_(American_band)", ""]
da_suffixes = ["_(musiker)", "_(band)", "_(sanger)", "_(kor)", ""]
suffixes_by_lang = (("en", en_suffixes), ("da", da_suffixes))

for artist_id, node_attrs in G.nodes(data=True):
    if node_attrs.get("bipartite") != "artist":
        continue

    wikitext, lang_used = None, None

    # English Wikipedia first, Danish Wikipedia only if English gave nothing.
    # The search stops at the first lookup that returns anything other than None.
    for lang, suffixes in suffixes_by_lang:
        for suffix in suffixes:
            wikitext = fetch_page(artist_id + suffix, lang=lang)
            if wikitext is not None:
                break
        if wikitext is not None:
            if wikitext:
                lang_used = lang
            break

    # Attach as node attribute (both None when no usable page was found)
    found = bool(wikitext)
    G.nodes[artist_id]["wikitext"] = wikitext if found else None
    G.nodes[artist_id]["wiki_language"] = lang_used if found else None


### Calculating Sentiment and adding as attribute
After adding the text attributes, a sentiment value of the wikitext can be computed and added to the node as well. The sentiment is calculated using the AFINN word list, since it is available for both Danish and English.

When calculating the sentiment, neutral and only weakly charged words (those with an AFINN score between −1 and 1) are excluded. The remaining word scores are shifted from the −5…+5 range to a 0…10 scale, and their average is added as the sentiment attribute of the node, so that it can be easily accessed in further analysis.

In [None]:
from afinn import Afinn
import re

# One AFINN scorer per language a wikitext can be written in (English / Danish)
afinn_en = Afinn(language='en')
afinn_da = Afinn(language='da')

def tokenize(text):
    """Split ``text`` into lower-case word tokens made of a-z and the Danish letters æ, ø, å.

    Returns an empty list for None or an empty string.
    """
    if not text:   # catches None or empty string
        return []
    return re.findall(r'\b[a-zæøå]+\b', text.lower())

for node_id, attrs in G.nodes(data=True):
    if attrs.get("bipartite") != "artist":
        continue

    text = attrs.get("wikitext", "")
    lang = attrs.get("wiki_language", "en")

    tokens = tokenize(text)

    # Pick the scorer that matches the language of the text, so Danish words
    # are both filtered and scored with the Danish word list
    if lang == "en":
        scorer = afinn_en
    elif lang == "da":
        scorer = afinn_da
    else:
        scorer = None

    if scorer is None:
        scores = []
    else:
        # Each word is scored once; words with a score in [-1, 1] are dropped
        # NOTE(review): this also drops words scored exactly -1 or +1, not only
        # neutral (0) ones -- confirm that this threshold is intended
        scores = [s for s in (scorer.score(word) for word in tokens) if s < -1 or s > 1]

    if scores:
        # Normalize from -5…+5 to 0…10
        normalized_scores = [(s + 5) for s in scores]
        sentiment_value = sum(normalized_scores) / len(normalized_scores)
    else:
        sentiment_value = None

    # Attach sentiment as a node attribute
    G.nodes[node_id]["sentiment"] = sentiment_value


import networkx as nx
import json


### Saving the graph
Saving the graph as a json file locally on computer to upload to GitHub such that it can be retrieved and used by any user. 

In [None]:
# Convert graph to node-link dictionary. The edge list is written under the
# "links" key so the file can be read back with node_link_graph(..., edges="links").
graph_dict = nx.node_link_data(G, edges="links")

# Save next to the notebook: a relative path works on any machine, whereas an
# absolute path into one user's home directory does not
save_path = "festival_graph_newWiki.json"

# Save dictionary as JSON file (ensure_ascii=False keeps non-ASCII letters readable)
with open(save_path, "w", encoding="utf-8") as f:
    json.dump(graph_dict, f, ensure_ascii=False, indent=2)

print(f"Graph saved as JSON at {save_path}")