In [None]:
import json
import urllib.parse
import urllib.request
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd
import regex as re
import requests
from tqdm import tqdm

# Build network

In [None]:
# List of mainstream rock performers
# wiki_url = https://en.wikipedia.org/wiki/List_of_mainstream_rock_performers

baseurl = "https://en.wikipedia.org/w/api.php?"
action = "action=query"
title = "titles=List_of_mainstream_rock_performers"
content = "prop=revisions&rvprop=content"
dataformat ="format=json"
rvslots = "rvslots=main"


query = "{}{}&{}&{}&{}&{}".format(baseurl, action, content, title, dataformat, rvslots)
print(query)

headers = {"User-Agent" : "MyWikipediaClient/1.0 (example@example.com)"} # just use this dict as-is.
wikirequest = urllib.request.Request(query,None,headers)    # Needed to pass error 403
wikiresponse = urllib.request.urlopen(wikirequest)
wikidata = wikiresponse.read()
wikitext = wikidata.decode('utf-8')

In [None]:
# Load JSON and extract page content
wiki_json = json.loads(wikitext)
pages = wiki_json['query']['pages']['68324070']['revisions'][0]['slots']['main']['*']
pages

In [None]:
# Example of an url listed -- |url=https://www.allmusic.com/artist/10cc-mn0000502163 
# Example of a name listed -- \n* [[10cc]]
# Build regex to extract all urls


# artist_links = re.findall(r'\|url=(https?://[^\s\|]+)', pages)
# artist_links


artists = re.findall(r'\n\*\s\[\[(.*?)\]\]', pages)
artists_cleaned = [artist.split('|')[0] for artist in artists]
artists_joined = [artist.replace(' ', '_') for artist in artists_cleaned]
artists_joined;

In [None]:
adjacency_matrix = pd.DataFrame(0, index=artists_cleaned, columns=artists_cleaned)
word_count_dict = {}


for name in tqdm(artists_joined):

    # Properly encode the title to handle special characters like Ö, -, etc.
    encoded_title = urllib.parse.quote(name)
    artist_title = f"titles={encoded_title}"

    query = "{}{}&{}&{}&{}&{}".format(baseurl, action, content, artist_title, dataformat, rvslots)

    try:
        artist_wikirequest = urllib.request.Request(query,None,headers)    # Needed to pass error 403
        artist_wikiresponse = urllib.request.urlopen(artist_wikirequest)
        artist_wikidata = artist_wikiresponse.read()
        artist_wikitext = artist_wikidata.decode('utf-8')

        artist_wiki_json = json.loads(artist_wikitext)

        artist_page_id = list(artist_wiki_json['query']['pages'].keys())[0]
        artist_page_content = artist_wiki_json['query']['pages'][artist_page_id]['revisions'][0]['slots']['main']['*']

        artist_page_content_valid = re.split(r'==References==', artist_page_content)[0] # split into sections
        artist_page_words = re.findall(r'\w+', artist_page_content_valid)
        artist_page_references = re.findall(r"<ref[^>]*>(.*?)<\/ref>", artist_page_content_valid)
        artist_page_word_count = len(artist_page_words)-len(artist_page_references) # exclude references from word count
        
        # Store word count
        clean_artist_name = name.replace('_', ' ')
        if clean_artist_name not in word_count_dict:
            word_count_dict[clean_artist_name] = artist_page_word_count
        else:
            print(f"Duplicate entry found for {clean_artist_name}")

        # Check if other artists are mentioned in the output
        for artist in artists_cleaned:
            if artist in str(artist_page_content) and clean_artist_name != artist: # avoid self loops
                adjacency_matrix.loc[clean_artist_name, artist] += 1
                
    except Exception as e:
        print(f"Error processing {name}: {e}")

In [None]:
# Save adjacency matrix and word counts to CSV files
word_count_df = pd.DataFrame(list(word_count_dict.items()), columns=['Artist', 'Word_Count'])
word_count_df.to_csv(r'files/rock_artists_word_counts.csv', index=False)
adjacency_matrix.to_csv(r'files/rock_artists_adjacency_matrix.csv')