## Extracting edges from song releases

The code below extracts song releases from Discogs release dump (XML) files.

It strips troublesome control characters, escapes stray ampersands, and wraps the content in a root element so that `iterparse` can parse it.

An edge is only created when a release lists at least two artists.

Drag in the Discogs files and change the file path as needed.

In [1]:
import io
import re
import xml.etree.ElementTree as ET
from itertools import combinations

# Collaboration edges: one (artist_a, artist_b) tuple for every pair of
# artists credited on the same release.
edges = []

# To process a different dump, change the path below
# (e.g. "./data_raw/discogs_20080309_releases.xml").
with open("./data_raw/discogs_20100101_releases.xml", "r", encoding="utf-8") as file:
    raw = file.read()

# Strip ASCII control characters (everything below 0x20 except tab, LF and
# CR); they are not allowed in XML 1.0 documents.
raw = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F]", "", raw)

# Escape stray ampersands: any "&" that does not start one of the five
# predefined entities or a numeric character reference. Hexadecimal
# references ("&#xE9;") are recognised as well as decimal ones ("&#233;");
# escaping them would leave the reference as literal text in the names.
safe = re.sub(
    r"&(?!(?:amp|lt|gt|apos|quot|#\d+|#x[0-9A-Fa-f]+);)", "&amp;", raw
)

# Wrap everything in a synthetic root element because iterparse needs to find
# a root element to parse.
wrapped_source = "<root>\n" + safe + "\n</root>"

for event, elem in ET.iterparse(io.StringIO(wrapped_source), events=("end",)):
    # Only finished <release> elements are of interest.
    if elem.tag != "release":
        continue

    # Identify each credited artist by its id, falling back to its name.
    artists = [
        artist.findtext("id") or artist.findtext("name")
        for artist in elem.findall("./artists/artist")
    ]
    # Drop artists that have neither an id nor a name (data is old so might
    # be bad).
    artists = [a for a in artists if a]

    # One edge per unordered pair, in listing order. combinations() yields
    # nothing for fewer than two artists, so solo releases add no edges.
    # NOTE: an artist listed twice on one release still produces a self-pair.
    edges.extend(combinations(artists, 2))

    # Release the children of the processed element to limit memory use.
    elem.clear()


In [4]:
# Report the total number of edges, then preview the first 14 pairs.
print(len(edges))
for pair in edges[:14]:
    left, right = pair
    print(f"{left} <-> {right}")

233217
DJ Romain <-> Danny Krivit
Robert Rich <-> Lustmord
Kings Of Tomorrow <-> Soul Vision
Josh Wink <-> Lil' Louis
Critical Point <-> Vikter Duplaix
Marshall Jefferson <-> Noosa Heads
Max Reich <-> Johannes Foufas
Pure Science <-> Mashupheadz
Miguel Migs <-> DJ Rasoul
Onionz <-> Joeski
Onionz <-> Master D
Joeski <-> Master D
J.T. Donaldson <-> Lance DeSardi
J.T. Donaldson <-> Chris Nazuka


## Extracting artists and integrating the Spotify Web API (WIP)

In [None]:
import io
import re
import xml.etree.ElementTree as ET
from itertools import combinations

# Collaboration edges: one (artist_a, artist_b) tuple for every pair of
# artists credited on the same release.
edges = []

with open("./data_raw/discogs_20080309_releases.xml", "r", encoding="utf-8") as file:
    raw = file.read()

# Strip ASCII control characters (everything below 0x20 except tab, LF and
# CR); they are not allowed in XML 1.0 documents.
raw = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F]", "", raw)

# Escape stray ampersands: any "&" that does not start one of the five
# predefined entities or a numeric character reference. Hexadecimal
# references ("&#xE9;") are recognised as well as decimal ones ("&#233;");
# escaping them would leave the reference as literal text in the names.
safe = re.sub(
    r"&(?!(?:amp|lt|gt|apos|quot|#\d+|#x[0-9A-Fa-f]+);)", "&amp;", raw
)

# Wrap everything in a synthetic root element because iterparse needs to find
# a root element to parse.
wrapped_source = "<root>\n" + safe + "\n</root>"

for event, elem in ET.iterparse(io.StringIO(wrapped_source), events=("end",)):
    # Only finished <release> elements are of interest.
    if elem.tag != "release":
        continue

    # Identify each credited artist by its id, falling back to its name.
    artists = [
        artist.findtext("id") or artist.findtext("name")
        for artist in elem.findall("./artists/artist")
    ]
    # Drop artists that have neither an id nor a name (data is old so might
    # be bad).
    artists = [a for a in artists if a]

    # One edge per unordered pair, in listing order. combinations() yields
    # nothing for fewer than two artists, so solo releases add no edges.
    # NOTE: an artist listed twice on one release still produces a self-pair.
    edges.extend(combinations(artists, 2))

    # Release the children of the processed element to limit memory use.
    elem.clear()
