In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from collections import Counter

### Formatting DF

In [2]:
data = pd.read_csv("data.csv", sep = ";", index_col = "Unnamed: 0")
data = data[["song", "composer"]]
composer = [c for c in list(data["composer"])]
unique_composers = set([c for c in list(data["composer"]) if "-" not in c])
song = list(data["song"])

In [3]:
landingpage = requests.get("https://www.jazzdisco.org/").text
landingpage = BeautifulSoup(landingpage, "html.parser")

In [4]:
# Getting URLs to composers' catalogs
catalogs = []
composers_in_db = []
url_dict = {}

for c in composer:
    coriginal = c
    c = c.split(" ")
    c.reverse()
    c = " ".join(c)
    if c in landingpage.text:
        c = f"https://www.jazzdisco.org/{c.replace(" ", "-").lower()}/catalog/"
        catalogs.append(c)
        composers_in_db.append(c)
        url_dict[coriginal] = c

# Formatting songs to match jazzdisco format
song_formatted = []
for s in song:
    if s.endswith(", The"):
        s =  "The " + s[:-5]
    song_formatted.append(s)

# Adding the urls to the DF
col3 = []
for c in composer:
    if c in url_dict.keys():
        col3.append(url_dict[c])
    else:
        col3.append(None)

data["song_original"] = data["song"]
data["song"] = song_formatted
data = data.set_index(data["song"])
data["catalogue_url"] = col3

In [5]:
data = data.dropna(axis = 0, how = "any")
data = data.sort_values(by = ["composer"], axis = 0)

### Scraping Data

In [6]:
# Scraping each composer's catalogue.
comp_url = {}
for composer, url in zip(data["composer"], data["catalogue_url"]):
    if composer not in comp_url.keys():
        comp_url[composer] = requests.get(url).text
        comp_url[composer] = BeautifulSoup(comp_url[composer], "html.parser")

In [7]:
# Defining scraping function.
def extractloc(song, composer):
    
    for t in comp_url[composer].find_all("table"):
        if song in t.text:
            date = t.find_previous("p", class_="date")
            locdate = date.text.strip() if date else None
            
            releasecode = t.find_previous("i")
            label = releasecode.text.strip() if releasecode else None
            
            return (locdate, label)

    return (None, None)

In [8]:
# Applying scraping function.
locs = {}
labels = {}
for song, composer in zip(data["song"], data["composer"]):
    locs[song] = extractloc(song, composer)[0]
    labels[song] = extractloc(song, composer)[1]

### Cleaning

In [29]:
locations = ["CA", "IL", "NY", "NYC", "NJ", "LA"]
#locations = ["Los Angeles", "Chicago", "NYC", "Englewood Cliffs", "Hackensack", "Greenwich Village", "New Orleans"]
record_labels = ["Atlantic", "Bethlehem", "Blue Note", "Candid", "Contemporary", "CTI", "Debut", "Dial", "ECM", "ESP", 
                 "Fantasy", "Impulse!", "Landmark", "Mercury", "Pablo", "Pacific", "Prestige", "Riverside", "Savoy",
                 "Verve", "Capitol", "Columbia", "Warner Bros"]

song_loc = {}
song_lab = {}

for s, l in locs.items():
    if l is not None:
        for town in locations:
            if town in l:
                song_loc[s] = town
                break 
        else:
            song_loc[s] = None
    else:
        song_loc[s] = None
            
for s, l in labels.items():
    if l is not None:
        for rl in record_labels:
            if rl in l:
                song_lab[s] = rl
                break
            else:
                song_lab[s] = None
    else:
        song_lab[s] = None

In [30]:
locdf = data[["song_original", "composer"]].copy()
locdf["location"] = None

for s, l in song_loc.items():
    if s in locdf.index:
        locdf.loc[s, "location"] = l

locdf = locdf.dropna(axis=0, how="any")

locdf.to_csv("locations.csv")


labdf = data[["song_original", "composer"]].copy()
labdf["label"] = None

for s, l in song_lab.items():
    if s in labdf.index:
        labdf.loc[s, "label"] = l

labdf = labdf.dropna(axis=0, how="any")

#labdf.to_csv("labels.csv")

In [31]:
Counter(list(locdf["location"]))

Counter({'NY': 180, 'NJ': 103, 'CA': 37, 'IL': 4})