# Extra Aufgaben

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
PATH = "gdrive/MyDrive/application-project-abgabe"

## Filterung der Datenbank um irrelevante Ergebnisse zu verhindern

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, polynomial_kernel, sigmoid_kernel, rbf_kernel, laplacian_kernel, chi2_kernel, euclidean_distances, manhattan_distances, cosine_distances
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
import time
from ast import literal_eval

In [170]:
#by_isrc = pd.read_csv('data/by_isrc.csv')
by_isrc = pd.read_csv(f"{PATH}/by_isrc.csv")

In [171]:
by_isrc["country_code"] = by_isrc["isrc"].apply(lambda x: x[:2])

In [172]:
by_isrc["country_code"].unique()

array(['AE', 'AR', 'AT', 'AU', 'AZ', 'BA', 'BC', 'BE', 'BG', 'BR', 'BX',
       'Br', 'CA', 'CH', 'CI', 'CL', 'CN', 'CO', 'CS', 'CZ', 'DE', 'DG',
       'DK', 'EE', 'EG', 'ES', 'FI', 'FR', 'FX', 'GB', 'GN', 'GR', 'GT',
       'GX', 'HK', 'HR', 'HU', 'ID', 'IE', 'IL', 'IN', 'IR', 'IS', 'IT',
       'JM', 'JP', 'KE', 'KR', 'KS', 'LB', 'LK', 'LT', 'LU', 'LV', 'MD',
       'MM', 'MT', 'MU', 'MX', 'MY', 'NG', 'NI', 'NL', 'NO', 'NZ', 'No',
       'PE', 'PH', 'PL', 'PT', 'QM', 'QZ', 'RE', 'RO', 'RS', 'RU', 'SE',
       'SG', 'SI', 'SK', 'SM', 'SW', 'TC', 'TH', 'TR', 'TW', 'UA', 'UK',
       'UR', 'US', 'UY', 'VE', 'VN', 'ZA', 'ZZ', 'ae', 'ca', 'gb', 'il',
       'lv', 'nl', 'qz', 'us', 've'], dtype=object)

Einige Ländercodes sind kleingeschrieben. Diese sollen in Großbuchstaben umgewandelt werden.

In [173]:
by_isrc["country_code"] = by_isrc["country_code"].str.upper()

In [174]:
by_isrc["country_code"].unique()

array(['AE', 'AR', 'AT', 'AU', 'AZ', 'BA', 'BC', 'BE', 'BG', 'BR', 'BX',
       'CA', 'CH', 'CI', 'CL', 'CN', 'CO', 'CS', 'CZ', 'DE', 'DG', 'DK',
       'EE', 'EG', 'ES', 'FI', 'FR', 'FX', 'GB', 'GN', 'GR', 'GT', 'GX',
       'HK', 'HR', 'HU', 'ID', 'IE', 'IL', 'IN', 'IR', 'IS', 'IT', 'JM',
       'JP', 'KE', 'KR', 'KS', 'LB', 'LK', 'LT', 'LU', 'LV', 'MD', 'MM',
       'MT', 'MU', 'MX', 'MY', 'NG', 'NI', 'NL', 'NO', 'NZ', 'PE', 'PH',
       'PL', 'PT', 'QM', 'QZ', 'RE', 'RO', 'RS', 'RU', 'SE', 'SG', 'SI',
       'SK', 'SM', 'SW', 'TC', 'TH', 'TR', 'TW', 'UA', 'UK', 'UR', 'US',
       'UY', 'VE', 'VN', 'ZA', 'ZZ'], dtype=object)

Gruppierung der Songs nach den Ländercodes.

In [175]:
countries = {
    'AE': 'United Arab Emirates',
    'AR': 'Argentina',
    'AT': 'Austria',
    'AU': 'Australia',
    'AZ': 'Azerbaijan',
    'BA': 'Bosnia and Herzegovina',
    'BC': 'Canada (British Columbia)',
    'BE': 'Belgium',
    'BG': 'Bulgaria',
    'BR': 'Brazil',
    'BX': 'Benelux',
    'CA': 'Canada',
    'CH': 'Switzerland',
    'CI': 'Ivory Coast (Côte d\'Ivoire)',
    'CL': 'Chile',
    'CN': 'China',
    'CO': 'Colombia',
    'CS': 'Serbia and Montenegro',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'DG': 'Germany (East)',
    'DK': 'Denmark',
    'EE': 'Estonia',
    'EG': 'Egypt',
    'ES': 'Spain',
    'FI': 'Finland',
    'FR': 'France',
    'FX': 'France (Metropolitan)',
    'GB': 'United Kingdom',
    'GN': 'Guinea',
    'GR': 'Greece',
    'GT': 'Guatemala',
    'GX': 'Greece (Ancient)',
    'HK': 'Hong Kong',
    'HR': 'Croatia',
    'HU': 'Hungary',
    'ID': 'Indonesia',
    'IE': 'Ireland',
    'IL': 'Israel',
    'IN': 'India',
    'IR': 'Iran',
    'IS': 'Iceland',
    'IT': 'Italy',
    'JM': 'Jamaica',
    'JP': 'Japan',
    'KE': 'Kenya',
    'KR': 'South Korea',
    'KS': 'Kosovo',
    'LB': 'Lebanon',
    'LK': 'Sri Lanka',
    'LT': 'Lithuania',
    'LU': 'Luxembourg',
    'LV': 'Latvia',
    'MD': 'Moldova',
    'MM': 'Myanmar (Burma)',
    'MT': 'Malta',
    'MU': 'Mauritius',
    'MX': 'Mexico',
    'MY': 'Malaysia',
    'NG': 'Nigeria',
    'NI': 'Nicaragua',
    'NL': 'Netherlands',
    'NO': 'Norway',
    'NZ': 'New Zealand',
    'PE': 'Peru',
    'PH': 'Philippines',
    'PL': 'Poland',
    'PT': 'Portugal',
    'QM': 'Marshall Islands', #second country code for the United States
    'QZ': 'Unknown or Invalid Area',
    'RE': 'Réunion',
    'RO': 'Romania',
    'RS': 'Serbia',
    'RU': 'Russia',
    'SE': 'Sweden',
    'SG': 'Singapore',
    'SI': 'Slovenia',
    'SK': 'Slovakia',
    'SM': 'San Marino',
    'SW': 'Sweden',
    'TC': 'Turks and Caicos Islands',
    'TH': 'Thailand',
    'TR': 'Turkey',
    'TW': 'Taiwan',
    'UA': 'Ukraine',
    'UK': 'United Kingdom',
    'UR': 'Uruguay',
    'US': 'United States',
    'UY': 'Uruguay',
    'VE': 'Venezuela',
    'VN': 'Vietnam',
    'ZA': 'South Africa',
    'ZZ': 'Unknown or Invalid Area'
}

Erstellen eines neuen Features `country`, welches den Namen des jeweiligen Landes beinhaltet.

In [176]:
by_isrc["country"] = by_isrc["country_code"].apply(lambda x: countries[x])

Gruppieren der Daten nach `country` und `country_code`. Dies soll genutzt werden, um zu analysieren, aus welchem Land am meisten Songs veröffentlicht worden sind.

In [177]:
grouped_countries = by_isrc.copy()
grouped_countries = grouped_countries[["country_code", "country"]]
grouped_countries["num_countries"] = 0
grouped_countries = grouped_countries.groupby(["country_code", "country"]).count()

In [178]:
grouped_countries = grouped_countries.sort_values("num_countries", ascending=False)

In [179]:
grouped_countries.reset_index(inplace=True)

In [180]:
# country und country_code werden kombiniert für eine bessere Darstellung
grouped_countries["country_comb"] = grouped_countries.apply(lambda x: f'{x["country"]}-{x["country_code"]}', axis=1)

In [181]:
fig = px.bar(grouped_countries[:30], x="country_comb", y="num_countries", template="plotly_dark")
fig.update_layout(
    height=500,
    title="Veröffentlichte Songs pro Land (Top 30)",
    xaxis_title="Land",
    yaxis_title="Veröffentlichte Songs"
)
fig.show()

Es sollen alle Lieder entfernt werden welche nicht aus englischsprachigen Länger (US, UK, CA, AU, Neuseeland) sowie Deutschland, Schweiz, Österreich, Italien und Schweden stammen.

In [182]:
important_country_codes = ["US", "GB", "CA", "AU", "NZ", "DE", "CH", "AT", "IT", "SE"]

# kleinere Inselgegenden von Amerika: Baker Island, Howland Island, Jarvis Island, Johnston Atoll, Kingman Reef, Midway Islands, Navassa Island, Palmyra Atoll, Wake Island
minor_insular_areas_us = ["XB", "XH", "XQ", "XU", "XM", "QM", "XV", "XL", "QW"]
important_country_codes.extend(minor_insular_areas_us)

`QM` wird heute als zusätzlicher Country code für Amerika verwendet, da der Code `US` sein Limit erreicht hat. Da dieser Beschluss jedoch aus 2010 stammt, sollten die Lieder der 80er nicht beeinflusst werden?

In [183]:
by_isrc[by_isrc["country_code"] == "QM"].sort_values("popularity", ascending=False)[["isrc", "genres", "name", "artists", "popularity", "chart_power"]]

Unnamed: 0,isrc,genres,name,artists,popularity,chart_power
274154,QMKHM1600219,"['hard-rock', 'metal', 'rock']",master of puppets,metallica,80,
274107,QMKHM1600096,"['hard-rock', 'metal', 'rock']",for whom the bell tolls - remastered,metallica,75,
271243,QMFME1326440,"['latin', 'pop']",tu dama de hierro,marisela,71,
274108,QMKHM1600097,"['hard-rock', 'metal', 'rock']",fade to black - remastered,metallica,70,
274057,QMKHM1600034,"['hard-rock', 'metal', 'rock']",seek & destroy - remastered,metallica,69,
...,...,...,...,...,...,...
267541,QM7281628419,['tango'],fuegos artificiales,juan d'arienzo y su orquesta típica,0,
267551,QM7281713701,['jazz'],raincheck,nick brignola,0,
267552,QM7281713702,['jazz'],tenderly,nick brignola,0,
267553,QM7281713703,['jazz'],hurricane connie,nick brignola,0,


Herausfiltern der Lieder aus den "wichtigen" Länder.

In [184]:
reduced_isrc = by_isrc.copy()
reduced_isrc = reduced_isrc[reduced_isrc["country_code"].isin(important_country_codes)]
reduced_isrc["genres"] = reduced_isrc["genres"].apply(literal_eval)

In [185]:
print(f"Die Anzahl der Lieder hat sich von {by_isrc.shape[0]} auf {reduced_isrc.shape[0]} verringert.")

Die Anzahl der Lieder hat sich von 416154 auf 251101 verringert.


Gruppieren der Daten nach den Genres => Welche Genres waren in den wichtigen Ländern relevant?

In [20]:
def filter_important_genres(df):
  important_genres = df.copy()
  important_genres = important_genres.explode("genres")
  important_genres["num"] = 0
  important_genres = important_genres[["genres", "num"]]
  important_genres = important_genres.groupby("genres").count()
  important_genres.reset_index(inplace=True)
  important_genres = important_genres.sort_values("num", ascending=False)
  return important_genres

In [21]:
filter_important_genres(reduced_isrc)

Unnamed: 0,genres,num
16,classical,63432
87,rock,60598
77,pop,48841
38,german,28985
62,jazz,22632
...,...,...
28,dubstep,2
12,cantopop,1
53,idm,1
69,metalcore,1


Am meisten Lieder wurden für das Genre Klassik veröffentlicht. Da dieses Genre nicht wirklich relevant ist, müssen zunächst die irrelevanten Genres entfernt werden. Hiebei gibt es zwei verschiedene Wege. Verfügt ein Künstler sowohl über ein erwünschtes als auch ein unerwünschtes Genre, so können entweder alle Songs von ihm behalten oder entfernt werden. Zunächst wird geprüft, wie häufig ein solcher Fall eintritt.

In [22]:
unwanted_genres = ["classical", "jazz", "folk", "french", "turkish", "gospel", "samba", "piano", "mpb", "sertanejo", "pagode", "sleep", "forro", "malay", "anime",
                 "j-idol", "comedy", "mandopop", "cantopop", "show-tunes", "emo", "romance", "j-dance", "chill", "world-music", "iranian", "idm", "metalcore",
                 "hardstyle", "opera", "k-pop", "j-pop", "j-rock", "happy"]

In [23]:
occ_unwanted_and_wanted = by_isrc.copy()
occ_unwanted_and_wanted = occ_unwanted_and_wanted[["genres"]]
occ_unwanted_and_wanted["genres"] = occ_unwanted_and_wanted["genres"].apply(literal_eval)
occ_unwanted_and_wanted = occ_unwanted_and_wanted.explode("genres")

In [24]:
wanted_genres = list(set(occ_unwanted_and_wanted["genres"].unique()).difference(unwanted_genres))

In [25]:
def filter_row(row):
    has_wanted_genre = any(genre in row['genres'] for genre in wanted_genres)
    has_not_wanted_genre = any(genre in row['genres'] for genre in unwanted_genres)
    return has_wanted_genre and has_not_wanted_genre

In [26]:
occ_unwanted_and_wanted_df = reduced_isrc[reduced_isrc.apply(filter_row, axis=1)]

In [27]:
print(f"Insgesamt gibt es {occ_unwanted_and_wanted_df.shape[0]} Lieder, welche über ein erwünschtes und ein unerwünschtes Genre verfügen.")

Insgesamt gibt es 48318 Lieder, welche über ein erwünschtes und ein unerwünschtes Genre verfügen.


In [28]:
print(f"Hierbei gibt es {len(list(occ_unwanted_and_wanted_df.genres.astype(str).unique()))} verschiedene Kombinationen der Genres.")

Hierbei gibt es 582 verschiedene Kombinationen der Genres.


## Visualisierung Kombination

Es werden alle Songs entfernt, welche mindestens ein unerwünschtes Feature beinhalten.

In [29]:
by_isrc_sub = by_isrc.copy()
by_isrc_sub = by_isrc_sub[['isrc', 'genres']]
by_isrc_sub = by_isrc_sub[~by_isrc_sub['genres'].str.contains('|'.join(unwanted_genres))]
by_isrc_sub['genres'] = by_isrc_sub['genres'].apply(literal_eval)

Das neu entstandene DataFrame soll nun mit dem bereits auf die wichtigen Länder reduzierten DataFrame gemerged werden.

In [None]:
#reduced_isrc.drop(columns="genres", inplace=True)

In [30]:
reduced_isrc.shape, by_isrc_sub.shape

((251101, 27), (220702, 2))

In [31]:
reduced_isrc = reduced_isrc.merge(by_isrc_sub[["isrc"]], how="inner", on="isrc")

In [32]:
print(f"Die Anzahl der Songs hat sich weiter auf {reduced_isrc.shape[0]} verringert. Die Anzahl der Features ist bei {reduced_isrc.shape[1]} geblieben.")

Die Anzahl der Songs hat sich weiter auf 139629 verringert. Die Anzahl der Features ist bei 27 geblieben.


In [33]:
important_genres = filter_important_genres(reduced_isrc)
important_genres = important_genres.sort_values("num", ascending=False)
important_genres.head(10)

Unnamed: 0,genres,num
61,rock,52805
51,pop,43525
56,punk,16552
14,country,12353
47,metal,11743
35,hard-rock,9228
29,german,8980
20,disco,8065
70,swedish,7964
68,soul,7289


In [34]:
fig = px.bar(important_genres[:30], x="genres", y="num", template="plotly_dark")
fig.update_layout(
    height=500,
    title="Veröffentlichte Songs pro Genre in den wichtigen Ländern (Top 30)",
    xaxis_title="Genre",
    yaxis_title="Veröffentlichte Songs"
)
fig.show()

In [35]:
reduced_isrc.to_csv(f"{PATH}/reduced_isrc.csv")

## Auffindbarkeiten prüfen (in db und auf Spotify)

In [4]:
!pip install colab-env --upgrade

Collecting colab-env
  Downloading colab-env-0.2.0.tar.gz (4.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-dotenv<1.0,>=0.10.0 (from colab-env)
  Downloading python_dotenv-0.21.1-py3-none-any.whl (19 kB)
Building wheels for collected packages: colab-env
  Building wheel for colab-env (setup.py) ... [?25l[?25hdone
  Created wheel for colab-env: filename=colab_env-0.2.0-py3-none-any.whl size=3805 sha256=2e9725a2f4116beee32295c2f7d8aa2527ac58ce5c26cb337593bbba0cdde7c2
  Stored in directory: /root/.cache/pip/wheels/ae/36/4f/466c2cd4db5d08f317893a920c4a0f58a81459ee3bdb136d35
Successfully built colab-env
Installing collected packages: python-dotenv, colab-env
Successfully installed colab-env-0.2.0 python-dotenv-0.21.1


In [5]:
!pip install spotipy --upgrade

Collecting spotipy
  Downloading spotipy-2.23.0-py3-none-any.whl (29 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.0.0-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.1/250.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: redis, spotipy
Successfully installed redis-5.0.0 spotipy-2.23.0


In [6]:
!pip install ratelimit

Collecting ratelimit
  Downloading ratelimit-2.2.1.tar.gz (5.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ratelimit
  Building wheel for ratelimit (setup.py) ... [?25l[?25hdone
  Created wheel for ratelimit: filename=ratelimit-2.2.1-py3-none-any.whl size=5894 sha256=2a268fc9aaa35db76fc24c016628ac7cd2b48fce907bff37adc1d52d7e1bd89d
  Stored in directory: /root/.cache/pip/wheels/27/5f/ba/e972a56dcbf5de9f2b7d2b2a710113970bd173c4dcd3d2c902
Successfully built ratelimit
Installing collected packages: ratelimit
Successfully installed ratelimit-2.2.1


In [7]:
import colab_env
import os
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import requests
import json
from numpy import NaN
from tqdm import tqdm
import time
from ratelimit import limits, sleep_and_retry

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [167]:
SPOTIFY_CLIENT_ID = os.getenv("SPOTIFY_CLIENT_ID")
SPOTIFY_CLIENT_SECRET = os.getenv("SPOTIFY_CLIENT_SECRET")

In [168]:
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID, client_secret=SPOTIFY_CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [169]:
# wird benutzt um die Anfragen an die Spotify API auf 60 pro 30 Sekunden zu reduzieren
@sleep_and_retry
@limits(calls=30, period=30)
def call_api():
  return

In [8]:
chart_power_df = pd.read_excel(f"{PATH}/chart-power-scores_80s.xlsx")
chart_power_df = chart_power_df.applymap(lambda s: s.lower() if type(s) == str else s)
chart_power_df = chart_power_df[['Song', 'Artist', 'Points']].groupby(['Song', 'Artist']).sum()
chart_power_df.reset_index(inplace=True)

In [10]:
chart_power_df[chart_power_df["Artist"].str.contains("manuel")]

Unnamed: 0,Song,Artist,Points
312,das lied von manuel,manuel & pony,528


Abfrage aller Lieder von Künstlern, welche in den 80ern in den Charts waren

In [158]:
chart_power_df["Artist"].unique()

array(['paul hardcastle', 'c. c. catch', 'deborah sasson & mcl',
       'coast to coast', 'fiction factory', 'o.m.d.',
       'the rock steady crew', 'cutting crew', 'sandra',
       'bill medley & jennifer warnes', 'john lennon', 'beastie boys',
       'klaus lage band', 'schweizer', 'sigue sigue sputnik',
       'jennifer rush', 'the catch', 'dolly parton', 'nena',
       'george michael', 'feargal sharkey', 'phil collins', 'queen',
       'godley & creme', 'erasure', 'sheila e', 'will downing',
       "shakin' stevens", 'depeche mode',
       "shakin' stevens & bonnie tyler", 'duran duran',
       'nick straker band', 'bad boys blue', 'genesis',
       'steve miller band', 'katja ebstein', 'roger whittaker',
       'david bowie', 'andy borg', 'toto', 'rose laurens', 'princess',
       'saragossa band', 'john farnham', 'ricky king', 'quincy jones',
       'bill withers', 'rufus & chaka khan', 'inner city', 'radiorama',
       'simple minds', 'herbert grã¶nemeyer', 'sister sledge',
  

Wenn mehrere Künstler zusammen einen Song veröffentlich haben, sind diese hier mit einem `&` getrennt. Dies muss zunächst aufgelöst werden.

In [159]:
chart_power_df["Artist"] = chart_power_df["Artist"].str.split(" & ")
chart_power_df_explode = chart_power_df.explode("Artist", ignore_index=True)

In [160]:
chart_power_df_explode

Unnamed: 0,Song,Artist,Points
0,19,paul hardcastle,1600
1,'cause you're young,c. c. catch,998
2,(carmen) danger in her eyes,deborah sasson,531
3,(carmen) danger in her eyes,mcl,531
4,(do) the hucklebuck,coast to coast,697
...,...,...,...
2022,zu spät,die ã„rzte,605
2023,zuppa romana,schrott nach 8,601
2024,â€¦ und ganz doll mich (ich mag),rolf und seine freunde,1314
2025,über sieben brücken mußt du geh'n,karat,826


Einige Künstler und Lieder enthalten zudem Umlaute, welche zunächst entsprechend formatiert werden müssen (z.B. `Die Ärzte` sind als `Die ã„rzte` gespeichert).

In [161]:
replace = {
    "ã¶": "ö",
    "ã„": "ä",
    "ã–": "ö",
    "ã©": "e"
}

chart_power_df_explode["Song"] = chart_power_df_explode["Song"].replace(replace, regex=True)
chart_power_df_explode["Artist"] = chart_power_df_explode["Artist"].replace(replace, regex=True)

In [162]:
chart_power_df_explode[chart_power_df_explode["Song"] == "zu spät"]

Unnamed: 0,Song,Artist,Points
2022,zu spät,die ärzte,605


In [163]:
chart_power_df_explode[chart_power_df_explode["Artist"] == "die ã„rzte"]


Unnamed: 0,Song,Artist,Points


Weiterhin ist hinter einigen Künstlern das Herukunftsland in eckigen Klammern gekennzeichnet. Dies muss ebenfalls entfernet werden, da diese andernfalls nicht von der Spotify API gefunden werden.

In [164]:
chart_power_df_explode[chart_power_df_explode["Artist"].str.contains("\[")]

Unnamed: 0,Song,Artist,Points
64,all of me (boy oh boy),sabrina [it],841
85,amoureux solitaires,lio [be],1555
100,another life,kano [it],1460
187,boys (summertime love),sabrina [it],1444
198,bridge to your heart,wax [uk],614
256,change your mind,raff [it],736
314,cry softly,secret service [se],451
351,delirio mind,scotch [it],1120
390,disco band,scotch [it],1806
480,ein weißes blatt'l papier,relax [de],535


In [165]:
pattern = r' \[.*?\]'
chart_power_df_explode["Artist"] = chart_power_df_explode["Artist"].str.replace(pattern, '',regex=True)

In [166]:
chart_power_df_explode[chart_power_df_explode["Artist"].str.contains("\[")]

Unnamed: 0,Song,Artist,Points


Abfrage aller Lieder für Boney M.

In [12]:
def query(year: int, offset: int):
  res = sp.search(q=f"artist:Boney M. year:{year}", type="track", limit=10, offset=offset, market="DE")
  if (len(res["tracks"]["items"]) == 10):
    return query(year, offset + 10)
  return len(res["tracks"]["items"]) + offset

In [13]:
boney_songs = pd.DataFrame()
for year in range(1980, 1990):
  boney_songs = pd.concat([boney_songs, pd.DataFrame([[year, query(year, 0)]], columns=["year", "songs"])])

In [14]:
fig = px.bar(boney_songs, x="year", y="songs", title="Songs veröffentlich von Boney M. in den 80er Jahren")
fig.update_layout(
    height=500,
    template='plotly_dark',
    xaxis_title="Jahr",
    yaxis_title="Veröffentlichte Songs"
)
fig.show()

Einbauen des Genres in die query

In [15]:
def genre_query(artist: str, year: int, offset:int, genres):
  for genre in genres:
    call_api()
    res = sp.search(q=f"artist:{artist} year:{year} genre:{genre}", type="track", limit=50, offset=offset, market="DE")
    if (len(res["tracks"]["items"] ) > 1):
      print(year, genre)
      #print(res)

In [32]:
def test_genre_query(artist: str):
  genres = sp.recommendation_genre_seeds()['genres']
  for year in range(1980, 1990):
    print(year)
    genre_query(artist, year, 0, genres)

In [33]:
test_genre_query("Boney M.")

1980
1981
1982
1983
1984
1985
1986
1987
1988
1989


Es existieren keine Songs für Boney M. wenn ein Genre in der Abfrage spezifiziert wird. Eventuell sind die Genres das Problem für die fehlenden Daten.

In [89]:
by_isrc.sort_values("popularity", ascending=False)

Unnamed: 0,isrc,genres,name,artists,album,release_date,release_date_precision,uri,spotify_id,chart_power,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,country_code,country
132734,GBAAM0201110,['rock'],every breath you take,the police,Synchronicity (Remastered 2003),1983-06-17,day,spotify:track:1JSTJqkT5qHq8MDJnJbRE1,1JSTJqkT5qHq8MDJnJbRE1,1419.0,...,0.5430,0.002940,0.0714,0.7400,117.401,253920,4,1983,GB,United Kingdom
144156,GBALX8300190,"['piano', 'rock']",i'm still standing,elton john,Too Low For Zero,1983-05-30,day,spotify:track:1jDJFeK9x3OZboIAHsY9k2,1jDJFeK9x3OZboIAHsY9k2,1185.0,...,0.3560,0.121000,0.1400,0.7720,176.808,183440,4,1983,GB,United Kingdom
368082,USPR38619998,"['metal', 'rock']",livin' on a prayer,bon jovi,Slippery When Wet,1986-08-16,day,spotify:track:37ZJ0p5Jm13JPevGcx4SkF,37ZJ0p5Jm13JPevGcx4SkF,1112.0,...,0.0778,0.000214,0.2940,0.7950,122.511,249293,4,1986,US,United States
169272,GBCNR8500002,"['piano', 'pop', 'r-n-b', 'rock', 'singer-song...",running up that hill (a deal with god),kate bush,Hounds Of Love,1985-09-16,day,spotify:track:1PtQJZVZIdWIYdARpZRDFO,1PtQJZVZIdWIYdARpZRDFO,,...,0.7190,0.003080,0.0604,0.1940,108.376,298933,4,1985,GB,United Kingdom
6804,AUAP08000046,"['hard-rock', 'rock']",back in black,ac/dc,Back In Black,1980-07-25,day,spotify:track:08mG3Y1vljYA6bvDt4Wqkj,08mG3Y1vljYA6bvDt4Wqkj,,...,0.0110,0.009650,0.0828,0.7630,188.386,255493,4,1980,AU,Australia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174547,GBF078300092,['classical'],"serenade in g, k.525 ""eine kleine nachtmusik"" ...","wolfgang amadeus mozart,salomon quartet,barry guy",Mozart: Eine kleine Nachtmusik; Notturno; Sere...,1984-06-03,day,spotify:track:1Sbp0G9AUAQdhXinEiDWsl,1Sbp0G9AUAQdhXinEiDWsl,,...,0.5170,0.000000,0.2780,0.1510,124.043,241373,3,1984,GB,United Kingdom
174528,GBF078300017,['classical'],"alexander nevsky, op. 78: 7. alexander's entry...","sergei prokofiev,the cleveland orchestra choru...",Prokofiev: Alexander Nevsky,1984-01-01,day,spotify:track:3EZH0BH50us07EUPKzhw3u,3EZH0BH50us07EUPKzhw3u,,...,0.9160,0.003540,0.1860,0.1460,145.809,277053,4,1984,GB,United Kingdom
174527,GBF078300016,['classical'],"alexander nevsky, op. 78: 6. field of the dead","sergei prokofiev,irina arkhipova,cleveland orc...",Prokofiev: Alexander Nevsky,1984-01-01,day,spotify:track:4EXE7ZRcKVwD1xxNZLEOPj,4EXE7ZRcKVwD1xxNZLEOPj,,...,0.9160,0.004890,0.1180,0.0399,83.968,386533,4,1984,GB,United Kingdom
174526,GBF078300015,['classical'],"alexander nevsky, op. 78: 5. the battle on the...","sergei prokofiev,the cleveland orchestra choru...",Prokofiev: Alexander Nevsky,1984-01-01,day,spotify:track:3jXNm9r9WAwj8qJvBKoXg1,3jXNm9r9WAwj8qJvBKoXg1,,...,0.6560,0.787000,0.1470,0.0637,100.909,801840,4,1984,GB,United Kingdom


Abfrage nach dem Künstler Elton John.

In [29]:
test_genre_query("Elton John")

1980 piano
1980 rock
1981 piano
1981 rock
1982 piano
1982 rock
1983 piano
1983 rock


KeyboardInterrupt: ignored

Für Elton John existieren Genres. Im nächsten Schritt soll überprüft werden, ob die Genres auch in dem Response der Spotify API wiedergefunden wird.

In [28]:
res = sp.search(q="artist:Elton John", type="track", market="DE", limit=50, offset=0)
res = res["tracks"]
if "items" in res:
  for item in res["items"]:
    if "artists" in item:
      #print(item["artists"]["genres"] if "genres" in item["artists"] else print("No genres available!"))
      #print(item["artists"])
      for artist in item["artists"]:
        print(artist["genres"] if "genres" in artist else artist.keys())
    else:
      print("No artists in item")
else:
  print("No items in res")
  print(res.keys())


dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(

Die Spotify gibt auch für Künstler die ein Genre zugewiesen bekommen haben kein Genre zurück. Laut API sollte es ein weiteres Attribut `genres` geben.

Heraussuchen von Künstlern, welche in den Charts waren, sich aber nicht im Datensatz der ersten Abfrage befinden. Eventuell wurde diesen ja auch kein Genre zugewiesen.

In [197]:
df_artists = by_isrc.copy()
df_artists = df_artists.drop_duplicates(["name","year"])

In [198]:
print('Occurences of substring ", ": ' + str(len(df_artists.loc[df_artists["artists"].str.contains(', ')])))
print('Occurences of substring "_":' + str(len(df_artists.loc[df_artists["artists"].str.contains('_')])))

Occurences of substring ", ": 2586
Occurences of substring "_":0


In [199]:
df_artists.loc[df_artists["artists"].str.contains(', '), 'artists'] = df_artists[df_artists["artists"].str.contains(', ')]['artists'].str.replace(', ', '_')

In [200]:
print('Occurences of substring ", ": ' + str(len(df_artists.loc[df_artists["artists"].str.contains(', ')])))
print('Occurences of substring "_":' + str(len(df_artists.loc[df_artists["artists"].str.contains('_')])))

Occurences of substring ", ": 0
Occurences of substring "_":2586


In [201]:
df_artists['artists'] = df_artists['artists'].str.split(',')
df_split_artists = df_artists.explode('artists')

In [202]:
print('Occurences of substring ", ": ' + str(len(df_split_artists.loc[df_split_artists["artists"].str.contains(', ')])))
print('Occurences of substring "_":' + str(len(df_split_artists.loc[df_split_artists["artists"].str.contains('_')])))

Occurences of substring ", ": 0
Occurences of substring "_":2709


In [203]:
df_split_artists.loc[df_split_artists['artists'].str.contains('_'), 'artists'] = df_split_artists[df_split_artists["artists"].str.contains('_')]['artists'].str.replace('_', ', ')

In [204]:
print('Occurences of substring ", ": ' + str(len(df_split_artists.loc[df_split_artists["artists"].str.contains(', ')])))
print('Occurences of substring "_":' + str(len(df_split_artists.loc[df_split_artists["artists"].str.contains('_')])))

Occurences of substring ", ": 2709
Occurences of substring "_":0


In [207]:
chart_artists = list(chart_power_df_explode["Artist"].unique())
unique_artists_in_dataframe = set(df_split_artists["artists"])
artists_not_in_dataframe = [artist for artist in chart_artists if artist not in unique_artists_in_dataframe]

In [210]:
len(artists_not_in_dataframe),len(chart_artists)

(464, 944)

In [215]:
"|".join(artists_not_in_dataframe)



In [218]:
chart_power_df_explode[chart_power_df_explode["Artist"].str.contains("|".join(artists_not_in_dataframe))].sort_values("Points", ascending=False)


This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.



Unnamed: 0,Song,Artist,Points
320,dance little bird,electronica's,4071
1109,maid of orleans (the waltz of joan of arc),o.m.d.,3254
44,adios amor,andy borg,3233
2005,"you want love (maria, maria...)",mixed emotions,3102
1438,santa maria,roland kaiser,2742
...,...,...,...
51,"ahoi, ay ay capt'n",ricky king,422
677,harden my heart,quarterflash,411
1566,stars on 45 vol. iii,stars on 45,410
1529,solomon gundie,amanda lear,406


In [220]:
sp.search(q="track=adios amor artist=andy borg", type="track", market="DE")

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=track%3Dadios+amor+artist%3Dandy+borg&type=track&market=DE&offset=0&limit=10',
  'items': [{'album': {'album_type': 'album',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/02cwf63pmoVJ5xtq7JDED4'},
       'href': 'https://api.spotify.com/v1/artists/02cwf63pmoVJ5xtq7JDED4',
       'id': '02cwf63pmoVJ5xtq7JDED4',
       'name': 'Andy Borg',
       'type': 'artist',
       'uri': 'spotify:artist:02cwf63pmoVJ5xtq7JDED4'}],
     'external_urls': {'spotify': 'https://open.spotify.com/album/2NjEAyk4dClc659U5HpH5z'},
     'href': 'https://api.spotify.com/v1/albums/2NjEAyk4dClc659U5HpH5z',
     'id': '2NjEAyk4dClc659U5HpH5z',
     'images': [{'height': 640,
       'url': 'https://i.scdn.co/image/ab67616d0000b273066d9f8c1c69a5d7136ea64a',
       'width': 640},
      {'height': 300,
       'url': 'https://i.scdn.co/image/ab67616d00001e02066d9f8c1c69a5d7136ea64a',
       'width': 300},
      {'height': 64

In [191]:
import pandas as pd

# Sample DataFrame with a 'Song' and 'Artist' column
data = {'Song': ['Song 1', 'Song 2', 'Song 3', 'Song 4'],
        'Artist': ['Artist A,Artist E', 'Artist B', 'Artist C', 'Artist D']}
df = pd.DataFrame(data)

# List of artists to check
artist_list = ['Artist A', 'Artist B', 'Artist E', 'Artist F']

# Create a set of unique artists from the DataFrame
unique_artists_in_dataframe = set(df['Artist'])

# Check which artists from the list are not in the DataFrame
artists_not_in_dataframe = [artist for artist in artist_list if artist not in unique_artists_in_dataframe]

# Print the artists that are not in the DataFrame
print("Artists not in DataFrame:", artists_not_in_dataframe)

Artists not in DataFrame: ['Artist A', 'Artist E', 'Artist F']


In [196]:
by_isrc[by_isrc["artists"].str.contains(", ")]

Unnamed: 0,isrc,genres,name,artists,album,release_date,release_date_precision,uri,spotify_id,chart_power,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,country_code,country
8349,AULI00617780,"['alt-rock', 'alternative', 'rock']",yes sir i can boogie,"not drowning, waving",Cold And The Crackle,1987-08-05,day,spotify:track:5Yc1yBhLVV3XmUPSWviBiu,5Yc1yBhLVV3XmUPSWviBiu,,...,0.7100,0.947000,0.0955,0.7280,112.022,285133,4,1987,AU,Australia
8350,AULI00617790,"['alt-rock', 'alternative', 'rock']",cold and the crackle,"not drowning, waving",Cold And The Crackle,1987-08-05,day,spotify:track:3CNrpEcFniwyApyVi2WLuN,3CNrpEcFniwyApyVi2WLuN,,...,0.2760,0.000447,0.1200,0.0384,180.463,300400,3,1987,AU,Australia
8351,AULI00617800,"['alt-rock', 'alternative', 'rock']",plog,"not drowning, waving",Cold And The Crackle,1987-08-05,day,spotify:track:1dhIi1c08iZ4kOCD1jY6FV,1dhIi1c08iZ4kOCD1jY6FV,,...,0.0134,0.840000,0.0456,0.5700,107.969,193413,3,1987,AU,Australia
8352,AULI00617820,"['alt-rock', 'alternative', 'rock']",brother norbert,"not drowning, waving",Cold And The Crackle,1987-08-05,day,spotify:track:5CbtRJMxKeeXX3YAR8RDcY,5CbtRJMxKeeXX3YAR8RDcY,,...,0.1340,0.745000,0.1140,0.3020,146.790,310360,4,1987,AU,Australia
8353,AULI00617830,"['alt-rock', 'alternative', 'rock']",little king,"not drowning, waving",Cold And The Crackle,1987-08-05,day,spotify:track:3sCF5bolUl3Y0NoFH3kQZz,3sCF5bolUl3Y0NoFH3kQZz,,...,0.2440,0.128000,0.3250,0.5930,129.637,147760,4,1987,AU,Australia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406215,USY252022966,"['country', 'folk', 'rock']",comes a time,"crosby, stills, nash & young",A Bridge Of Spies,1988,year,spotify:track:1MeG0A4ulWScjNbwb5olwv,1MeG0A4ulWScjNbwb5olwv,,...,0.7230,0.000243,0.2090,0.4890,177.562,122932,4,1988,US,United States
406216,USY252022967,"['country', 'folk', 'rock']",sugar mountain,"crosby, stills, nash & young",A Bridge Of Spies,1988,year,spotify:track:4zXRj7OE7bkDuFRONZQs7O,4zXRj7OE7bkDuFRONZQs7O,,...,0.6930,0.000346,0.7710,0.5540,176.987,349240,4,1988,US,United States
406217,USY252022968,"['country', 'folk', 'rock']",this note's for you,"crosby, stills, nash & young",A Bridge Of Spies,1988,year,spotify:track:3wG63YRzBre561nWPEPNOw,3wG63YRzBre561nWPEPNOw,,...,0.7680,0.000902,0.9840,0.2450,101.803,238723,4,1988,US,United States
406218,USY252022969,"['country', 'folk', 'rock']",believe,"crosby, stills, nash & young",A Bridge Of Spies,1988,year,spotify:track:324crXJZ4LU1O0mHzNBuDi,324crXJZ4LU1O0mHzNBuDi,,...,0.9530,0.016000,0.9540,0.2640,81.206,260607,3,1988,US,United States


In [45]:
len(chart_power_df_explode["Artist"].unique())

944

In [150]:
def filter_track_features(track, genre):
    '''
    Filters the relevant features of a track in returns them in JSON object.

    Parameter
    ---------
    track: Object
        Track returend by the spotify API

    genre: string
        Genre that should be used

    Return
    ------
    relevant_features: Object
        JSON Object that contains the relevant featues
    '''

    call_api()
    features = sp.audio_features(track['id'])[0]

    external_ids = track['external_ids'] if 'external_ids' in track else {}
    isrc = external_ids['isrc'] if 'isrc' in external_ids else NaN
    artist_names = []

    if 'artists' in track and type(track['artists']) == list:
        for artist in track['artists']:
            if 'name' in artist:
                artist_names.append(artist['name'])

    artist_names = ','.join(artist_names)

    if 'album' in track:
        album = track['album']['name'] if 'name' in track['album'] else NaN
        release_date = track['album']['release_date'] if 'release_date' in track['album'] else NaN
        release_date_precision = track['album']['release_date_precision'] if 'release_date_precision' in track['album'] else NaN
    else:
         album = NaN
         release_date = NaN
         release_date_precision = NaN


    track_name = track['name'] if 'name' in track else NaN
    if track_name != NaN:
        points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
        if points.empty:
             points = NaN
        else:
            points = int(points)
    else:
         points = NaN


    return {
        'name': track_name,
        'artists': artist_names,
        'album': album,
        'release_date': release_date,
        'release_date_precision': release_date_precision,
        'spotify_id': track['id'] if 'id' in track else NaN,
        'chart_power': points,
        'uri': track['uri'] if 'uri' in track else NaN,
        'popularity': track['popularity'] if 'popularity' in track else NaN,
        'genres': NaN if genre == False else genre,
        'danceability': features['danceability'] if 'danceability' in features else NaN,
        'energy': features['energy'] if 'energy' in features else NaN,
        'key': features['key'] if 'key' in features else NaN,
        'loudness': features['loudness'] if 'loudness' in features else NaN,
        'mode': features['mode'] if 'mode' in features else NaN,
        'speechiness': features['speechiness'] if 'speechiness' in features else NaN,
        'acousticness': features['acousticness'] if 'acousticness' in features else NaN,
        'instrumentalness': features['instrumentalness'] if 'instrumentalness' in features else NaN,
        'liveness': features['liveness'] if 'liveness' in features else NaN,
        'valence': features['valence'] if 'valence' in features else NaN,
        'tempo': features['tempo'] if 'tempo' in features else NaN,
        'duration_ms': features['duration_ms'] if 'duration_ms' in features else NaN,
        'time_signature': features['time_signature'] if 'time_signature' in features else NaN,
        'isrc': isrc,
    }

In [141]:
def get_number_of_tracks(release_year, artist, genre):
    '''
    Retrieves the number of tracks the spotfiy API returns for a specific query.
    '''
    try:
        call_api()
        if genre == False:
          result = sp.search(q=f'year:{release_year} artist:{artist}', type='track', limit=1, offset=0, market='DE')
        else:
          result = sp.search(q=f'year:{release_year} artist:{artist} genre:{genre}', type='track', limit=1, offset=0, market='DE')
        tracks = result['tracks'] if 'tracks' in result else ''
        return tracks['total'] if 'total' in tracks else 0
    except Exception as e:
        print(e)
        #error_logger.error(e)
    return 0

In [153]:
genres = ['chicago-house', 'hard-rock', 'rock', 'power-pop', 'guitar', 'hardcore',
          'groove', 'songwriter', 'garage', 'disco', 'grunge', 'ambient',
          'minimal-techno', 'brazil', 'indie', 'detroit-techno', 'electronic',
          'edm', 'dubstep', 'children', 'drum-and-bass', 'indian', 'death-metal',
          'club', 'salsa', 'bluegrass', 'metal', 'reggae', 'tango', 'black-metal',
          'british', 'german', 'punk', 'reggaeton', 'alternative', 'soul', 'r-n-b',
          'goth', 'pop-film', 'breakbeat', 'indie-pop', 'heavy-metal', 'dance',
          'honky-tonk', 'dancehall', 'dub', 'singer-songwriter', 'spanish',
          'deep-house', 'rock-n-roll', 'techno', 'hip-hop', 'punk-rock', 'industrial',
          'afrobeat', 'trip-hop', 'funk', 'blues', 'swedish', 'latino', 'alt-rock',
          'country', 'acoustic', 'trance', 'grindcore', 'ska', 'house', 'progressive-house',
          'new-age', 'electro', 'rockabilly', 'party', 'pop', 'synth-pop', 'latin',
          'psych-rock']

In [None]:
artist_over_1000 = []
df = pd.DataFrame()
with tqdm(total=len(genres)*len(chart_power_df_explode["Artist"].unique())) as pbar:
  for artist in chart_power_df_explode["Artist"].unique():
    for genre in genres:
      total_results = get_number_of_tracks("1980-1989", artist, genre)
      if total_results < 1000:
        offset = 0
        while offset < total_results:
          try:
            track_features = []
            call_api()
            result = sp.search(q=f"year:1980-1989 artist:{artist} genre:{genre}", type="track", limit=50, offset=offset, market="DE")
            tracks = result["tracks"] if "tracks" in result else ""
            if "items" in tracks:
              for track in tracks["items"]:
                features = filter_track_features(track, genre)
                track_features.append(features)
              offset += 50
              df = pd.concat([df, pd.DataFrame(track_features)], ignore_index=True)
              df.to_scv
            else:
              continue
          except Exception as e:
            print(e)
      else:
        print(f"Artist {artist} has published over 1000 songs in the 80s")
        artist_over_1000.append(artist)

      pbar.update(1)
      df.to_csv(f"{PATH}/data-new.csv")

In [None]:
artist_over_1000 = []
df = pd.DataFrame()
with tqdm(total=len(chart_power_df_explode["Artist"].unique())) as pbar:
  for artist in chart_power_df_explode["Artist"].unique():
    total_results = get_number_of_tracks("1980-1989", artist, False)
    if total_results < 1000:
      offset = 0
      while offset < total_results:
        try:
          track_features = []
          call_api()
          result = sp.search(q=f"year:1980-1989 artist:{artist}", type="track", limit=50, offset=offset, market="DE")
          tracks = result["tracks"] if "tracks" in result else ""
          if "items" in tracks:
            for track in tracks["items"]:
              features = filter_track_features(track, False)
              track_features.append(features)
            offset += 50
            df = pd.concat([df, pd.DataFrame(track_features)], ignore_index=True)
            df.to_csv(f"{PATH}/data-new-without-genres.csv")
          else:
            continue
        except Exception as e:
          print(e)
    else:
      print(f"Artist {artist} has published over 1000 songs in the 80s")
      artist_over_1000.append(artist)

    pbar.update(1)