# Extra Aufgaben

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
LOCAL = False

In [3]:
if LOCAL:
  PATH = "./data"
else:
  PATH = "gdrive/MyDrive/application-project-abgabe"

## Filterung der Datenbank um irrelevante Ergebnisse zu verhindern

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, polynomial_kernel, sigmoid_kernel, rbf_kernel, laplacian_kernel, chi2_kernel, euclidean_distances, manhattan_distances, cosine_distances
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
import time
from ast import literal_eval

In [5]:
#by_isrc = pd.read_csv('data/by_isrc.csv')
by_isrc = pd.read_csv(f"{PATH}/by_isrc.csv")

In [None]:
by_isrc["country_code"] = by_isrc["isrc"].apply(lambda x: x[:2])

In [None]:
by_isrc["country_code"].unique()

array(['AE', 'AR', 'AT', 'AU', 'AZ', 'BA', 'BC', 'BE', 'BG', 'BR', 'BX',
       'Br', 'CA', 'CH', 'CI', 'CL', 'CN', 'CO', 'CS', 'CZ', 'DE', 'DG',
       'DK', 'EE', 'EG', 'ES', 'FI', 'FR', 'FX', 'GB', 'GN', 'GR', 'GT',
       'GX', 'HK', 'HR', 'HU', 'ID', 'IE', 'IL', 'IN', 'IR', 'IS', 'IT',
       'JM', 'JP', 'KE', 'KR', 'KS', 'LB', 'LK', 'LT', 'LU', 'LV', 'MD',
       'MM', 'MT', 'MU', 'MX', 'MY', 'NG', 'NI', 'NL', 'NO', 'NZ', 'No',
       'PE', 'PH', 'PL', 'PT', 'QM', 'QZ', 'RE', 'RO', 'RS', 'RU', 'SE',
       'SG', 'SI', 'SK', 'SM', 'SW', 'TC', 'TH', 'TR', 'TW', 'UA', 'UK',
       'UR', 'US', 'UY', 'VE', 'VN', 'ZA', 'ZZ', 'ae', 'ca', 'gb', 'il',
       'lv', 'nl', 'qz', 'us', 've'], dtype=object)

Einige Ländercodes sind kleingeschrieben. Diese sollen in Großbuchstaben umgewandelt werden.

In [None]:
by_isrc["country_code"] = by_isrc["country_code"].str.upper()

In [None]:
by_isrc["country_code"].unique()

array(['AE', 'AR', 'AT', 'AU', 'AZ', 'BA', 'BC', 'BE', 'BG', 'BR', 'BX',
       'CA', 'CH', 'CI', 'CL', 'CN', 'CO', 'CS', 'CZ', 'DE', 'DG', 'DK',
       'EE', 'EG', 'ES', 'FI', 'FR', 'FX', 'GB', 'GN', 'GR', 'GT', 'GX',
       'HK', 'HR', 'HU', 'ID', 'IE', 'IL', 'IN', 'IR', 'IS', 'IT', 'JM',
       'JP', 'KE', 'KR', 'KS', 'LB', 'LK', 'LT', 'LU', 'LV', 'MD', 'MM',
       'MT', 'MU', 'MX', 'MY', 'NG', 'NI', 'NL', 'NO', 'NZ', 'PE', 'PH',
       'PL', 'PT', 'QM', 'QZ', 'RE', 'RO', 'RS', 'RU', 'SE', 'SG', 'SI',
       'SK', 'SM', 'SW', 'TC', 'TH', 'TR', 'TW', 'UA', 'UK', 'UR', 'US',
       'UY', 'VE', 'VN', 'ZA', 'ZZ'], dtype=object)

Gruppierung der Songs nach den Ländercodes.

In [None]:
countries = {
    'AE': 'United Arab Emirates',
    'AR': 'Argentina',
    'AT': 'Austria',
    'AU': 'Australia',
    'AZ': 'Azerbaijan',
    'BA': 'Bosnia and Herzegovina',
    'BC': 'Canada (British Columbia)',
    'BE': 'Belgium',
    'BG': 'Bulgaria',
    'BR': 'Brazil',
    'BX': 'Benelux',
    'CA': 'Canada',
    'CH': 'Switzerland',
    'CI': 'Ivory Coast (Côte d\'Ivoire)',
    'CL': 'Chile',
    'CN': 'China',
    'CO': 'Colombia',
    'CS': 'Serbia and Montenegro',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'DG': 'Germany (East)',
    'DK': 'Denmark',
    'EE': 'Estonia',
    'EG': 'Egypt',
    'ES': 'Spain',
    'FI': 'Finland',
    'FR': 'France',
    'FX': 'France (Metropolitan)',
    'GB': 'United Kingdom',
    'GN': 'Guinea',
    'GR': 'Greece',
    'GT': 'Guatemala',
    'GX': 'Greece (Ancient)',
    'HK': 'Hong Kong',
    'HR': 'Croatia',
    'HU': 'Hungary',
    'ID': 'Indonesia',
    'IE': 'Ireland',
    'IL': 'Israel',
    'IN': 'India',
    'IR': 'Iran',
    'IS': 'Iceland',
    'IT': 'Italy',
    'JM': 'Jamaica',
    'JP': 'Japan',
    'KE': 'Kenya',
    'KR': 'South Korea',
    'KS': 'Kosovo',
    'LB': 'Lebanon',
    'LK': 'Sri Lanka',
    'LT': 'Lithuania',
    'LU': 'Luxembourg',
    'LV': 'Latvia',
    'MD': 'Moldova',
    'MM': 'Myanmar (Burma)',
    'MT': 'Malta',
    'MU': 'Mauritius',
    'MX': 'Mexico',
    'MY': 'Malaysia',
    'NG': 'Nigeria',
    'NI': 'Nicaragua',
    'NL': 'Netherlands',
    'NO': 'Norway',
    'NZ': 'New Zealand',
    'PE': 'Peru',
    'PH': 'Philippines',
    'PL': 'Poland',
    'PT': 'Portugal',
    'QM': 'Marshall Islands', #second country code for the United States
    'QZ': 'Unknown or Invalid Area',
    'RE': 'Réunion',
    'RO': 'Romania',
    'RS': 'Serbia',
    'RU': 'Russia',
    'SE': 'Sweden',
    'SG': 'Singapore',
    'SI': 'Slovenia',
    'SK': 'Slovakia',
    'SM': 'San Marino',
    'SW': 'Sweden',
    'TC': 'Turks and Caicos Islands',
    'TH': 'Thailand',
    'TR': 'Turkey',
    'TW': 'Taiwan',
    'UA': 'Ukraine',
    'UK': 'United Kingdom',
    'UR': 'Uruguay',
    'US': 'United States',
    'UY': 'Uruguay',
    'VE': 'Venezuela',
    'VN': 'Vietnam',
    'ZA': 'South Africa',
    'ZZ': 'Unknown or Invalid Area'
}

Erstellen eines neuen Features `country`, welches den Namen des jeweiligen Landes beinhaltet.

In [None]:
by_isrc["country"] = by_isrc["country_code"].apply(lambda x: countries[x])

Gruppieren der Daten nach `country` und `country_code`. Dies soll genutzt werden, um zu analysieren, aus welchem Land am meisten Songs veröffentlicht worden sind.

In [None]:
grouped_countries = by_isrc.copy()
grouped_countries = grouped_countries[["country_code", "country"]]
grouped_countries["num_countries"] = 0
grouped_countries = grouped_countries.groupby(["country_code", "country"]).count()

In [None]:
grouped_countries = grouped_countries.sort_values("num_countries", ascending=False)

In [None]:
grouped_countries.reset_index(inplace=True)

In [None]:
# country und country_code werden kombiniert für eine bessere Darstellung
grouped_countries["country_comb"] = grouped_countries.apply(lambda x: f'{x["country"]}-{x["country_code"]}', axis=1)

In [None]:
fig = px.bar(grouped_countries[:30], x="country_comb", y="num_countries", template="plotly_dark")
fig.update_layout(
    height=500,
    title="Veröffentlichte Songs pro Land (Top 30)",
    xaxis_title="Land",
    yaxis_title="Veröffentlichte Songs"
)
fig.show()

Es sollen alle Lieder entfernt werden welche nicht aus englischsprachigen Länger (US, UK, CA, AU, Neuseeland) sowie Deutschland, Schweiz, Österreich, Italien und Schweden stammen.

In [None]:
important_country_codes = ["US", "GB", "CA", "AU", "NZ", "DE", "CH", "AT", "IT", "SE"]

# kleinere Inselgegenden von Amerika: Baker Island, Howland Island, Jarvis Island, Johnston Atoll, Kingman Reef, Midway Islands, Navassa Island, Palmyra Atoll, Wake Island
minor_insular_areas_us = ["XB", "XH", "XQ", "XU", "XM", "QM", "XV", "XL", "QW"]
important_country_codes.extend(minor_insular_areas_us)

`QM` wird heute als zusätzlicher Country code für Amerika verwendet, da der Code `US` sein Limit erreicht hat. Da dieser Beschluss jedoch aus 2010 stammt, sollten die Lieder der 80er nicht beeinflusst werden?

In [None]:
by_isrc[by_isrc["country_code"] == "QM"].sort_values("popularity", ascending=False)[["isrc", "genres", "name", "artists", "popularity", "chart_power"]]

Unnamed: 0,isrc,genres,name,artists,popularity,chart_power
274154,QMKHM1600219,"['hard-rock', 'metal', 'rock']",master of puppets,metallica,80,
274107,QMKHM1600096,"['hard-rock', 'metal', 'rock']",for whom the bell tolls - remastered,metallica,75,
271243,QMFME1326440,"['latin', 'pop']",tu dama de hierro,marisela,71,
274108,QMKHM1600097,"['hard-rock', 'metal', 'rock']",fade to black - remastered,metallica,70,
274057,QMKHM1600034,"['hard-rock', 'metal', 'rock']",seek & destroy - remastered,metallica,69,
...,...,...,...,...,...,...
267541,QM7281628419,['tango'],fuegos artificiales,juan d'arienzo y su orquesta típica,0,
267551,QM7281713701,['jazz'],raincheck,nick brignola,0,
267552,QM7281713702,['jazz'],tenderly,nick brignola,0,
267553,QM7281713703,['jazz'],hurricane connie,nick brignola,0,


Herausfiltern der Lieder aus den "wichtigen" Länder.

In [None]:
reduced_isrc = by_isrc.copy()
reduced_isrc = reduced_isrc[reduced_isrc["country_code"].isin(important_country_codes)]
reduced_isrc["genres"] = reduced_isrc["genres"].apply(literal_eval)

In [None]:
print(f"Die Anzahl der Lieder hat sich von {by_isrc.shape[0]} auf {reduced_isrc.shape[0]} verringert.")

Die Anzahl der Lieder hat sich von 416154 auf 251101 verringert.


Gruppieren der Daten nach den Genres => Welche Genres waren in den wichtigen Ländern relevant?

In [None]:
def filter_important_genres(df):
  important_genres = df.copy()
  important_genres = important_genres.explode("genres")
  important_genres["num"] = 0
  important_genres = important_genres[["genres", "num"]]
  important_genres = important_genres.groupby("genres").count()
  important_genres.reset_index(inplace=True)
  important_genres = important_genres.sort_values("num", ascending=False)
  return important_genres

In [None]:
filter_important_genres(reduced_isrc)

Unnamed: 0,genres,num
16,classical,63432
87,rock,60598
77,pop,48841
38,german,28985
62,jazz,22632
...,...,...
28,dubstep,2
12,cantopop,1
53,idm,1
69,metalcore,1


Am meisten Lieder wurden für das Genre Klassik veröffentlicht. Da dieses Genre nicht wirklich relevant ist, müssen zunächst die irrelevanten Genres entfernt werden. Hiebei gibt es zwei verschiedene Wege. Verfügt ein Künstler sowohl über ein erwünschtes als auch ein unerwünschtes Genre, so können entweder alle Songs von ihm behalten oder entfernt werden. Zunächst wird geprüft, wie häufig ein solcher Fall eintritt.

In [None]:
unwanted_genres = ["classical", "jazz", "folk", "french", "turkish", "gospel", "samba", "piano", "mpb", "sertanejo", "pagode", "sleep", "forro", "malay", "anime",
                 "j-idol", "comedy", "mandopop", "cantopop", "show-tunes", "emo", "romance", "j-dance", "chill", "world-music", "iranian", "idm", "metalcore",
                 "hardstyle", "opera", "k-pop", "j-pop", "j-rock", "happy"]

In [None]:
occ_unwanted_and_wanted = by_isrc.copy()
occ_unwanted_and_wanted = occ_unwanted_and_wanted[["genres"]]
occ_unwanted_and_wanted["genres"] = occ_unwanted_and_wanted["genres"].apply(literal_eval)
occ_unwanted_and_wanted = occ_unwanted_and_wanted.explode("genres")

In [None]:
wanted_genres = list(set(occ_unwanted_and_wanted["genres"].unique()).difference(unwanted_genres))

In [None]:
def filter_row(row):
    has_wanted_genre = any(genre in row['genres'] for genre in wanted_genres)
    has_not_wanted_genre = any(genre in row['genres'] for genre in unwanted_genres)
    return has_wanted_genre and has_not_wanted_genre

In [None]:
occ_unwanted_and_wanted_df = reduced_isrc[reduced_isrc.apply(filter_row, axis=1)]

In [None]:
print(f"Insgesamt gibt es {occ_unwanted_and_wanted_df.shape[0]} Lieder, welche über ein erwünschtes und ein unerwünschtes Genre verfügen.")

Insgesamt gibt es 48318 Lieder, welche über ein erwünschtes und ein unerwünschtes Genre verfügen.


In [None]:
print(f"Hierbei gibt es {len(list(occ_unwanted_and_wanted_df.genres.astype(str).unique()))} verschiedene Kombinationen der Genres.")

Hierbei gibt es 582 verschiedene Kombinationen der Genres.


## Visualisierung Kombination

Es werden alle Songs entfernt, welche mindestens ein unerwünschtes Feature beinhalten.

In [None]:
by_isrc_sub = by_isrc.copy()
by_isrc_sub = by_isrc_sub[['isrc', 'genres']]
by_isrc_sub = by_isrc_sub[~by_isrc_sub['genres'].str.contains('|'.join(unwanted_genres))]
by_isrc_sub['genres'] = by_isrc_sub['genres'].apply(literal_eval)

Das neu entstandene DataFrame soll nun mit dem bereits auf die wichtigen Länder reduzierten DataFrame gemerged werden.

In [None]:
#reduced_isrc.drop(columns="genres", inplace=True)

In [None]:
reduced_isrc.shape, by_isrc_sub.shape

((251101, 27), (220702, 2))

In [None]:
reduced_isrc = reduced_isrc.merge(by_isrc_sub[["isrc"]], how="inner", on="isrc")

In [None]:
print(f"Die Anzahl der Songs hat sich weiter auf {reduced_isrc.shape[0]} verringert. Die Anzahl der Features ist bei {reduced_isrc.shape[1]} geblieben.")

Die Anzahl der Songs hat sich weiter auf 139629 verringert. Die Anzahl der Features ist bei 27 geblieben.


In [None]:
important_genres = filter_important_genres(reduced_isrc)
important_genres = important_genres.sort_values("num", ascending=False)
important_genres.head(10)

Unnamed: 0,genres,num
61,rock,52805
51,pop,43525
56,punk,16552
14,country,12353
47,metal,11743
35,hard-rock,9228
29,german,8980
20,disco,8065
70,swedish,7964
68,soul,7289


In [None]:
fig = px.bar(important_genres[:30], x="genres", y="num", template="plotly_dark")
fig.update_layout(
    height=500,
    title="Veröffentlichte Songs pro Genre in den wichtigen Ländern (Top 30)",
    xaxis_title="Genre",
    yaxis_title="Veröffentlichte Songs"
)
fig.show()

In [None]:
reduced_isrc.to_csv(f"{PATH}/reduced_isrc.csv")

## Auffindbarkeiten prüfen (in db und auf Spotify)

In [6]:
!pip install colab-env --upgrade

Collecting colab-env
  Downloading colab-env-0.2.0.tar.gz (4.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-dotenv<1.0,>=0.10.0 (from colab-env)
  Downloading python_dotenv-0.21.1-py3-none-any.whl (19 kB)
Building wheels for collected packages: colab-env
  Building wheel for colab-env (setup.py) ... [?25l[?25hdone
  Created wheel for colab-env: filename=colab_env-0.2.0-py3-none-any.whl size=3805 sha256=a3c1482a31faf30b67da4108ec3db9acddcad04b1f6139acad39eeac0281c879
  Stored in directory: /root/.cache/pip/wheels/ae/36/4f/466c2cd4db5d08f317893a920c4a0f58a81459ee3bdb136d35
Successfully built colab-env
Installing collected packages: python-dotenv, colab-env
Successfully installed colab-env-0.2.0 python-dotenv-0.21.1


In [7]:
!pip install spotipy --upgrade

Collecting spotipy
  Downloading spotipy-2.23.0-py3-none-any.whl (29 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.0.0-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.1/250.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: redis, spotipy
Successfully installed redis-5.0.0 spotipy-2.23.0


In [8]:
!pip install ratelimit

Collecting ratelimit
  Downloading ratelimit-2.2.1.tar.gz (5.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ratelimit
  Building wheel for ratelimit (setup.py) ... [?25l[?25hdone
  Created wheel for ratelimit: filename=ratelimit-2.2.1-py3-none-any.whl size=5894 sha256=2b309c0c9f9178cf6e11b6c1a70f961e34f96ca5d1bd1b844ce76ba05c06756f
  Stored in directory: /root/.cache/pip/wheels/27/5f/ba/e972a56dcbf5de9f2b7d2b2a710113970bd173c4dcd3d2c902
Successfully built ratelimit
Installing collected packages: ratelimit
Successfully installed ratelimit-2.2.1


In [9]:
import colab_env
import os
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import requests
import json
from numpy import NaN
from tqdm import tqdm
import time
from ratelimit import limits, sleep_and_retry

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [10]:
@sleep_and_retry
@limits(calls=60, period=30)
def call_api():
  return

In [11]:
chart_power_df = pd.read_excel(f"{PATH}/chart-power-scores_80s.xlsx")
chart_power_df = chart_power_df.applymap(lambda s: s.lower() if type(s) == str else s)
chart_power_df = chart_power_df[['Song', 'Artist', 'Points']].groupby(['Song', 'Artist']).sum()
chart_power_df.reset_index(inplace=True)

In [12]:
if LOCAL:
    from dotenv import load_dotenv
    load_dotenv()
    SPOTIFY_CLIENT_ID = os.environ.get("SPOTIFY_CLIENT_ID")
    SPOTIFY_CLIENT_SECRET = os.environ.get("SPOTIFY_CLIENT_SECRET")
else:
    SPOTIFY_CLIENT_ID = os.getenv("SPOTIFY_CLIENT_ID")
    SPOTIFY_CLIENT_SECRET = os.getenv("SPOTIFY_CLIENT_SECRET")

In [13]:
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID, client_secret=SPOTIFY_CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

Abfrage aller Lieder für Boney M.

In [14]:
def query(year: int, offset: int):
  res = sp.search(q=f"artist:Boney M. year:{year}", type="track", limit=10, offset=offset, market="DE")
  if (len(res["tracks"]["items"]) == 10):
    return query(year, offset + 10)
  return len(res["tracks"]["items"]) + offset

In [None]:
boney_songs = pd.DataFrame()
for year in range(1980, 1990):
  boney_songs = pd.concat([boney_songs, pd.DataFrame([[year, query(year, 0)]], columns=["year", "songs"])])

In [None]:
fig = px.bar(boney_songs, x="year", y="songs", title="Songs veröffentlich von Boney M. in den 80er Jahren")
fig.update_layout(
    height=500,
    template='plotly_dark',
    xaxis_title="Jahr",
    yaxis_title="Veröffentlichte Songs"
)
fig.show()

Einbauen des Genres in die query

In [None]:
def genre_query(artist: str, year: int, offset:int, genres):
  for genre in genres:
    call_api()
    res = sp.search(q=f"artist:{artist} year:{year} genre:{genre}", type="track", limit=50, offset=offset, market="DE")
    if (len(res["tracks"]["items"] ) > 1):
      print(year, genre)
      #print(res)

In [None]:
def test_genre_query(artist: str):
  genres = sp.recommendation_genre_seeds()['genres']
  for year in range(1980, 1990):
    print(year)
    genre_query(artist, year, 0, genres)

In [None]:
test_genre_query("Boney M.")

1980


KeyboardInterrupt: 

Es existieren keine Songs für Boney M. wenn ein Genre in der Abfrage spezifiziert wird. Eventuell sind die Genres das Problem für die fehlenden Daten.

In [None]:
by_isrc.sort_values("popularity", ascending=False)

Unnamed: 0,isrc,genres,name,artists,album,release_date,release_date_precision,uri,spotify_id,chart_power,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,country_code,country
132734,GBAAM0201110,['rock'],every breath you take,the police,Synchronicity (Remastered 2003),1983-06-17,day,spotify:track:1JSTJqkT5qHq8MDJnJbRE1,1JSTJqkT5qHq8MDJnJbRE1,1419.0,...,0.5430,0.002940,0.0714,0.7400,117.401,253920,4,1983,GB,United Kingdom
144156,GBALX8300190,"['piano', 'rock']",i'm still standing,elton john,Too Low For Zero,1983-05-30,day,spotify:track:1jDJFeK9x3OZboIAHsY9k2,1jDJFeK9x3OZboIAHsY9k2,1185.0,...,0.3560,0.121000,0.1400,0.7720,176.808,183440,4,1983,GB,United Kingdom
368082,USPR38619998,"['metal', 'rock']",livin' on a prayer,bon jovi,Slippery When Wet,1986-08-16,day,spotify:track:37ZJ0p5Jm13JPevGcx4SkF,37ZJ0p5Jm13JPevGcx4SkF,1112.0,...,0.0778,0.000214,0.2940,0.7950,122.511,249293,4,1986,US,United States
169272,GBCNR8500002,"['piano', 'pop', 'r-n-b', 'rock', 'singer-song...",running up that hill (a deal with god),kate bush,Hounds Of Love,1985-09-16,day,spotify:track:1PtQJZVZIdWIYdARpZRDFO,1PtQJZVZIdWIYdARpZRDFO,,...,0.7190,0.003080,0.0604,0.1940,108.376,298933,4,1985,GB,United Kingdom
6804,AUAP08000046,"['hard-rock', 'rock']",back in black,ac/dc,Back In Black,1980-07-25,day,spotify:track:08mG3Y1vljYA6bvDt4Wqkj,08mG3Y1vljYA6bvDt4Wqkj,,...,0.0110,0.009650,0.0828,0.7630,188.386,255493,4,1980,AU,Australia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174547,GBF078300092,['classical'],"serenade in g, k.525 ""eine kleine nachtmusik"" ...","wolfgang amadeus mozart,salomon quartet,barry guy",Mozart: Eine kleine Nachtmusik; Notturno; Sere...,1984-06-03,day,spotify:track:1Sbp0G9AUAQdhXinEiDWsl,1Sbp0G9AUAQdhXinEiDWsl,,...,0.5170,0.000000,0.2780,0.1510,124.043,241373,3,1984,GB,United Kingdom
174528,GBF078300017,['classical'],"alexander nevsky, op. 78: 7. alexander's entry...","sergei prokofiev,the cleveland orchestra choru...",Prokofiev: Alexander Nevsky,1984-01-01,day,spotify:track:3EZH0BH50us07EUPKzhw3u,3EZH0BH50us07EUPKzhw3u,,...,0.9160,0.003540,0.1860,0.1460,145.809,277053,4,1984,GB,United Kingdom
174527,GBF078300016,['classical'],"alexander nevsky, op. 78: 6. field of the dead","sergei prokofiev,irina arkhipova,cleveland orc...",Prokofiev: Alexander Nevsky,1984-01-01,day,spotify:track:4EXE7ZRcKVwD1xxNZLEOPj,4EXE7ZRcKVwD1xxNZLEOPj,,...,0.9160,0.004890,0.1180,0.0399,83.968,386533,4,1984,GB,United Kingdom
174526,GBF078300015,['classical'],"alexander nevsky, op. 78: 5. the battle on the...","sergei prokofiev,the cleveland orchestra choru...",Prokofiev: Alexander Nevsky,1984-01-01,day,spotify:track:3jXNm9r9WAwj8qJvBKoXg1,3jXNm9r9WAwj8qJvBKoXg1,,...,0.6560,0.787000,0.1470,0.0637,100.909,801840,4,1984,GB,United Kingdom


Abfrage nach dem Künstler Elton John.

In [None]:
test_genre_query("Elton John")

1980 piano
1980 rock
1981 piano
1981 rock
1982 piano
1982 rock
1983 piano
1983 rock


KeyboardInterrupt: ignored

Für Elton John existieren Genres. Im nächsten Schritt soll überprüft werden, ob die Genres auch in dem Response der Spotify API wiedergefunden wird.

In [None]:
res = sp.search(q="artist:Elton John", type="track", market="DE", limit=50, offset=0)
res = res["tracks"]
if "items" in res:
  for item in res["items"]:
    if "artists" in item:
      #print(item["artists"]["genres"] if "genres" in item["artists"] else print("No genres available!"))
      #print(item["artists"])
      for artist in item["artists"]:
        print(artist["genres"] if "genres" in artist else artist.keys())
    else:
      print("No artists in item")
else:
  print("No items in res")
  print(res.keys())


dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(['external_urls', 'href', 'id', 'name', 'type', 'uri'])
dict_keys(

Die Spotify gibt auch für Künstler die ein Genre zugewiesen bekommen haben kein Genre zurück. Laut API sollte es ein weiteres Attribut `genres` geben.

Abfrage aller Lieder von Künstlern, welche in den 80ern in den Charts waren

In [23]:
chart_power_df["Artist"].unique()

array(['paul hardcastle', 'c. c. catch', 'deborah sasson & mcl',
       'coast to coast', 'fiction factory', 'o.m.d.',
       'the rock steady crew', 'cutting crew', 'sandra',
       'bill medley & jennifer warnes', 'john lennon', 'beastie boys',
       'klaus lage band', 'schweizer', 'sigue sigue sputnik',
       'jennifer rush', 'the catch', 'dolly parton', 'nena',
       'george michael', 'feargal sharkey', 'phil collins', 'queen',
       'godley & creme', 'erasure', 'sheila e', 'will downing',
       "shakin' stevens", 'depeche mode',
       "shakin' stevens & bonnie tyler", 'duran duran',
       'nick straker band', 'bad boys blue', 'genesis',
       'steve miller band', 'katja ebstein', 'roger whittaker',
       'david bowie', 'andy borg', 'toto', 'rose laurens', 'princess',
       'saragossa band', 'john farnham', 'ricky king', 'quincy jones',
       'bill withers', 'rufus & chaka khan', 'inner city', 'radiorama',
       'simple minds', 'herbert grã¶nemeyer', 'sister sledge',
  

Wenn mehrere Künstler zusammen einen Song veröffentlich haben, sind diese hier mit einem `&` getrennt. Dies muss zunächst aufgelöst werden.

In [15]:
chart_power_df["Artist"] = chart_power_df["Artist"].str.split(" & ")
chart_power_df_explode = chart_power_df.explode("Artist", ignore_index=True)

In [16]:
chart_power_df_explode

Unnamed: 0,Song,Artist,Points
0,19,paul hardcastle,1600
1,'cause you're young,c. c. catch,998
2,(carmen) danger in her eyes,deborah sasson,531
3,(carmen) danger in her eyes,mcl,531
4,(do) the hucklebuck,coast to coast,697
...,...,...,...
2022,zu spät,die ã„rzte,605
2023,zuppa romana,schrott nach 8,601
2024,â€¦ und ganz doll mich (ich mag),rolf und seine freunde,1314
2025,über sieben brücken mußt du geh'n,karat,826


Einige Künstler und Lieder enthalten zudem Umlaute, welche zunächst entsprechend formatiert werden müssen (z.B. `Die Ärzte` sind als `Die ã„rzte` gespeichert).

In [17]:
replace = {
    "ã¶": "ö",
    "ã„": "ä",
    "ã–": "ö",
    "ã©": "e"
}

chart_power_df_explode["Song"] = chart_power_df_explode["Song"].replace(replace, regex=True)
chart_power_df_explode["Artist"] = chart_power_df_explode["Artist"].replace(replace, regex=True)

In [18]:
chart_power_df_explode[chart_power_df_explode["Song"] == "zu spät"]

Unnamed: 0,Song,Artist,Points
2022,zu spät,die ärzte,605


In [19]:
chart_power_df_explode[chart_power_df_explode["Artist"] == "die ã„rzte"]


Unnamed: 0,Song,Artist,Points


In [20]:
chart_power_df_explode[chart_power_df_explode["Artist"].str.contains("\[")]

Unnamed: 0,Song,Artist,Points
64,all of me (boy oh boy),sabrina [it],841
85,amoureux solitaires,lio [be],1555
100,another life,kano [it],1460
187,boys (summertime love),sabrina [it],1444
198,bridge to your heart,wax [uk],614
256,change your mind,raff [it],736
314,cry softly,secret service [se],451
351,delirio mind,scotch [it],1120
390,disco band,scotch [it],1806
480,ein weißes blatt'l papier,relax [de],535


Weiterhin ist hinter einigen Künstlern das Herukunftsland in eckigen Klammern gekennzeichnet. Dies muss ebenfalls entfernet werden, da diese andernfalls nicht von der Spotify API gefunden werden.

In [21]:
pattern = r' \[.*?\]'
chart_power_df_explode["Artist"] = chart_power_df_explode["Artist"].str.replace(pattern, '',regex=True)

In [22]:
chart_power_df_explode[chart_power_df_explode["Artist"].str.contains("\[")]

Unnamed: 0,Song,Artist,Points


In [23]:
len(chart_power_df_explode["Artist"].unique())

944

In [24]:
import time

def sp_request(req, retries, max_retries, *args):
  try:
    call_api()
    return req(*args)
  except Exception as e:
    if isinstance(e, spotipy.exceptions.SpotifyException) and e.http_status == 429:
      # Handle rate limiting (HTTP 429)
      if retries < max_retries:
          sleep_time = exponential_backoff(retries)
          print(f"Rate limited, waiting for {sleep_time} seconds before retrying...")
          time.sleep(sleep_time)
          retries += 1
          return sp_request(req, retries, max_retries, *args)
      else:
          print("Max retries reached, exiting...")
    else:
      print(f"An error occurred: {e}")
      retries += 1
      return sp_request(req, retries, max_retries, *args)

# Define a function for exponential backoff
def exponential_backoff(retries):
    return 2 ** retries

In [25]:
def filter_track_features(track, features, genre):
    '''
    Filters the relevant features of a track in returns them in JSON object.

    Parameter
    ---------
    track: Object
        Track returend by the spotify API

    genre: string
        Genre that should be used

    Return
    ------
    relevant_features: Object
        JSON Object that contains the relevant featues
    '''

    external_ids = track['external_ids'] if 'external_ids' in track else {}
    isrc = external_ids['isrc'] if 'isrc' in external_ids else np.nan
    artist_names = []

    if 'artists' in track and isinstance(track['artists'],list):
        for artist in track['artists']:
            if 'name' in artist:
                artist_names.append(artist['name'])
        artist_names = ','.join(artist_names)
    else:
        artist_names = np.nan



    if 'album' in track:
        album = track['album']['name'] if 'name' in track['album'] else np.nan
        release_date = track['album']['release_date'] if 'release_date' in track['album'] else np.nan
        release_date_precision = track['album']['release_date_precision'] if 'release_date_precision' in track['album'] else np.nan
    else:
         album = np.nan
         release_date = np.nan
         release_date_precision = np.nan


    track_name = track['name'] if 'name' in track else np.nan
    if track_name != np.nan and artist_names != np.nan:
        points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
        if points.empty:
             points = np.nan
        else:
            try:
                points = int(points)
            except Exception as e:
                print(e)
                print(track_name)
                print(artist_names)
                print(points)
                print("-------")
    else:
         points = np.nan

    return {
        'name': track_name,
        'artists': artist_names,
        'album': album,
        'release_date': release_date,
        'release_date_precision': release_date_precision,
        'spotify_id': track['id'] if 'id' in track else np.nan,
        'chart_power': points,
        'uri': track['uri'] if 'uri' in track else np.nan,
        'popularity': track['popularity'] if 'popularity' in track else np.nan,
        'genres': np.nan if genre == False else genre,
        'danceability': features['danceability'] if 'danceability' in features else np.nan,
        'energy': features['energy'] if 'energy' in features else np.nan,
        'key': features['key'] if 'key' in features else np.nan,
        'loudness': features['loudness'] if 'loudness' in features else np.nan,
        'mode': features['mode'] if 'mode' in features else np.nan,
        'speechiness': features['speechiness'] if 'speechiness' in features else np.nan,
        'acousticness': features['acousticness'] if 'acousticness' in features else np.nan,
        'instrumentalness': features['instrumentalness'] if 'instrumentalness' in features else np.nan,
        'liveness': features['liveness'] if 'liveness' in features else np.nan,
        'valence': features['valence'] if 'valence' in features else np.nan,
        'tempo': features['tempo'] if 'tempo' in features else np.nan,
        'duration_ms': features['duration_ms'] if 'duration_ms' in features else np.nan,
        'time_signature': features['time_signature'] if 'time_signature' in features else np.nan,
        'isrc': isrc,
    }

In [26]:
def get_number_of_tracks(release_year, artist, genre):
    '''
    Retrieves the number of tracks the spotfiy API returns for a specific query.
    '''
    if genre == False:
      result = sp_request(lambda release_year, artist: sp.search(q=f'year:{release_year} artist:{artist}', type='track', limit=1, offset=0, market='DE'), 0, 100, release_year, artist)
    else:
      result = sp_request(lambda release_year, artist, genre: sp.search(q=f'year:{release_year} artist:{artist} genre:{genre}', type='track', limit=1, offset=0, market='DE'), 0, 100, release_year, artist, genre)

    tracks = result['tracks'] if 'tracks' in result else ''
    return tracks['total'] if 'total' in tracks else 0

In [27]:
genres = ['chicago-house', 'hard-rock', 'rock', 'power-pop', 'guitar', 'hardcore',
          'groove', 'songwriter', 'garage', 'disco', 'grunge', 'ambient',
          'minimal-techno', 'brazil', 'indie', 'detroit-techno', 'electronic',
          'edm', 'dubstep', 'children', 'drum-and-bass', 'indian', 'death-metal',
          'club', 'salsa', 'bluegrass', 'metal', 'reggae', 'tango', 'black-metal',
          'british', 'german', 'punk', 'reggaeton', 'alternative', 'soul', 'r-n-b',
          'goth', 'pop-film', 'breakbeat', 'indie-pop', 'heavy-metal', 'dance',
          'honky-tonk', 'dancehall', 'dub', 'singer-songwriter', 'spanish',
          'deep-house', 'rock-n-roll', 'techno', 'hip-hop', 'punk-rock', 'industrial',
          'afrobeat', 'trip-hop', 'funk', 'blues', 'swedish', 'latino', 'alt-rock',
          'country', 'acoustic', 'trance', 'grindcore', 'ska', 'house', 'progressive-house',
          'new-age', 'electro', 'rockabilly', 'party', 'pop', 'synth-pop', 'latin',
          'psych-rock']

In [173]:
'''
artist_over_1000 = []
df = pd.DataFrame()
with tqdm(total=len(genres)*len(chart_power_df_explode["Artist"].unique())) as pbar:
  for artist in chart_power_df_explode["Artist"].unique():
    for genre in genres:
      total_results = get_number_of_tracks("1980-1989", artist, genre)
      if total_results < 1000:
        offset = 0
        while offset < total_results:
          try:
            track_features = []
            call_api()
            result = sp.search(q=f"year:1980-1989 artist:{artist} genre:{genre}", type="track", limit=50, offset=offset, market="DE")
            tracks = result["tracks"] if "tracks" in result else ""
            if "items" in tracks:
              for track in tracks["items"]:
                features = filter_track_features(track, genre)
                track_features.append(features)
              offset += 50
              df = pd.concat([df, pd.DataFrame(track_features)], ignore_index=True)
              df.to_scv
            else:
              continue
          except Exception as e:
            print(e)
      else:
        print(f"Artist {artist} has published over 1000 songs in the 80s")
        artist_over_1000.append(artist)

      pbar.update(1)
      df.to_csv(f"{PATH}/data-new.csv")
  '''

'\nartist_over_1000 = []\ndf = pd.DataFrame()\nwith tqdm(total=len(genres)*len(chart_power_df_explode["Artist"].unique())) as pbar:\n  for artist in chart_power_df_explode["Artist"].unique():\n    for genre in genres:\n      total_results = get_number_of_tracks("1980-1989", artist, genre)\n      if total_results < 1000:\n        offset = 0\n        while offset < total_results:\n          try:\n            track_features = []\n            call_api()\n            result = sp.search(q=f"year:1980-1989 artist:{artist} genre:{genre}", type="track", limit=50, offset=offset, market="DE")\n            tracks = result["tracks"] if "tracks" in result else ""\n            if "items" in tracks:\n              for track in tracks["items"]:\n                features = filter_track_features(track, genre)\n                track_features.append(features)\n              offset += 50\n              df = pd.concat([df, pd.DataFrame(track_features)], ignore_index=True)\n              df.to_scv\n          

In [None]:
# Initialize offset and retries
offset = 0

artist_over_1000 = []
no_tracks = []
no_features = []
artists_not_found = []


df = pd.DataFrame()
with tqdm(total=len(chart_power_df_explode["Artist"].unique())) as pbar:
  tracks_ = []
  audio_features_ = []

  for artist in chart_power_df_explode["Artist"].unique():
    total_results = get_number_of_tracks("1980-1989", artist, False)
    if total_results == 0:
      print(f"{artist} not found!")
      artists_not_found.append(artist)
    elif total_results < 1000:
      offset = 0
      while offset < total_results:
        result = sp_request(lambda artist, offset: sp.search(q=f"year:1980-1989 artist:{artist}", type="track", limit=50, offset=offset, market="DE"), 0, 100, artist, offset)
        offset += 50
        if "tracks" not in result:
          print("No tracks in result")
          continue

        tracks = result["tracks"]

        if "items" in tracks and isinstance(tracks["items"], list):
          for track in tracks["items"]:
            tracks_.append(track)
            if len(tracks_) == 100:
              audio_features_ = sp_request(lambda x: sp.audio_features([track["id"] for track in x]), 0, 100, tracks_)
              track_features = []
              if len(tracks_) != len(audio_features_):
                print("Tracks and features have not the same lengh!")
                print(len(tracks_), len(audio_features_))
              for t, af in list(zip(tracks_, audio_features_)):
                if not t:
                  print("No track!")
                  print(t)
                  print(af)
                  no_tracks.append(af)
                  continue
                if not af:
                  print("No features")
                  print(af)
                  print(t)
                  no_features.append(t)
                  continue
                features = filter_track_features(t, af, False)
                track_features.append(features)
              df = pd.concat([df, pd.DataFrame(track_features)], ignore_index=True)
              #df.to_csv(f"{PATH}/data-new-without-genres-final.csv")
              tracks_ = []
              audio_features_ = []
        else:
          continue
    else:
      print(f"Artist {artist} has published over 1000 songs in the 80s")
      artist_over_1000.append(artist)

    pbar.update(1)

problems = {
    "artist_over_1000": artist_over_1000,
    "artist_not_found": artists_not_found,
    "no_features": [track["id"] for track in no_features]
}
#with open(f"{PATH}/problems.json", "w") as file:
    #json.dump(problems, file)

In [31]:
df = pd.read_csv(f"{PATH}/data-new-without-genres-final.csv")

In [33]:
with open(f"{PATH}/problems.json", "r") as file:
    loaded_problems = json.load(file)
    artist_over_1000 = loaded_problems["artist_over_1000"]
    artists_not_found = loaded_problems["artist_not_found"]
    no_features = loaded_problems["no_features"]

Untersuchen der aufgetretenen Probleme.

In [34]:
print(f"Insgesamt wurden für {len(no_features)} Songs keine Features gefunden.")

# Überprüfen, ob die Songs wirklich über keine Features verfügen
test_features = sp.audio_features(no_features)
songs_with_features = [item for item in test_features if item != None]
if len(songs_with_features):
  print(songs_with_features)

Insgesamt wurden für 29 Songs keine Features gefunden.


In [35]:
artists = chart_power_df_explode["Artist"].unique()
print(f"{len(artists_not_found)} von {len(artists)} Künstlern wurden von der Spotify API nicht gefunden.")

156 von 944 Künstlern wurden von der Spotify API nicht gefunden.


Eventuell sind die Lieder der Künstler in anderen Jahren erschienen.

In [211]:
artist_nfo = []
for artist in artists_not_found:
  res = sp.search(q=f"artist:{artist}", type="track", market="DE")
  if len(res["tracks"]["items"]) == 0:
    print(f"Artist {artist} not found overall.")
    artist_nfo.append(artist)

Artist b.a. robertson not found overall.
Artist the world's famous supreme team not found overall.
Artist anete humpe not found overall.
Artist t'pau not found overall.
Artist dexys midnight runners with the emerald express not found overall.
Artist terence trent d'arby not found overall.
Artist patrick cowley feat. sylvester not found overall.
Artist the christians, holly johnson, paul mccartney, gerry marsden not found overall.
Artist tommy piper singt alf not found overall.
Artist koreana not found overall.
Artist michael jackson with siedah garrett not found overall.
Artist m.a.r.r.s. not found overall.
Artist barbara gaskin not found overall.
Artist mary roos und david hanselmann not found overall.
Artist g'race not found overall.
Artist band für afrika not found overall.
Artist mike oldfield feat. aled jones, anita hegerland not found overall.
Artist technotronic feat. felly not found overall.
Artist tommi ohrner not found overall.
Artist mike oldfield and roger chapman not found

In [215]:
print(f"Insgesamt {len(artist_nfo)} Künstler wurden auch für andere Jahre nicht von der Spotify API gefunden.")
print(f"{len(artists_not_found) - len(artist_nfo)} Künstler haben Lieder in anderen Jahren veröffentlicht. Es ist jedoch unklar, ob es sich hierbei um besagte Chartsongs handelt.")

Insgesamt 31 Künstler wurden auch für andere Jahre nicht von der Spotify API gefunden.
125 Künstler haben Lieder in anderen Jahren veröffentlicht. Es ist jedoch unklar, ob es sich hierbei um besagte Chartsongs handelt.


In [36]:
df = df.applymap(lambda s: s.lower() if type(s) == str else s)

In [44]:
def convert_artist_names(df, logs = False):
  '''

  '''
  df_artists = df.copy()
  if logs:
    print('Occurences of substring ", ": ' + str(len(df_artists.loc[df_artists["artists"].str.contains(', ')])))
    print('Occurences of substring "_":' + str(len(df_artists.loc[df_artists["artists"].str.contains('_')])))

  df_artists.loc[df_artists["artists"].str.contains(', '), 'artists'] = df_artists[df_artists["artists"].str.contains(', ')]['artists'].str.replace(', ', '_')

  if logs:
    print('Occurences of substring ", ": ' + str(len(df_artists.loc[df_artists["artists"].str.contains(', ')])))
    print('Occurences of substring "_":' + str(len(df_artists.loc[df_artists["artists"].str.contains('_')])))

  df_artists['artists'] = df_artists['artists'].str.split(',')
  df_split_artists = df_artists.explode('artists')

  if logs:
    print('Occurences of substring ", ": ' + str(len(df_split_artists.loc[df_split_artists["artists"].str.contains(', ')])))
    print('Occurences of substring "_":' + str(len(df_split_artists.loc[df_split_artists["artists"].str.contains('_')])))

  df_split_artists.loc[df_split_artists['artists'].str.contains('_'), 'artists'] = df_split_artists[df_split_artists["artists"].str.contains('_')]['artists'].str.replace('_', ', ')

  if logs:
    print('Occurences of substring ", ": ' + str(len(df_split_artists.loc[df_split_artists["artists"].str.contains(', ')])))
    print('Occurences of substring "_":' + str(len(df_split_artists.loc[df_split_artists["artists"].str.contains('_')])))
  return df_split_artists

In [45]:
df_split_artists = convert_artist_names(df)
chart_artists = list(chart_power_df_explode["Artist"].unique())
unique_artists_in_dataframe = set(df_split_artists["artists"])
artists_not_in_dataframe = [artist for artist in chart_artists if artist not in unique_artists_in_dataframe]

In [46]:
len(artists_not_in_dataframe),len(chart_artists)

(288, 944)

In [49]:
dfa = df.copy()
dfa["artists"] = dfa["artists"].str.split(",")
dfa = dfa.explode("artists")

In [50]:
dfa.shape, df_split_artists.shape, df.shape

((81605, 25), (81176, 25), (68371, 25))

In [52]:
uaid = set(dfa["artists"])
anid = [artist for artist in chart_artists if artist not in uaid]
len(anid),len(chart_artists)

(287, 944)

In [60]:
chart_songs = list(chart_power_df_explode["Song"])
chart_artists = list(chart_power_df_explode["Artist"])
#zip(chart_songs, )
print(len(chart_songs), len(chart_artists))
chart_songs_artists = list(zip(chart_songs, chart_artists))

df_songs = list(df["name"])
df_artists = list(df["artists"])
df_songs_artists = list(zip(df_songs, df_artists))

2027 2027


Suchen nach Songs aus den Charts, welche nicht im DataFrame sind.

In [70]:
df_songs_artists = list(zip(df_songs, df_artists))
not_in_df = [[song, artist] for song, artist in chart_songs_artists if (song, artist) not in df_songs_artists]
print(f"Insgesamt sind {len(not_in_df)} von {len(chart_songs_artists)} Songs nicht im DataFrame enthalten")

Insgesamt sind 1153 von 2027 Songs nicht im DataFrame enthalten


In [87]:
def in_df(s, a):
  exist = False
  for song, artist in df_songs_artists:
    if (str(artist) == str(a)) and (str(s) in str(song)):
        exist = True

  return exist

not_in_df2 = [[song, artist] for song, artist in chart_songs_artists if not in_df(song, artist)]
print(f"Insgesamt sind {len(not_in_df2)} von {len(chart_songs_artists)} Songs nicht im DataFrame enthalten")

Insgesamt sind 917 von 2027 Songs nicht im DataFrame enthalten


In [None]:
!pip install levenshtein

In [110]:
import Levenshtein

def find_closest_string(target, string_list):
    closest_string = None
    min_distance = float('inf')
    closest_index = -1

    for index, string in enumerate(string_list):
        distance = Levenshtein.distance(target, string)
        if distance < min_distance:
            min_distance = distance
            closest_string = string
            closest_index = index

    return closest_string, min_distance, closest_index

# Example usage
target_string = "absolute beginners"
string_list = [
    "beginners",
    "absolute wodka",
    "absolute beginners -"
    ]
closest, distance, index = find_closest_string(target_string, string_list)

print(f"The closest string to '{target_string}' is '{closest}' with a Levenshtein distance of {distance} and index of {index}")

The closest string to 'absolute beginners' is 'absolute beginners -' with a Levenshtein distance of 2 and index of 2


In [112]:
df.head(1)

Unnamed: 0,name,artists,album,release_date,release_date_precision,spotify_id,chart_power,uri,popularity,genres,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,isrc
0,19 - destruction mix,paul hardcastle,paul hardcastle,1985-01-01,day,1azifmcvw3orhfmdxmcro1,,spotify:track:1azifmcvw3orhfmdxmcro1,35,,...,1,0.0816,0.00296,0.62,0.0501,0.729,117.859,428667,4,gbayk8500158


In [96]:
df_copy = df.copy()

def in_df(s, a):
  similar_songs = []
  for song, artist in df_songs_artists:
    if ((str(artist) in str(a)) or str(a) in str(artist)) and ((str(s) in str(song)) or (str(song) in str(s))):
      #if (artist != a) or (s != song):
      #  print(artist, a)
      #  print(song, s)
      #  print("##########")
      #return True
      if (s != song):
        similar_songs.append([song, artist])
  closest, distance, index = find_closest_string(s, [item[0] for item in similar_songs])
  best_song = similar_songs[index]
  df_copy.loc[(df["name"] == best_song[0]) and (df["artists"] == best_song[1])]
  return True if closest else False

not_in_df2 = [[song, artist] for song, artist in chart_songs_artists if not in_df(song, artist)]
print(f"Insgesamt sind {len(not_in_df2)} von {len(chart_songs_artists)} Songs nicht im DataFrame enthalten")

erich kunzel,cincinnati pops orchestra,janet stubbs,sandra graham,kimberly barber,eileen farrell sandra
maria (i'll never be) maria magdalena
##########
bill medley,jennifer warnes bill medley
(i've had) the time of my life - from "dirty dancing" soundtrack (i've had) the time of my life
##########
bill medley,jennifer warnes jennifer warnes
(i've had) the time of my life - from "dirty dancing" soundtrack (i've had) the time of my life
##########
sheila e. sheila e
a love bizarre a love bizarre
##########
quincy jones,charles may quincy jones
ai no corrida ai no corrida (i-no-ko-ree-da)
##########
quincy jones,charles may quincy jones
ai no corrida ai no corrida (i-no-ko-ree-da)
##########
rufus,chaka khan rufus
ain't nobody ain't nobody
##########
rufus,chaka khan rufus
ain't nobody (live) ain't nobody
##########
rufus,chaka khan chaka khan
ain't nobody ain't nobody
##########
rufus,chaka khan chaka khan
ain't nobody (live) ain't nobody
##########
julio iglesias,diana ross diana ross


In [226]:
res = sp.search(q="artist:van halen year:1980-1989", type="track", market="DE")

In [77]:
[item["name"] for item in res["tracks"]["items"]]

NameError: ignored

In [229]:
not_in_df[:10]

[[19, 'paul hardcastle'],
 ["'cause you're young", 'c. c. catch'],
 ['(carmen) danger in her eyes', 'deborah sasson'],
 ['(carmen) danger in her eyes', 'mcl'],
 ['(do) the hucklebuck', 'coast to coast'],
 ['(feels like) heaven', 'fiction factory'],
 ['(forever) live and die', 'o.m.d.'],
 ['(hey you) the rock steady crew', 'the rock steady crew'],
 ['(i just) died in your arms', 'cutting crew'],
 ["(i'll never be) maria magdalena", 'sandra']]

Anpassen der Query. Es wird nur nach Songnamen + Künstlernamen gefiltert. Das Veröffentlichungsjahr wird nicht beachtet.

In [115]:
# Initialize offset and retries
offset = 0

artist_over_1000 = []
no_tracks = []
no_features = []
artists_not_found = []

def req_tot_results(song, artist):
  result = sp_request(lambda song, artist: sp.search(q=f"artist:{artist} track:{song}", type="track", market="DE"), 0, 100, song, artist)
  tracks = result['tracks'] if 'tracks' in result else ''
  return tracks['total'] if 'total' in tracks else 0


df = pd.DataFrame()
with tqdm(total=len(not_in_df)) as pbar:
  tracks_ = []
  audio_features_ = []

  for song, artist in not_in_df:
    total_results = req_tot_results(song, artist)
    if total_results == 0:
      print(f"{artist}, {song} not found!")
      artists_not_found.append([artist, song])
    elif total_results < 1000:
      offset = 0
      while offset < total_results:
        result = sp_request(lambda artist, offset, song: sp.search(q=f"artist:{artist} track:{song}", type="track", limit=50, offset=offset, market="DE"), 0, 100, artist, offset, song)
        offset += 50
        if "tracks" not in result:
          print("No tracks in result")
          continue

        tracks = result["tracks"]

        if "items" in tracks and isinstance(tracks["items"], list):
          for track in tracks["items"]:
            tracks_.append(track)
            if len(tracks_) == 100:
              audio_features_ = sp_request(lambda x: sp.audio_features([track["id"] for track in x]), 0, 100, tracks_)
              track_features = []
              if len(tracks_) != len(audio_features_):
                print("Tracks and features have not the same lengh!")
                print(len(tracks_), len(audio_features_))
              for t, af in list(zip(tracks_, audio_features_)):
                if not t:
                  print("No track!")
                  print(t)
                  print(af)
                  no_tracks.append(af)
                  continue
                if not af:
                  print("No features")
                  print(af)
                  print(t)
                  no_features.append(t)
                  continue
                features = filter_track_features(t, af, False)
                track_features.append(features)
              df = pd.concat([df, pd.DataFrame(track_features)], ignore_index=True)
              #df.to_csv(f"{PATH}/data-missing.csv")
              tracks_ = []
              audio_features_ = []
        else:
          continue
    else:
      print(f"Artist {artist} has published over 1000 songs in the 80s")
      artist_over_1000.append(artist)

    pbar.update(1)

problems = {
    "artist_over_1000": artist_over_1000,
    "artist_not_found": artists_not_found,
    "no_features": [track["id"] for track in no_features]
}
#with open(f"{PATH}/data-missing-problems.json", "w") as file:
#    json.dump(problems, file)

  0%|          | 3/1153 [00:01<06:54,  2.78it/s]

mcl, (carmen) danger in her eyes not found!


  0%|          | 5/1153 [00:01<04:53,  3.91it/s]

o.m.d., (forever) live and die not found!


  1%|          | 9/1153 [00:02<03:13,  5.92it/s]

bill medley, (i've had) the time of my life not found!
jennifer warnes, (i've had) the time of my life not found!


  1%|          | 12/1153 [00:03<05:13,  3.64it/s]

sigue sigue sputnik, 21st century boy not found!


  2%|▏         | 22/1153 [00:06<03:52,  4.87it/s]

bonnie tyler, a rockin' good way not found!
nick straker band, a walk in the park not found!


  3%|▎         | 31/1153 [00:09<05:09,  3.62it/s]

ricky king, ahoi, ay ay capt'n not found!
quincy jones, ai no corrida (i-no-ko-ree-da) not found!
bill withers, ain't no sunshine (remix '88) not found!


  3%|▎         | 33/1153 [00:09<03:41,  5.07it/s]

rufus, ain't nobody not found!


  3%|▎         | 39/1153 [00:32<24:36,  1.33s/it]

caroline loeb, alles hat ein ende, nur die wurst hat zwei not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
  5%|▍         | 55/1153 [00:38<05:47,  3.16it/s]

pink floyd, another brick in the wall (part ii) not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
  7%|▋         | 75/1153 [01:05<08:46,  2.05it/s]

c. c. catch, back seat of your cadillac not found!


  7%|▋         | 81/1153 [01:07<09:09,  1.95it/s]

simple minds, ballad of the streets (belfast child) not found!


  7%|▋         | 85/1153 [01:08<05:46,  3.08it/s]

b.a. robertson, bang bang not found!


  8%|▊         | 94/1153 [01:30<57:52,  3.28s/it]  

boy george, big fun not found!
robbie nevil, big love not found!


  8%|▊         | 95/1153 [01:30<45:08,  2.56s/it]

ted herold, bill haley not found!


  9%|▉         | 104/1153 [01:33<07:05,  2.46it/s]

new order, blue monday 1988 not found!


  9%|▉         | 108/1153 [01:34<05:21,  3.25it/s]

joboxers, boxerbeat not found!


 10%|█         | 117/1153 [01:37<06:44,  2.56it/s]

the world's famous supreme team, buffalo gals not found!


 11%|█         | 123/1153 [02:00<53:28,  3.12s/it]  

caroline loeb, c'est la ouate not found!


 11%|█▏        | 131/1153 [02:02<07:59,  2.13it/s]

eddy grant, can't get enough of you not found!
boys town gang, can't take my eyes off you not found!
inga, careless love not found!


 12%|█▏        | 135/1153 [02:03<03:49,  4.44it/s]

anete humpe, careless love not found!
mysterious art, carma - omen 2 not found!
aretha franklin, casanova not found!


 12%|█▏        | 137/1153 [02:03<03:19,  5.10it/s]

t'pau, causing a commotion not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
 12%|█▏        | 143/1153 [02:10<15:00,  1.12it/s]

kool, celebration (s.a.w. remix) not found!
the gang, celebration (s.a.w. remix) not found!


 13%|█▎        | 145/1153 [02:10<10:27,  1.61it/s]

raff, change your mind not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
 13%|█▎        | 150/1153 [02:31<35:20,  2.11s/it]

t'pau, china in your hand not found!


 13%|█▎        | 154/1153 [02:32<13:41,  1.22it/s]

fun fun, colour my love not found!
mental as anything, come back and stay not found!


 13%|█▎        | 155/1153 [02:32<13:43,  1.21it/s]

dexys midnight runners with the emerald express, come on eileen not found!


 14%|█▍        | 159/1153 [02:33<06:37,  2.50it/s]

erasure, coming around again not found!
falco, coming home (jeanny part ii) not found!


 14%|█▍        | 161/1153 [02:33<05:45,  2.87it/s]

hongkong syndikat, concrete & clay not found!


 14%|█▍        | 166/1153 [02:35<08:01,  2.05it/s]

erasure, crackers international (ep) not found!


 15%|█▌        | 178/1153 [02:38<03:56,  4.13it/s]

glenn medeiros, cry wolf not found!


 16%|█▌        | 184/1153 [03:01<28:31,  1.77s/it]

kool, dance little lady (1987) not found!
the gang, dance little lady (1987) not found!
terence trent d'arby, dance little sister not found!


 16%|█▌        | 186/1153 [03:02<18:37,  1.16s/it]

the pins, dance on not found!


 16%|█▋        | 188/1153 [03:02<13:02,  1.23it/s]

mick jagger, dancing in the street not found!


 17%|█▋        | 197/1153 [03:04<04:55,  3.23it/s]

mysterious art, das omen (teil 1) not found!


 18%|█▊        | 212/1153 [03:30<1:38:02,  6.25s/it]

andy gibb, desire not found!


 19%|█▊        | 216/1153 [03:31<32:38,  2.09s/it]

ted herold, die besten sterben jung not found!


 19%|█▉        | 222/1153 [03:32<08:08,  1.91it/s]

kiz, die sennerin vom königssee not found!
peter schilling, die wüste lebt (alarmsignal ...) not found!
george kranz, din daa daa (trommeltanz) not found!


 20%|██        | 234/1153 [03:41<12:04,  1.27it/s]

patrick cowley feat. sylvester, do ya wanna funk not found!
eddy grant, do you feel my love not found!


 21%|██        | 242/1153 [04:01<25:28,  1.68s/it]

electric light orchestra, don't bring me down not found!
the communards, don't leave me this way not found!
bomb the bass, don't make me wait not found!


 21%|██▏       | 246/1153 [04:01<10:37,  1.42it/s]

lorraine, don't make me wait not found!
leon haywood, don't push it, don't force it not found!
michael jackson, don't stop 'til you get enough not found!


 22%|██▏       | 249/1153 [04:02<06:17,  2.39it/s]

simple minds, don't you (forget about me) not found!
nicole j. mccloud, don't you want my love not found!
level 42, down to earth not found!


 22%|██▏       | 251/1153 [04:02<04:57,  3.03it/s]

rififi, dr. acid and mr. house not found!


 22%|██▏       | 254/1153 [04:03<04:01,  3.72it/s]

o.m.d., dreaming not found!


 23%|██▎       | 261/1153 [04:04<04:05,  3.63it/s]

the bee gees, e.s.p. not found!


 23%|██▎       | 267/1153 [04:07<05:40,  2.61it/s]

stevie wonder, ebony and ivory not found!


 24%|██▎       | 273/1153 [04:30<1:29:53,  6.13s/it]

eddy grant, electric avenue not found!


 24%|██▍       | 282/1153 [04:33<09:58,  1.46it/s]

guesch patti, etienne not found!


 25%|██▍       | 285/1153 [04:33<05:54,  2.45it/s]

johnny hates jazz, ever fallen in love not found!


 25%|██▌       | 290/1153 [04:35<05:37,  2.56it/s]

gregory abbott, f.l.m. not found!


 25%|██▌       | 294/1153 [04:36<03:04,  4.66it/s]

pierre cosso, face your life not found!
the communards, faith not found!
16 bit, fake not found!


 26%|██▌       | 297/1153 [04:37<03:13,  4.42it/s]

darinka, fang das licht not found!


 26%|██▌       | 301/1153 [04:37<03:16,  4.33it/s]

klaus lage band, faust auf faust (schimanski) not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
 26%|██▋       | 304/1153 [04:39<04:29,  3.15it/s]

conny, felicita not found!


 27%|██▋       | 307/1153 [05:00<47:49,  3.39s/it]  

the christians, holly johnson, paul mccartney, gerry marsden, ferry 'cross the mersey not found!
stock aitken waterman, ferry 'cross the mersey not found!


 27%|██▋       | 309/1153 [05:00<30:30,  2.17s/it]

nina, feuerwerk not found!


 27%|██▋       | 312/1153 [05:01<17:10,  1.23s/it]

hot shot, fire in the night not found!


 27%|██▋       | 315/1153 [05:04<16:47,  1.20s/it]

orlando riva sound, fire on the water not found!


 28%|██▊       | 319/1153 [05:06<09:51,  1.41it/s]

elton john, flames of paradise not found!


 28%|██▊       | 324/1153 [05:09<08:31,  1.62it/s]

ricky king, fly with me to malibu not found!
opus, flyin' high (live version) not found!


 28%|██▊       | 328/1153 [05:10<05:22,  2.56it/s]

bap, fortsetzung folgt ... not found!


 29%|██▉       | 333/1153 [05:11<03:43,  3.66it/s]

will to power, freebaby (medley: baby i love your way / freebird) not found!


 30%|██▉       | 342/1153 [05:33<10:58,  1.23it/s]

the bangles, full metal jacket (i wanna be your drill instructor) not found!


 30%|██▉       | 343/1153 [05:34<11:54,  1.13it/s]

fleetwood mac, funky town not found!


 30%|███       | 351/1153 [05:39<05:25,  2.46it/s]

o.m.d., genetic engineering not found!


 31%|███       | 358/1153 [06:00<1:11:57,  5.43s/it]

saragossa band, ginger red not found!


 32%|███▏      | 370/1153 [06:05<06:49,  1.91it/s]

eruption, go johnnie go not found!


 33%|███▎      | 381/1153 [06:12<03:53,  3.31it/s]

tommy piper singt alf, hallo alf, hier ist rhonda not found!


 33%|███▎      | 385/1153 [06:30<38:42,  3.02s/it]  

biene, hallo klaus (i wü nur zruck) not found!
koreana, hand in hand not found!
ottawan, hands up (give me your heart) not found!


 34%|███▎      | 389/1153 [06:32<15:54,  1.25s/it]

t'pau, heart and soul not found!


 34%|███▍      | 395/1153 [06:33<05:17,  2.38it/s]

la na nee nee noo noo, help! not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
 35%|███▍      | 402/1153 [06:38<06:15,  2.00it/s]

dusty springfield, herz aus glas not found!
pet shop boys, herz aus glas not found!


 35%|███▍      | 403/1153 [06:38<05:08,  2.43it/s]

trio, herz ist trumpf (dann rufst du an ...) not found!


 35%|███▌      | 406/1153 [06:39<03:09,  3.95it/s]

kim wilde, hey mr. heartache not found!
s-express, hey music lover not found!


 36%|███▋      | 419/1153 [07:06<05:22,  2.27it/s]

the royal philharmonic orchestra (rpo), hooked on classics not found!


 38%|███▊      | 435/1153 [07:32<20:56,  1.75s/it]

howard carpendale, i don't want to be a hero not found!


 38%|███▊      | 437/1153 [07:32<12:06,  1.01s/it]

marc almond, i feel love not found!


 38%|███▊      | 439/1153 [07:33<08:21,  1.42it/s]

chrissie hynde, i got you babe not found!


 38%|███▊      | 443/1153 [07:33<04:24,  2.68it/s]

kim wilde, i heard a rumour not found!
michael jackson with siedah garrett, i just can't stop loving you not found!
u2, i just can't wait not found!


 40%|████      | 465/1153 [08:07<20:24,  1.78s/it]

glen goldsmith, i won't cry not found!


 41%|████      | 469/1153 [08:07<07:26,  1.53it/s]

jon, i'll find my way home not found!
vangelis, i'll find my way home not found!
boney m., i'm born again / bahama mama not found!


 41%|████      | 473/1153 [08:08<03:35,  3.15it/s]

eighth wonder, i'm not scared not found!
a-ha, i've been losing you not found!
peter griffin, i've lost my way not found!


 41%|████▏     | 476/1153 [08:09<03:22,  3.34it/s]

neue heimat, ich bau' dir ein schloß not found!


 42%|████▏     | 483/1153 [08:11<04:26,  2.52it/s]

nino de angelo, ich sterbe nicht noch mal not found!


 43%|████▎     | 494/1153 [08:32<10:45,  1.02it/s]

pepsi, if you let me stay not found!
shirlie, if you let me stay not found!
joe dolce, if you want to be happy not found!


 43%|████▎     | 497/1153 [08:34<07:30,  1.46it/s]

pete bardens, in dreams not found!


 44%|████▎     | 502/1153 [08:35<04:07,  2.63it/s]

tina turner, in the army now not found!


 44%|████▍     | 508/1153 [08:37<03:34,  3.01it/s]

orlando riva sound, indian reservation not found!
pat benatar, invincible (billie jean theme) not found!


 44%|████▍     | 510/1153 [08:38<05:01,  2.13it/s]

cock robin, irgendwann not found!


 45%|████▍     | 517/1153 [08:44<05:59,  1.77it/s]

bonnie tyler, islands not found!


 45%|████▌     | 520/1153 [09:00<32:00,  3.03s/it]

m.a.r.r.s., it doesn't have to be not found!


 46%|████▌     | 527/1153 [09:03<07:19,  1.43it/s]

barbara gaskin, it's my party not found!


 46%|████▌     | 531/1153 [09:05<04:56,  2.10it/s]

hot chocolate, italo boot mix vol. 7 not found!
nick kamen, italo boot mix vol. 8 not found!


 46%|████▋     | 536/1153 [09:06<02:33,  4.02it/s]

michael jackson with siedah garrett, jack mix ii not found!
jennifer rush, jack your body not found!
elton john, jack your body not found!


 47%|████▋     | 538/1153 [09:06<01:55,  5.33it/s]

falco, jeanny (part i) not found!


 47%|████▋     | 541/1153 [09:07<02:06,  4.83it/s]

club nouveau, jet airliner not found!


 47%|████▋     | 546/1153 [09:09<04:19,  2.34it/s]

depeche mode, johnny b not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
 50%|████▉     | 574/1153 [10:03<22:36,  2.34s/it]

mary roos und david hanselmann, lady not found!


 51%|█████▏    | 591/1153 [10:09<02:13,  4.22it/s]

guesch patti, let be must the queen not found!
ferry aid, let it be not found!
oran "juice" jones, let's dance not found!


 52%|█████▏    | 599/1153 [10:30<35:10,  3.81s/it]

peter maffay, lieber gott ... not found!


 52%|█████▏    | 600/1153 [10:30<26:03,  2.83s/it]

soulsister, like a mountain (remix) not found!


 53%|█████▎    | 608/1153 [10:33<04:56,  1.84it/s]

o.m.d., locomotion not found!
c. c. catch, looking for a new love not found!


 54%|█████▍    | 622/1153 [10:41<04:37,  1.92it/s]

ingrid kup, love what's your face not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
 55%|█████▍    | 629/1153 [11:02<26:55,  3.08s/it]

ricchi e poveri, m'innamoro di te not found!


 55%|█████▌    | 635/1153 [11:03<05:54,  1.46it/s]

o.m.d., maid of orleans (the waltz of joan of arc) not found!
boney m., malaika / consuela biaz not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
 56%|█████▌    | 645/1153 [11:08<03:45,  2.25it/s]

g'race, manhattan not found!
the hornettes, mannequin not found!


 56%|█████▋    | 650/1153 [11:09<01:50,  4.56it/s]

rocco, marina (remix '89) not found!
the carnations, marina (remix '89) not found!
gazebo, master piece not found!


 57%|█████▋    | 654/1153 [11:10<01:49,  4.54it/s]

leinemann, mein tuut tuut not found!


 57%|█████▋    | 656/1153 [11:30<35:06,  4.24s/it]

matchbox, midnight dynamoes not found!


 58%|█████▊    | 664/1153 [11:33<06:04,  1.34it/s]

bonnie bianco, miss you so not found!
pierre cosso, mit dir des wär mei leben not found!


 58%|█████▊    | 668/1153 [11:34<03:55,  2.06it/s]

phil carmen, moonshine still not found!


 59%|█████▊    | 677/1153 [11:38<02:25,  3.27it/s]

guillermo marchena, my love is a tango not found!
joe cocker, my pretty one not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
 59%|█████▉    | 686/1153 [11:41<01:59,  3.91it/s]

band für afrika, nackt im wind not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
 60%|██████    | 696/1153 [12:08<04:41,  1.63it/s]

tony holiday, nie mehr allein sein not found!
trude herr, niemals geht man so ganz not found!


 61%|██████    | 700/1153 [12:09<02:07,  3.56it/s]

inga, no longer friends not found!
anete humpe, no longer friends not found!


 61%|██████    | 705/1153 [12:10<01:56,  3.86it/s]

the jacksons, nothin' (that compares to you) not found!


 62%|██████▏   | 712/1153 [12:12<02:41,  2.72it/s]

roxy music, oh yeah (on the radio) not found!


 62%|██████▏   | 714/1153 [12:30<28:31,  3.90s/it]

michael schanze und die fußball-nationalmannschaft, ole espana not found!


 62%|██████▏   | 716/1153 [12:31<16:02,  2.20s/it]

michael mcdonald, on my own not found!


 63%|██████▎   | 729/1153 [12:35<02:39,  2.65it/s]

crowded house, open your heart not found!


 64%|██████▎   | 735/1153 [12:37<02:41,  2.59it/s]

phil lynott, out in the fields not found!


 64%|██████▍   | 739/1153 [12:39<03:01,  2.28it/s]

matchbox, over the rainbow - you belong to me (medley) not found!


 64%|██████▍   | 743/1153 [13:03<17:10,  2.51s/it]

the jonzun crew, pack jam (look out for the ovc) not found!


 65%|██████▍   | 748/1153 [13:05<05:13,  1.29it/s]

max headroom, paranoimia not found!


 65%|██████▌   | 750/1153 [13:07<05:27,  1.23it/s]

saragossa band, pas pleurer (please no more crying) not found!


 66%|██████▌   | 762/1153 [13:10<01:32,  4.21it/s]

mike oldfield feat. aled jones, anita hegerland, pictures in the dark not found!
barry palmer, pictures in the dark not found!


 66%|██████▋   | 765/1153 [13:11<01:38,  3.95it/s]

out of the ordinary, play it again not found!


 67%|██████▋   | 768/1153 [13:33<33:36,  5.24s/it]

united balls, pogo in togo not found!


 68%|██████▊   | 779/1153 [13:40<04:47,  1.30it/s]

technotronic feat. felly, pump up the jam not found!
m.a.r.r.s., pump up the volume not found!


 68%|██████▊   | 781/1153 [13:43<07:49,  1.26s/it]

annie lennox, put a little love in your heart not found!


 69%|██████▊   | 790/1153 [13:47<02:59,  2.03it/s]

the revolution, raspberry beret not found!


 69%|██████▊   | 792/1153 [13:48<02:11,  2.74it/s]

paul engemann, reach out not found!


 69%|██████▉   | 796/1153 [14:00<10:28,  1.76s/it]

richard sanderson, reality not found!
jackie wilson, reet petite (the sweetest girl in town) not found!


 70%|██████▉   | 806/1153 [14:09<03:08,  1.84it/s]

max werner, roadrunner not found!


 70%|███████   | 810/1153 [14:09<01:25,  4.01it/s]

tommi ohrner, rock 'n' roll in old blue jeans not found!
johnny nash, rock me baby not found!


 70%|███████   | 812/1153 [14:11<03:03,  1.86it/s]

tony carey, room with a view not found!


 71%|███████   | 816/1153 [14:12<02:02,  2.76it/s]

eruption, runaway not found!


 72%|███████▏  | 828/1153 [14:31<10:05,  1.86s/it]

robin beck, save up all your tears not found!


 72%|███████▏  | 834/1153 [14:35<03:00,  1.77it/s]

jule neigel band, schatten an der wand not found!
original naabtal duo, schutzengel, bleib bei mir not found!
o.m.d., secret not found!


 73%|███████▎  | 836/1153 [14:35<01:56,  2.71it/s]

purple schulz und die neue heimat, sehnsucht not found!


 73%|███████▎  | 839/1153 [14:35<01:15,  4.16it/s]

raff, self control not found!
stephan remmler, sempre, sempre not found!


 73%|███████▎  | 846/1153 [14:37<00:59,  5.15it/s]

mike oldfield and roger chapman, shadow on the wall not found!
salt 'n' pepa feat. e.u., shake your thang (it's your thing) not found!


 74%|███████▎  | 850/1153 [14:40<02:50,  1.78it/s]

les mckeown, she's a lady not found!


 74%|███████▍  | 853/1153 [14:41<01:51,  2.70it/s]

wendy fraser, she's like the wind not found!


 74%|███████▍  | 856/1153 [14:41<01:25,  3.49it/s]

the pins, shine up not found!


 75%|███████▍  | 862/1153 [15:01<10:07,  2.09s/it]

silicon dream, sign 'o' the times not found!
terence trent d'arby, sign your name not found!


 75%|███████▌  | 866/1153 [15:03<04:08,  1.16it/s]

o.m.d., so in love not found!


 76%|███████▌  | 872/1153 [15:05<02:28,  1.89it/s]

luisa fernandez, solo por ti not found!


 76%|███████▌  | 878/1153 [15:06<01:04,  4.29it/s]

prince mohammed, someone loves you honey not found!
marc almond feat. gene pitney, something's gotten hold of my heart not found!


 78%|███████▊  | 895/1153 [15:34<03:34,  1.20it/s]

No features
None
{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/4r8b3Hr0AMhzhAg75le0Gx'}, 'href': 'https://api.spotify.com/v1/artists/4r8b3Hr0AMhzhAg75le0Gx', 'id': '4r8b3Hr0AMhzhAg75le0Gx', 'name': 'Stars On 45', 'type': 'artist', 'uri': 'spotify:artist:4r8b3Hr0AMhzhAg75le0Gx'}], 'external_urls': {'spotify': 'https://open.spotify.com/album/3J8QtgfmfkgKzbKOE3eE4I'}, 'href': 'https://api.spotify.com/v1/albums/3J8QtgfmfkgKzbKOE3eE4I', 'id': '3J8QtgfmfkgKzbKOE3eE4I', 'images': [{'height': 640, 'url': 'https://i.scdn.co/image/ab67616d0000b273ea4918b07c54d0846ea692a1', 'width': 640}, {'height': 300, 'url': 'https://i.scdn.co/image/ab67616d00001e02ea4918b07c54d0846ea692a1', 'width': 300}, {'height': 64, 'url': 'https://i.scdn.co/image/ab67616d00004851ea4918b07c54d0846ea692a1', 'width': 64}], 'is_playable': True, 'name': '40 Years Anthology', 'release_date': '2022-09-09', 'release_date_precision': 'day', 'total_tracks': 20, 'type':

 78%|███████▊  | 896/1153 [15:36<05:18,  1.24s/it]

the jacksons with mick jagger, state of shock not found!


 78%|███████▊  | 903/1153 [15:37<01:14,  3.37it/s]

peter griffin, step by step not found!
sinitta, step by step not found!


 80%|████████  | 923/1153 [16:05<04:29,  1.17s/it]

s-express, superfly guy not found!


 80%|████████  | 928/1153 [16:06<01:44,  2.15it/s]

mixed emotions, sweetheart darling my dear (lisa my love) not found!


 81%|████████  | 935/1153 [16:30<22:34,  6.21s/it]

marti webb, take that look off your face not found!


 81%|████████▏ | 937/1153 [16:31<12:47,  3.55s/it]

o.m.d., talking loud and clear not found!


 81%|████████▏ | 939/1153 [16:31<07:58,  2.24s/it]

jawoll, taxi not found!


 82%|████████▏ | 942/1153 [16:33<05:07,  1.46s/it]

robin beck, tears in the rain not found!


 82%|████████▏ | 945/1153 [16:34<02:49,  1.23it/s]

terence trent d'arby, tell me why not found!


 82%|████████▏ | 949/1153 [16:35<01:34,  2.16it/s]

cannot convert the series to <class 'int'>
Tears Of Ice
Bolland
1645    1371
1646    1371
Name: Points, dtype: int64
-------
cannot convert the series to <class 'int'>
Tears Of Ice
Bolland
1645    1371
1646    1371
Name: Points, dtype: int64
-------


 83%|████████▎ | 956/1153 [16:39<01:22,  2.38it/s]

tolga flim flam balkan, the best of joint mix not found!


 83%|████████▎ | 962/1153 [16:40<00:45,  4.15it/s]

gary bird, the crown not found!
the g.b. experience, the crown not found!


 84%|████████▎ | 964/1153 [16:41<01:18,  2.42it/s]

michael jackson, the great commandment not found!


 85%|████████▍ | 976/1153 [17:06<01:40,  1.76it/s]

the s.o.s. band, the official bootleg mega mix not found!
the plastic population, the only way is up not found!
exile, the part of me that needs you most not found!


 85%|████████▌ | 981/1153 [17:10<02:08,  1.34it/s]

fat boys with stupid def vocals by chubby checker, the twist (yo, twist) not found!


 86%|████████▌ | 992/1153 [17:30<13:23,  4.99s/it]

pil, this is not a love song not found!


 87%|████████▋ | 998/1153 [17:33<02:41,  1.04s/it]

christian bruhn, timms thema not found!


 87%|████████▋ | 1002/1153 [17:34<01:07,  2.22it/s]

mel brooks, to be or not to be (the hitler rap) pt. 1 not found!
mike oldfield (vocals: maggie reilly), to france not found!


 87%|████████▋ | 1006/1153 [17:34<00:42,  3.43it/s]

patrick duffy, together we're strong not found!


 88%|████████▊ | 1017/1153 [17:39<00:42,  3.22it/s]

bon jovi, trouble not found!
wolf maahn und unterstützung, tschernobyl (das letzte signal) not found!
alan sorrenti, tu sei l'unica donne per me not found!


 89%|████████▊ | 1022/1153 [17:40<00:37,  3.52it/s]

moses p., twilight zone not found!


 89%|████████▉ | 1028/1153 [18:02<04:46,  2.29s/it]

eddy huntington, u.s.s.r. not found!
cliff richard, unchain my heart not found!


 90%|████████▉ | 1032/1153 [18:03<01:52,  1.07it/s]

lee majors, unknown stuntman not found!


 90%|████████▉ | 1037/1153 [18:06<01:02,  1.85it/s]

die flippers, victory not found!


 91%|█████████ | 1052/1153 [18:38<04:18,  2.56s/it]

the bootzilla orchestra, waltz darling not found!
alison moyet, warriors of the wasteland not found!


 91%|█████████▏| 1053/1153 [18:38<03:19,  2.00s/it]

christian franke, was wäre wenn â€¦ not found!


 92%|█████████▏| 1057/1153 [18:39<01:22,  1.16it/s]

d. mob feat. gary haisman, we call it acieed not found!


 92%|█████████▏| 1061/1153 [18:40<00:35,  2.57it/s]

sandra, we'll be together ('89 remix) not found!
udo lindenberg, weak in the presence of beauty not found!


 92%|█████████▏| 1063/1153 [18:40<00:30,  2.93it/s]

No features
None
{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/39JBLI1Kj41yxv00pd2KNB'}, 'href': 'https://api.spotify.com/v1/artists/39JBLI1Kj41yxv00pd2KNB', 'id': '39JBLI1Kj41yxv00pd2KNB', 'name': 'Earth Frequencies', 'type': 'artist', 'uri': 'spotify:artist:39JBLI1Kj41yxv00pd2KNB'}, {'external_urls': {'spotify': 'https://open.spotify.com/artist/229Pj9JXTfFKS4NhYNTwDs'}, 'href': 'https://api.spotify.com/v1/artists/229Pj9JXTfFKS4NhYNTwDs', 'id': '229Pj9JXTfFKS4NhYNTwDs', 'name': '432 Hz Frequencies', 'type': 'artist', 'uri': 'spotify:artist:229Pj9JXTfFKS4NhYNTwDs'}], 'external_urls': {'spotify': 'https://open.spotify.com/album/6kE9gKvW23xPq3jUf4VDiF'}, 'href': 'https://api.spotify.com/v1/albums/6kE9gKvW23xPq3jUf4VDiF', 'id': '6kE9gKvW23xPq3jUf4VDiF', 'images': [{'height': 640, 'url': 'https://i.scdn.co/image/ab67616d0000b27319c1bb1e8d36cbd1f54662a4', 'width': 640}, {'height': 300, 'url': 'https://i.scdn.co/image/ab67616d000

 94%|█████████▎| 1080/1153 [19:05<00:50,  1.44it/s]

dusty springfield, what have i done to deserve this? not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
 95%|█████████▍| 1091/1153 [19:12<00:39,  1.56it/s]

andreas martin, when a man loves a woman not found!
the revolution, when doves cry not found!


 95%|█████████▌| 1099/1153 [19:31<03:22,  3.76s/it]

ferry aid, where are you? not found!


  points = chart_power_df_explode.loc[(chart_power_df_explode.Song == track_name.lower()) & (chart_power_df_explode.Artist.str.contains(artist_names.lower()))]['Points']
 96%|█████████▌| 1107/1153 [19:34<00:27,  1.66it/s]

mike anthony, why can't we live together not found!


 96%|█████████▋| 1112/1153 [19:36<00:19,  2.16it/s]

die schlümpfe, willi, willi (kinderreime) not found!
terence trent d'arby, wishing well not found!


 97%|█████████▋| 1121/1153 [19:41<00:13,  2.41it/s]

baltimora, woodie boogie not found!
video kids, woodpeckers from space not found!


 97%|█████████▋| 1122/1153 [19:41<00:11,  2.59it/s]

the detroit spinners, working my way back to you - forgive me, girl (medley) not found!


 98%|█████████▊| 1125/1153 [19:42<00:10,  2.70it/s]

pascal, wozu sind kriege da? not found!


 98%|█████████▊| 1133/1153 [20:02<00:30,  1.52s/it]

heidi brühl, you are a part of my heart not found!


 99%|█████████▉| 1142/1153 [20:07<00:05,  2.04it/s]

jona lewie, you'll always find me in the kitchen at parties not found!
peter kent, you're all i need not found!


100%|██████████| 1153/1153 [20:10<00:00,  1.05s/it]


In [151]:
df = pd.read_csv(f"{PATH}/data-missing.csv")

Analysieren der Ergebnisse:

In [78]:
df.shape

(68371, 25)

Es wurden ~21.000 Songs erfragt bei gerade einmal 1153 Kombinationen. Eventuell macht es doch Sinn, das Jahr zu reduzieren. Es werden alle Songs ab dem Jahr 1990 herausgefiltert, da diese nicht in den Charts der 80er existieren konnten.

In [153]:
# Erstellen eines neuen Features "year"
df["year"] = df["release_date"].apply(lambda x: int(x[:4]))

In [154]:
df = df[df.year < 1990]
df.shape

(1438, 26)

In [155]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,artists,album,release_date,release_date_precision,spotify_id,chart_power,uri,popularity,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,isrc,year
1,1,19 - Destruction Mix,Paul Hardcastle,Paul Hardcastle,1985-01-01,day,1AzIfmCVW3orhfmdxmCRo1,,spotify:track:1AzIfmCVW3orhfmdxmCRo1,35,...,0.0816,0.00296,0.62,0.0501,0.729,117.859,428667,4,GBAYK8500158,1985
3,3,19 - New Version,Paul Hardcastle,Paul Hardcastle,1985-01-01,day,6B2Tn5vzAFhJUuBG3N77op,,spotify:track:6B2Tn5vzAFhJUuBG3N77op,22,...,0.0681,0.00615,0.246,0.412,0.362,117.901,378240,4,GBAYK0700009,1985
5,5,19 - The Final Story,Paul Hardcastle,Paul Hardcastle,1985-01-01,day,6vLlYsgxxQCynGiEkcgELZ,,spotify:track:6vLlYsgxxQCynGiEkcgELZ,17,...,0.0873,0.00133,0.121,0.28,0.626,117.971,286933,4,GBAYK0700168,1985
10,10,19 - The Final Story Requiem,Paul Hardcastle,Paul Hardcastle,1985-01-01,day,4yEyIFA2DhKbDIOMB0dOBF,,spotify:track:4yEyIFA2DhKbDIOMB0dOBF,13,...,0.0964,0.0468,0.0886,0.42,0.545,116.874,185560,4,GBAYK0700169,1985
74,74,Earth from Space / Ground Zero / No Winners / ...,Paul Hardcastle,No Winners,1988-01-01,day,0XrnWmiSCL5TJtclcfFvwM,,spotify:track:0XrnWmiSCL5TJtclcfFvwM,4,...,0.069,0.0476,0.00358,0.335,0.298,120.022,1889840,4,GBAYK0800041,1988


Dies hat die Anzahl der Songs von 21098 auf 1438 reduziert.

In [156]:
df = df.applymap(lambda s: s.lower() if type(s) == str else s)
df_songs = list(df["name"])
df_artists = list(df["artists"])
df_songs_artists = list(zip(df_songs, df_artists))

In [157]:
not_in_df = [[song, artist] for song, artist in chart_songs_artists if (song, artist) not in df_songs_artists]
print(f"Insgesamt sind {len(not_in_df)} von {len(chart_songs_artists)} Songs nicht im DataFrame enthalten")

Insgesamt sind 1978 von 2027 Songs nicht im DataFrame enthalten


In [217]:
res = sp.search(q="Zob Marley", type="track", market="DE")

In [219]:
res["tracks"]["items"]

[{'album': {'album_type': 'album',
   'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0KYZ0U8VUXgIscN7Scodd2'},
     'href': 'https://api.spotify.com/v1/artists/0KYZ0U8VUXgIscN7Scodd2',
     'id': '0KYZ0U8VUXgIscN7Scodd2',
     'name': 'Jah Marley',
     'type': 'artist',
     'uri': 'spotify:artist:0KYZ0U8VUXgIscN7Scodd2'}],
   'external_urls': {'spotify': 'https://open.spotify.com/album/7gLQR5W5eqZYKZRQHRRQTi'},
   'href': 'https://api.spotify.com/v1/albums/7gLQR5W5eqZYKZRQHRRQTi',
   'id': '7gLQR5W5eqZYKZRQHRRQTi',
   'images': [{'height': 640,
     'url': 'https://i.scdn.co/image/ab67616d0000b27338a5a4c647dc803c1cc45ff6',
     'width': 640},
    {'height': 300,
     'url': 'https://i.scdn.co/image/ab67616d00001e0238a5a4c647dc803c1cc45ff6',
     'width': 300},
    {'height': 64,
     'url': 'https://i.scdn.co/image/ab67616d0000485138a5a4c647dc803c1cc45ff6',
     'width': 64}],
   'is_playable': True,
   'name': 'Solitaire',
   'release_date': '2018-12-01',

In [158]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,artists,album,release_date,release_date_precision,spotify_id,chart_power,uri,popularity,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,isrc,year
1,1,19 - destruction mix,paul hardcastle,paul hardcastle,1985-01-01,day,1azifmcvw3orhfmdxmcro1,,spotify:track:1azifmcvw3orhfmdxmcro1,35,...,0.0816,0.00296,0.62,0.0501,0.729,117.859,428667,4,gbayk8500158,1985
3,3,19 - new version,paul hardcastle,paul hardcastle,1985-01-01,day,6b2tn5vzafhjuubg3n77op,,spotify:track:6b2tn5vzafhjuubg3n77op,22,...,0.0681,0.00615,0.246,0.412,0.362,117.901,378240,4,gbayk0700009,1985
5,5,19 - the final story,paul hardcastle,paul hardcastle,1985-01-01,day,6vllysgxxqcyngiekcgelz,,spotify:track:6vllysgxxqcyngiekcgelz,17,...,0.0873,0.00133,0.121,0.28,0.626,117.971,286933,4,gbayk0700168,1985
10,10,19 - the final story requiem,paul hardcastle,paul hardcastle,1985-01-01,day,4yeyifa2dhkbdiomb0dobf,,spotify:track:4yeyifa2dhkbdiomb0dobf,13,...,0.0964,0.0468,0.0886,0.42,0.545,116.874,185560,4,gbayk0700169,1985
74,74,earth from space / ground zero / no winners / ...,paul hardcastle,no winners,1988-01-01,day,0xrnwmiscl5tjtclcffvwm,,spotify:track:0xrnwmiscl5tjtclcffvwm,4,...,0.069,0.0476,0.00358,0.335,0.298,120.022,1889840,4,gbayk0800041,1988


In [159]:
with open(f"{PATH}/data-missing-problems.json", "r") as file:
  data_missing_problems = json.load(file)

In [160]:
artists_not_found = data_missing_problems["artist_not_found"]

In [164]:
len(artists_not_found)

263