# Extra Aufgaben

## Filterung der Datenbank um irrelevante Ergebnisse zu verhindern

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, polynomial_kernel, sigmoid_kernel, rbf_kernel, laplacian_kernel, chi2_kernel, euclidean_distances, manhattan_distances, cosine_distances
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
import time
from ast import literal_eval

In [27]:
by_isrc = pd.read_csv('data/by_isrc.csv')

In [28]:
by_isrc["country_code"] = by_isrc["isrc"].apply(lambda x: x[:2])

In [29]:
by_isrc["country_code"].unique()

array(['AE', 'AR', 'AT', 'AU', 'AZ', 'BA', 'BC', 'BE', 'BG', 'BR', 'BX',
       'Br', 'CA', 'CH', 'CI', 'CL', 'CN', 'CO', 'CS', 'CZ', 'DE', 'DG',
       'DK', 'EE', 'EG', 'ES', 'FI', 'FR', 'FX', 'GB', 'GN', 'GR', 'GT',
       'GX', 'HK', 'HR', 'HU', 'ID', 'IE', 'IL', 'IN', 'IR', 'IS', 'IT',
       'JM', 'JP', 'KE', 'KR', 'KS', 'LB', 'LK', 'LT', 'LU', 'LV', 'MD',
       'MM', 'MT', 'MU', 'MX', 'MY', 'NG', 'NI', 'NL', 'NO', 'NZ', 'No',
       'PE', 'PH', 'PL', 'PT', 'QM', 'QZ', 'RE', 'RO', 'RS', 'RU', 'SE',
       'SG', 'SI', 'SK', 'SM', 'SW', 'TC', 'TH', 'TR', 'TW', 'UA', 'UK',
       'UR', 'US', 'UY', 'VE', 'VN', 'ZA', 'ZZ', 'ae', 'ca', 'gb', 'il',
       'lv', 'nl', 'qz', 'us', 've'], dtype=object)

Einige Ländercodes sind kleingeschrieben. Diese sollen in Großbuchstaben umgewandelt werden.

In [30]:
by_isrc["country_code"] = by_isrc["country_code"].str.upper()

In [31]:
by_isrc["country_code"].unique()

array(['AE', 'AR', 'AT', 'AU', 'AZ', 'BA', 'BC', 'BE', 'BG', 'BR', 'BX',
       'CA', 'CH', 'CI', 'CL', 'CN', 'CO', 'CS', 'CZ', 'DE', 'DG', 'DK',
       'EE', 'EG', 'ES', 'FI', 'FR', 'FX', 'GB', 'GN', 'GR', 'GT', 'GX',
       'HK', 'HR', 'HU', 'ID', 'IE', 'IL', 'IN', 'IR', 'IS', 'IT', 'JM',
       'JP', 'KE', 'KR', 'KS', 'LB', 'LK', 'LT', 'LU', 'LV', 'MD', 'MM',
       'MT', 'MU', 'MX', 'MY', 'NG', 'NI', 'NL', 'NO', 'NZ', 'PE', 'PH',
       'PL', 'PT', 'QM', 'QZ', 'RE', 'RO', 'RS', 'RU', 'SE', 'SG', 'SI',
       'SK', 'SM', 'SW', 'TC', 'TH', 'TR', 'TW', 'UA', 'UK', 'UR', 'US',
       'UY', 'VE', 'VN', 'ZA', 'ZZ'], dtype=object)

Gruppierung der Songs nach den Ländercodes.

In [32]:
countries = {
    'AE': 'United Arab Emirates',
    'AR': 'Argentina',
    'AT': 'Austria',
    'AU': 'Australia',
    'AZ': 'Azerbaijan',
    'BA': 'Bosnia and Herzegovina',
    'BC': 'Canada (British Columbia)',
    'BE': 'Belgium',
    'BG': 'Bulgaria',
    'BR': 'Brazil',
    'BX': 'Benelux',
    'CA': 'Canada',
    'CH': 'Switzerland',
    'CI': 'Ivory Coast (Côte d\'Ivoire)',
    'CL': 'Chile',
    'CN': 'China',
    'CO': 'Colombia',
    'CS': 'Serbia and Montenegro',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'DG': 'Germany (East)',
    'DK': 'Denmark',
    'EE': 'Estonia',
    'EG': 'Egypt',
    'ES': 'Spain',
    'FI': 'Finland',
    'FR': 'France',
    'FX': 'France (Metropolitan)',
    'GB': 'United Kingdom',
    'GN': 'Guinea',
    'GR': 'Greece',
    'GT': 'Guatemala',
    'GX': 'Greece (Ancient)',
    'HK': 'Hong Kong',
    'HR': 'Croatia',
    'HU': 'Hungary',
    'ID': 'Indonesia',
    'IE': 'Ireland',
    'IL': 'Israel',
    'IN': 'India',
    'IR': 'Iran',
    'IS': 'Iceland',
    'IT': 'Italy',
    'JM': 'Jamaica',
    'JP': 'Japan',
    'KE': 'Kenya',
    'KR': 'South Korea',
    'KS': 'Kosovo',
    'LB': 'Lebanon',
    'LK': 'Sri Lanka',
    'LT': 'Lithuania',
    'LU': 'Luxembourg',
    'LV': 'Latvia',
    'MD': 'Moldova',
    'MM': 'Myanmar (Burma)',
    'MT': 'Malta',
    'MU': 'Mauritius',
    'MX': 'Mexico',
    'MY': 'Malaysia',
    'NG': 'Nigeria',
    'NI': 'Nicaragua',
    'NL': 'Netherlands',
    'NO': 'Norway',
    'NZ': 'New Zealand',
    'PE': 'Peru',
    'PH': 'Philippines',
    'PL': 'Poland',
    'PT': 'Portugal',
    'QM': 'Marshall Islands', #second country code for the United States
    'QZ': 'Unknown or Invalid Area',
    'RE': 'Réunion',
    'RO': 'Romania',
    'RS': 'Serbia',
    'RU': 'Russia',
    'SE': 'Sweden',
    'SG': 'Singapore',
    'SI': 'Slovenia',
    'SK': 'Slovakia',
    'SM': 'San Marino',
    'SW': 'Sweden',
    'TC': 'Turks and Caicos Islands',
    'TH': 'Thailand',
    'TR': 'Turkey',
    'TW': 'Taiwan',
    'UA': 'Ukraine',
    'UK': 'United Kingdom',
    'UR': 'Uruguay',
    'US': 'United States',
    'UY': 'Uruguay',
    'VE': 'Venezuela',
    'VN': 'Vietnam',
    'ZA': 'South Africa',
    'ZZ': 'Unknown or Invalid Area'
}

In [33]:
for c in by_isrc["country_code"].unique():
    if c not in countries:
        print(c)

In [43]:
by_isrc["country"] = by_isrc["country_code"].apply(lambda x: countries[x])

In [44]:
grouped_countries = by_isrc.copy()
grouped_countries = grouped_countries[["country_code", "country"]]
grouped_countries["num_countries"] = 0
grouped_countries = grouped_countries.groupby(["country_code", "country"]).count()

In [45]:
grouped_countries = grouped_countries.sort_values("num_countries", ascending=False)

In [46]:
grouped_countries.reset_index(inplace=True)

In [54]:
grouped_countries["country_comb"] = grouped_countries.apply(lambda x: f'{x["country"]}-{x["country_code"]}', axis=1)

In [53]:
fig = px.bar(grouped_countries[:30], x="country_comb", y="num_countries", template="plotly_dark")
fig.update_layout(
    height=500,
)
fig.show()

Es sollen alle Lieder entfernt werden welche nicht aus englischsprachigen Länger (US, UK, CA, AU, Neuseeland) sowie Deutschland, Schweiz, Österreich, Italien und Schweden stammen.

In [87]:
important_country_codes = ["US", "GB", "CA", "AU", "NZ", "DE", "CH", "AT", "IT", "SE"]

# kleinere Inselgegenden von Amerika: Baker Island, Howland Island, Jarvis Island, Johnston Atoll, Kingman Reef, Midway Islands, Navassa Island, Palmyra Atoll, Wake Island
minor_insular_areas_us = ["XB", "XH", "XQ", "XU", "XM", "QM", "XV", "XL", "QW"]

`QM` wird heute als zusätzlicher Country code für Amerika verwendet, da der Code `US` sein Limit erreicht hat. Da dieser Beschluss jedoch aus 2010 stammt, sollten die Lieder der 80er nicht beeinflusst werden?

In [90]:
by_isrc[by_isrc["country_code"] == "QM"]

Unnamed: 0,isrc,genres,name,artists,album,release_date,release_date_precision,uri,spotify_id,chart_power,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,country_code,country
260929,QM2PV1614171,"['industrial', 'rock']",all saints day,severed heads,Rotund for Success,1988-10-31,day,spotify:track:2eLRr44vcdLTrNFLB9GRZZ,2eLRr44vcdLTrNFLB9GRZZ,,...,0.000346,0.637000,0.0791,0.7450,119.955,323128,4,1988,QM,Marshall Islands
260930,QM2PV1614172,"['industrial', 'rock']",triangle tangle tango,severed heads,Rotund for Success,1988-10-31,day,spotify:track:1g9jtzoxCrQV7PM0zZQVSE,1g9jtzoxCrQV7PM0zZQVSE,,...,0.154000,0.820000,0.1140,0.8790,118.544,247445,4,1988,QM,Marshall Islands
260931,QM2PV1614173,"['industrial', 'punk', 'punk-rock', 'rock']",bad times too,severed heads,Rotund for Success,1988-10-31,day,spotify:track:56cD9Fe0EaHChJvubuxoF2,56cD9Fe0EaHChJvubuxoF2,,...,0.001350,0.000576,0.1870,0.5750,128.701,240800,4,1988,QM,Marshall Islands
260932,QM2PV1614174,"['industrial', 'punk', 'punk-rock', 'rock']",midget sings,severed heads,Rotund for Success,1988-10-31,day,spotify:track:3U9SjWleTJyGXzMVeKlksN,3U9SjWleTJyGXzMVeKlksN,,...,0.083000,0.834000,0.0779,0.9780,126.927,197038,4,1988,QM,Marshall Islands
260933,QM2PV1614175,"['industrial', 'punk', 'punk-rock', 'rock']",seven miles,severed heads,Rotund for Success,1988-10-31,day,spotify:track:3gwsxQ3A4qbq7ryhRLwr4B,3gwsxQ3A4qbq7ryhRLwr4B,,...,0.009730,0.178000,0.6160,0.5770,139.642,231369,4,1988,QM,Marshall Islands
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275322,QMZUC1500044,['jazz'],song for kinyatta,alexander zonjic,Romance With You,1989-04-12,day,spotify:track:0dJeVAevdgkItt6jz6PwQb,0dJeVAevdgkItt6jz6PwQb,,...,0.161000,0.782000,0.0719,0.9090,124.924,318627,4,1989,QM,Marshall Islands
275323,QMZUC1500045,['jazz'],shawnee,alexander zonjic,Romance With You,1989-04-12,day,spotify:track:1fc4wC7HXS1cKvVmKr1tOp,1fc4wC7HXS1cKvVmKr1tOp,,...,0.262000,0.550000,0.2370,0.6050,99.608,415507,4,1989,QM,Marshall Islands
275324,QMZUC1500046,['jazz'],altos de chavon,alexander zonjic,Romance With You,1989-04-12,day,spotify:track:2Us8sWKEaFRUCMTMVUoCai,2Us8sWKEaFRUCMTMVUoCai,,...,0.562000,0.179000,0.1280,0.7920,89.535,377333,4,1989,QM,Marshall Islands
275325,QMZUC1500047,['jazz'],romance with you,alexander zonjic,Romance With You,1989-04-12,day,spotify:track:5upxN1EgFRQq5YZ8chMP4q,5upxN1EgFRQq5YZ8chMP4q,,...,0.939000,0.894000,0.0787,0.5480,80.818,408227,4,1989,QM,Marshall Islands


In [88]:
important_country_codes.extend(minor_insular_areas_us)

In [89]:
reduced_isrc = by_isrc.copy()
reduced_isrc = reduced_isrc[reduced_isrc["country_code"].isin(important_country_codes)]
reduced_isrc["genres"] = reduced_isrc["genres"].apply(literal_eval)