In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, polynomial_kernel, sigmoid_kernel, rbf_kernel, laplacian_kernel, chi2_kernel, euclidean_distances, manhattan_distances

In [None]:
# Load the raw track table; the grouped frames and the genre crosstab below are derived from `df`.
df = pd.read_csv('data/DE/data-neu.csv')

In [None]:
df.head()

In [None]:
df['genres'].unique()

Check for duplicate values

In [None]:
df.shape

In [None]:
# Remove fully identical rows in place; the df.shape cells before and after show the effect.
df.drop_duplicates(inplace=True)

In [None]:
df.shape

There were a lot of duplicate rows.

Check for null values

In [None]:
df.isna().any()

The null values for the Chart Power are valid, because not every song was in the Charts.

In [None]:
df.columns

First, group the songs by Spotify ID. This brings together all rows that share a Spotify ID but differ in genre.

In [None]:
# Collapse all rows that share a spotify_id into a single row. Every listed
# column keeps the values of the grouped rows as a Python list.
by_spotify_id = (
    df.copy()
    .groupby('spotify_id')
    .agg(dict.fromkeys(
        [
            'genres', 'name', 'artists', 'album',
            'release_date', 'release_date_precision', 'uri', 'isrc',
            'chart_power', 'popularity', 'danceability', 'energy',
            'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence',
            'tempo', 'duration_ms', 'time_signature',
        ],
        list,
    ))
)

In [None]:
by_spotify_id.head()

Then reduce all categorical features such that there are no duplicated values in an instance for one feature.

In [None]:
def reduce_list(elements, string_return = True):
    '''
    Drops repeated elements from a list while keeping the order of first appearance.

    Parameter
    ---------
    elements: list
        List whose duplicates should be removed

    string_return: boolean; default=True
        If True and exactly one distinct element remains, that element itself
        is returned instead of a one-element list

    Return
    ------
    unique_elements: list or single element

    '''
    seen = []
    for item in elements:
        # Membership test on a list (not a set) so unhashable items keep working.
        if item in seen:
            continue
        seen.append(item)
    collapse_to_single = string_return and len(seen) == 1
    return seen[0] if collapse_to_single else seen

In [None]:
# by_spotify_id['name'] = by_spotify_id['name'].agg(reduce_list)
# by_spotify_id['isrc'] = by_spotify_id['isrc'].agg(reduce_list)

In [None]:
by_spotify_id.head().T

In [None]:
def get_release_year_index(release_dates):
    '''
    Finds the position of the earliest release year in a list of dates.

    Parameter
    ---------
    release_dates: List
        Release dates as strings whose first four characters are the year.

    Return
    ------
    min_index: number
        Position of the smallest year; on ties the first occurrence is returned.
    '''
    # Only the year part is compared, month and day are ignored.
    years = [int(release_date[:4]) for release_date in release_dates]
    return np.array(years).argmin()

In [None]:
def select_oldest_song(df, isrc_flag=False):
    '''
    Keeps, for every grouped row, only the values that belong to the oldest release.

    The position of the earliest release year is determined with
    get_release_year_index and every list-valued feature of the row is
    replaced by the element at that position. Rows whose 'release_date' is not
    a list, and individual cells that are not lists, are left unchanged.

    Parameter
    ---------
    df: pd.DataFrame
        DataFrame whose cells hold lists of values. It is not modified.

    isrc_flag: boolean; default=False
        If True, 'genres' and 'spotify_id' are reduced as well.
        If False, 'isrc' is reduced instead.

    Return
    ------
    df_copy: pd.DataFrame
        Copy of df with the selected values.
    '''
    features = ['artists', 'album', 'release_date_precision', 'uri', 'release_date', 'chart_power', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
                'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'name']

    if isrc_flag:
        features.append('genres')
        features.append('spotify_id')
    else:
        features.append('isrc')

    df_copy = df.copy()
    for index, instance in df_copy.iterrows():
        if not isinstance(instance['release_date'], list):
            continue
        min_index = get_release_year_index(instance['release_date'])
        for feature in features:
            value = instance[feature]
            # Check the type of the value itself: `type(value == list)` is
            # always truthy and would let non-list cells be indexed as well.
            if isinstance(value, list):
                # .at sets exactly one cell, also when the selected element is itself a list.
                df_copy.at[index, feature] = value[min_index]
    return df_copy

In [None]:
# Reduce every per-spotify_id list to the value of the oldest release ('genres' stays a list here).
by_spotify_id = select_oldest_song(by_spotify_id)

In [None]:
by_spotify_id.head().T

In [None]:
# by_spotify_id.to_csv('data/checkpoint/by_spotify_id_oldest.csv')

In [None]:
# by_spotify_id = pd.read_csv('data/checkpoint/by_spotify_id_oldest.csv')

The next step is to group all the songs by their ISRC, since this should be the unique identifier.

In [None]:
# Turn the spotify_id index back into a column, then collapse all rows that
# share an isrc. Every listed column keeps the grouped values as a Python list.
by_isrc = (
    by_spotify_id
    .reset_index()
    .groupby('isrc')
    .agg(dict.fromkeys(
        [
            'genres', 'name', 'artists', 'album',
            'release_date', 'release_date_precision', 'uri', 'spotify_id',
            'chart_power', 'popularity', 'danceability', 'energy',
            'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence',
            'tempo', 'duration_ms', 'time_signature',
        ],
        list,
    ))
)

In [None]:
by_isrc.head().T

In [None]:
# Reduce every per-isrc list to the value of the oldest release; the flag also reduces 'genres' and 'spotify_id'.
by_isrc = select_oldest_song(by_isrc, True)

In [None]:
by_isrc.head().T

In [None]:
# by_isrc.to_csv('data/checkpoint/by_isrc_oldest.csv')

In [None]:
# by_isrc = pd.read_csv('data/checkpoint/by_isrc_oldest.csv')
# by_isrc.set_index('isrc', inplace=True)

In [None]:
# special_cases = by_isrc.copy()
# special_cases['name'] = special_cases['name'].agg(reduce_list, string_return=False)
# special_cases = special_cases[special_cases['name'].apply(len) > 1]

In [None]:
# special_cases.shape

Example of a special case

In [None]:
# df[df.isrc == 'AUCI10753909']

Filtering the special cases: only songs from Germany, the USA, the UK, Italy and Sweden are really relevant. Therefore, all other songs are filtered out first.

In [None]:
# country_codes = ['DE', 'IT', 'GB', 'US', 'SE']

In [None]:
# relevant_special_cases = pd.DataFrame()
# sum_entries = 0
# for code in country_codes:
#     rsc_country = special_cases[special_cases.index.str.startswith(code)]
#     sum_entries += rsc_country.shape[0]
#     relevant_special_cases = pd.concat([relevant_special_cases, rsc_country])
# sum_entries == relevant_special_cases.shape[0]

In [None]:
# print(f'Old shape: {special_cases.shape}')
# special_cases.drop(index=list(relevant_special_cases.index.values), inplace=True)
# print(f'New shape: {special_cases.shape}')

In [None]:
# relevant_special_cases.shape[0] + special_cases.shape[0]

In [None]:
# relevant_special_cases.head()

In [None]:
# relevant_special_cases

In [None]:
# indices = special_cases.index

In [None]:
# by_isrc_copyy = by_isrc_copyy.drop(index=indices.values)

In [None]:
# s = by_isrc_copyy['name'].apply(lambda x: type(x) != str)

In [None]:
# by_isrc.loc[s.values]

In [None]:
# The release year is the first four characters of the release date string.
by_isrc['year'] = [int(release_date[:4]) for release_date in by_isrc['release_date']]

In [None]:
by_isrc['year'].unique()

In [None]:
# Collect the index labels of all songs released in 2023 and remove those rows in place.
index_songs_from_2023 = by_isrc.index[by_isrc['year'] == 2023].tolist()
by_isrc.drop(index=index_songs_from_2023, inplace=True)

In [None]:
by_isrc['year'].unique()

In [None]:
# Store the precision as plain strings.
by_isrc['release_date_precision'] = by_isrc['release_date_precision'].astype(str)

In [None]:
# Checkpoint: write the per-isrc table (after year filtering) to disk.
by_isrc.to_csv('data/checkpoint/by_isrc_oldest.csv')

Create a DataFrame which contains only the relevant features for the recommender system

In [None]:
# Remove every column listed below; the remaining columns are the inputs of the recommender.
df_recommender = by_isrc.drop(
    columns=[
        'artists',
        'genres',
        'album',
        'release_date',
        'release_date_precision',
        'chart_power',
        'uri',
        'popularity',
        'name',
        'spotify_id',
    ]
)

Drop duplicate values..

In [None]:
df_recommender.shape

In [None]:
# Remove rows with identical feature values in place. The isrc index is not part of the
# comparison, so different isrcs with the same feature values collapse to the first one.
df_recommender.drop_duplicates(inplace=True)

In [None]:
df_recommender.shape

There were some duplicates..

In [None]:
df_recommender.head().T

Scale the data so that every feature has the same influence.

In [None]:
# Fit the scaler on the features and wrap the scaled array in a DataFrame
# that reuses the original column names and index.
scaler = MinMaxScaler()
df_recommender_scaled = pd.DataFrame(
    scaler.fit_transform(df_recommender),
    columns=df_recommender.columns,
    index=df_recommender.index,
)

In [None]:
# Move the isrc index into a regular column of both frames; the scaled frame is merged on 'isrc' below.
# NOTE: not idempotent - running this cell a second time adds an extra 'index' column.
df_recommender_scaled.reset_index(inplace=True)
df_recommender.reset_index(inplace=True)

In [None]:
df_recommender_scaled.head().T

In [None]:
df_recommender_scaled.describe().T[['min', 'max']]

Convert every genre into a feature column: the column contains 1 if the song belongs to that genre and 0 otherwise.

In [None]:
# One row per isrc, one column per genre. crosstab counts how often each
# (isrc, genre) pair occurs in df; clipping the counts at 1 turns them into
# 0/1 membership flags. clip is vectorized and replaces the element-wise
# applymap, which is deprecated in recent pandas versions.
ct = pd.crosstab(df['isrc'], df['genres']).clip(upper=1)
# Make isrc a regular column so it can serve as the merge key.
ct.reset_index(inplace=True)

In [None]:
display(ct.head().T)
ct.shape

In [None]:
ct.describe()

In [None]:
# Sanity check on the genre flags: every described column must have minimum 0 and maximum 1.
ctMinMax = ct.describe().T
if (ctMinMax['min'].min() == 0) & (ctMinMax['min'].max() == 0) & (ctMinMax['max'].min() == 1) & (ctMinMax['max'].max() == 1):
    print('Values are all scaled between 0 and 1')
else:
    print('Values are not scaled correctly')

Merge both DataFrames together to create the Recommender System.

In [None]:
# Right join: keep exactly the songs of the scaled feature table and attach their genre flags.
ct_merged = pd.merge(ct, df_recommender_scaled, how='right', on='isrc')

In [None]:
ct_merged.isna().any().sum()

In [None]:
# Use isrc as the index again so that only feature columns remain as data.
ct_merged.set_index(['isrc'], inplace=True)

In [None]:
display(ct_merged.head().T)
ct_merged.shape

In [None]:
# Checkpoint: write the merged feature matrix (genre flags + scaled features) to disk.
ct_merged.to_csv('data/checkpoint/ct_merged.csv')

Overall there are 124 features used for the Recommendation system.

Try different distance measures / similarity functions.

In [None]:
def recommend_tracks_kernel(track: str, recommender_function, distance: bool = False):
    '''
    Recommends tracks that are similar to the provided track.

    Reads the notebook-level DataFrames by_isrc (track metadata) and ct_merged
    (feature matrix), both indexed by isrc. The six best-scoring rows are
    displayed; the query track is itself a row of ct_merged, so it is scored too.

    Parameter
    ---------
    track: str
        Name of the track to find recommendations for

    recommender_function: callable
        Pairwise function called as recommender_function(ct_merged, query_row)
        that returns one score per row of ct_merged,
        e.g. cosine_similarity or euclidean_distances

    distance: bool; default=False
        True if the scores are distances (smaller is better, sorted ascending),
        False if they are similarities (larger is better, sorted descending)

    Return
    ------
    kernel_df: pd.DataFrame or None
        Unsorted scores (column 'Score') joined with by_isrc. None if the
        track name is unknown or none of its isrcs has a row in ct_merged.
    '''
    matching_ids = by_isrc[by_isrc['name'] == track].index
    # A track found by name may have no row in ct_merged; an empty query row
    # would make the recommender function fail, so such isrcs are skipped.
    ids = [isrc for isrc in matching_ids if isrc in ct_merged.index]

    if not ids:
        print('Error')
        return None

    # Several tracks can share a name; the first one with features is used as query.
    query_row = ct_merged[ct_merged.index == ids[0]]
    kernel_array = recommender_function(ct_merged, query_row)
    kernel_df = pd.DataFrame(kernel_array, index=ct_merged.index)

    kernel_df = kernel_df.rename(columns={0: 'Score'})
    kernel_df = kernel_df.merge(by_isrc, how='left', on='isrc')
    display(kernel_df.sort_values(by='Score', ascending=distance).head(6))
    return kernel_df


In [None]:
def try_functions(track):
    '''
    Runs the recommender for one track with every similarity and distance function.

    Parameter
    ---------
    track: str
        Name of the track to find recommendations for

    Return
    ------
    result: dict
        Maps the name of each function to the value returned by
        recommend_tracks_kernel for that function.
    '''
    # Pairs of (function, scores are distances). Similarities first, then distances.
    runs = [
        (cosine_similarity, False),
        (euclidean_distances, True),
        (manhattan_distances, True),
    ]
    result = {}
    for function, is_distance in runs:
        function_name = function.__name__
        display(function_name)
        result[function_name] = recommend_tracks_kernel(track, function, is_distance)
    return result

In [None]:
by_isrc.head().T

In [None]:
# Compare all configured similarity/distance functions for one example track.
result = try_functions("I'm Still Standing")