### Step 1: Load the data

In [1]:
import pandas as pd
import numpy as np

# The raw export is a semicolon-separated CSV; the path is resolved relative
# to the directory the notebook is run from.
file_path = '../../../data/Spotify_Dataset_V3.csv'
spotify_data = pd.read_csv(filepath_or_buffer=file_path, sep=';')

### Step 2: Prepare the data



In [2]:

# Remove rows that are exact duplicates across every column, keeping the
# first occurrence of each. The frame is modified in place.
spotify_data.drop_duplicates(subset=None, keep='first', inplace=True)

In [3]:
# Parse the day/month/year strings into datetimes; values that do not match
# the format become NaT instead of raising.
spotify_data = spotify_data.assign(
    Date=pd.to_datetime(spotify_data['Date'], format='%d/%m/%Y', errors='coerce')
)

# Drop every row that has a missing value in ANY column -- this removes the
# rows whose date failed to parse, but also rows with gaps elsewhere.
spotify_data.dropna(axis=0, how='any', inplace=True)

In [4]:

# Audio descriptor columns whose value ranges we want to inspect before
# deciding which ones need rescaling.
audio_features = [
    'Danceability',
    'Energy',
    'Loudness',
    'Speechiness',
    'Acousticness',
    'Instrumentalness',
    'Valence',
]

# Print the observed minimum and maximum of each descriptor, one per line.
for column_name in audio_features:
    column = spotify_data[column_name]
    print(f"{column_name}: min={column.min()}, max={column.max()}")

Danceability: min=0.073, max=0.985
Energy: min=0.005, max=0.996
Loudness: min=-34475.0, max=1509.0
Speechiness: min=0.022, max=0.966
Acousticness: min=0.0, max=0.994
Instrumentalness: min=0.0, max=0.956
Valence: min=0.026, max=0.982


In [5]:
from sklearn.preprocessing import MinMaxScaler

# Only Loudness is rescaled here; the other descriptors are left as loaded.
# NOTE(review): the range printed for raw Loudness (-34475.0 to 1509.0) looks
# implausible -- presumably the decimal separator was lost upstream; verify
# the source values before relying on the scaled column.
audio_features_to_normalize = ['Loudness']

# Fit the min-max scaler on the selected columns, then overwrite them with
# their scaled values.
scaler = MinMaxScaler()
scaler.fit(spotify_data[audio_features_to_normalize])
spotify_data[audio_features_to_normalize] = scaler.transform(
    spotify_data[audio_features_to_normalize]
)


In [6]:
# Label-encode the categorical columns

from sklearn.preprocessing import LabelEncoder

# Columns that are replaced by integer codes.
categorical_features = ['Artist (Ind.)', 'Nationality', 'Continent']

# One fitted encoder per column is kept so the integer codes can be mapped
# back to the original labels later.
label_encoders = {}
for column_name in categorical_features:
    encoder = LabelEncoder().fit(spotify_data[column_name])
    spotify_data[column_name] = encoder.transform(spotify_data[column_name])
    label_encoders[column_name] = encoder


In [7]:
import json

# Persist each encoder's class list (position in the list == integer code)
# so the encoding can be reproduced or inverted outside this notebook.
encoders_dict = dict(
    (column_name, encoder.classes_.tolist())
    for column_name, encoder in label_encoders.items()
)

with open('label_encoders.json', 'w') as out_file:
    json.dump(encoders_dict, out_file)

### Step 3: Create the lag feature

In [8]:
from datetime import timedelta

# Number of chart entries (the current one included) combined into the lag
# feature.
window_size: int = 3

# Base of the geometric weighting: the entry at offset i is weighted by
# decay_factor ** i before normalisation.
decay_factor: float = 0.9

In [9]:
# Work on a slim, independent copy holding one row per distinct
# (id, Date, Points (Total)) combination.
simple_data = (
    spotify_data
    .loc[:, ['id', 'Date', 'Points (Total)']]
    .copy()
    .drop_duplicates()
)


In [10]:
# Geometric weights decay_factor**0, decay_factor**1, ... (one per window
# position), rescaled so that they sum to 1.
weights = np.fromiter(
    (decay_factor ** offset for offset in range(window_size)),
    dtype=float,
)
weights /= weights.sum()

In [11]:
# Lag_Points: for every (song, date) row, the weighted sum of that row's
# points and the points of the song's next older chart entries, using one
# entry of `weights` per window position (offset 0 = the row itself).
#
# The look-back is made explicit by ordering rows newest-first before
# shifting, instead of relying on the row order of the input file. The stable
# sort keeps the existing relative order of rows that share a date.
ordered = simple_data.sort_values('Date', ascending=False, kind='mergesort')
points_by_song = ordered.groupby('id', sort=False)['Points (Total)']

# Accumulate one shifted column per window position for all songs at once.
# shift(-i) pulls the i-th older entry of the same song; where a song has no
# such entry the contribution is 0, so songs with short history get a
# correspondingly smaller value.
# Note: offsets count chart entries, not calendar days -- gaps between a
# song's appearances are not accounted for.
simple_data['Lag_Points'] = 0.0
for i, weight in enumerate(weights):
    # The addition aligns on the row index, so the sorted order of `ordered`
    # does not disturb the row order of simple_data.
    simple_data['Lag_Points'] += weight * points_by_song.shift(-i).fillna(0)

simple_data[['id', 'Date', 'Points (Total)', 'Lag_Points']].head(10)

Unnamed: 0,id,Date,Points (Total),Lag_Points
0,3qQbCzHBycnDpGskqOWY0E,2023-05-29,200,200.0
2,7ro0hRteUMfnOioTFI5TG1,2023-05-29,199,198.070111
3,2UW7JaomAMuX9pZrjVpHAU,2023-05-29,198,198.630996
5,7FbrGaHYVDmfr7KoLIZnQ7,2023-05-29,197,196.369004
6,6pD0ufEQq0xdHSsRbg9LBK,2023-05-29,196,196.929889
8,4DHcnVTT87F0zZhRPYmZ3B,2023-05-29,195,195.0
9,1odExI7RdWc4BT515LTAwj,2023-05-29,194,192.738007
10,1Qrg8KqiBpW07V7PNxwwwL,2023-05-29,193,193.332103
11,4Dvkj6JhhA12EX05fT7y2e,2023-05-29,192,192.929889
12,0DWdj2oZMBFSzRsi2Cvfzf,2023-05-29,191,190.667897


In [12]:
# Attach the lag feature to the full dataset, matching on song id and date.
# A left join keeps every row of spotify_data.
#
# Any Lag_Points column left over from a previous run of this cell is dropped
# first; otherwise re-running the cell would produce Lag_Points_x /
# Lag_Points_y instead of a single Lag_Points column.
spotify_data = pd.merge(
    spotify_data.drop(columns=['Lag_Points'], errors='ignore'),
    simple_data[['id', 'Date', 'Lag_Points']],
    on=['id', 'Date'],
    how='left'
)

spotify_data

Unnamed: 0,Rank,Title,Artists,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,# of Artist,Artist (Ind.),# of Nationality,Nationality,Continent,Points (Total),Points (Ind for each Artist/Nat),id,Song URL,Lag_Points
0,1,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2023-05-29,0.668,0.758,0.814223,0.033,0.483,0.000,...,Artist 1,617,Nationality 1,41,4,200,100.0,3qQbCzHBycnDpGskqOWY0E,https://open.spotify.com/track/3qQbCzHBycnDpGs...,200.000000
1,1,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2023-05-29,0.668,0.758,0.814223,0.033,0.483,0.000,...,Artist 2,1558,Nationality 2,41,4,200,100.0,3qQbCzHBycnDpGskqOWY0E,https://open.spotify.com/track/3qQbCzHBycnDpGs...,200.000000
2,2,WHERE SHE GOES,Bad Bunny,2023-05-29,0.652,0.800,0.846376,0.061,0.143,0.629,...,Artist 1,183,Nationality 1,53,4,199,199.0,7ro0hRteUMfnOioTFI5TG1,https://open.spotify.com/track/7ro0hRteUMfnOio...,198.070111
3,3,La Bebe - Remix,"Yng Lvcas, Peso Pluma",2023-05-29,0.812,0.479,0.800272,0.333,0.213,0.000,...,Artist 1,2121,Nationality 1,41,4,198,99.0,2UW7JaomAMuX9pZrjVpHAU,https://open.spotify.com/track/2UW7JaomAMuX9pZ...,198.630996
4,3,La Bebe - Remix,"Yng Lvcas, Peso Pluma",2023-05-29,0.812,0.479,0.800272,0.333,0.213,0.000,...,Artist 2,1558,Nationality 2,41,4,198,99.0,2UW7JaomAMuX9pZrjVpHAU,https://open.spotify.com/track/2UW7JaomAMuX9pZ...,198.630996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651958,197,Oh Lord,MiC LOWRY,2017-01-01,0.273,0.794,0.773427,0.068,0.022,0.000,...,Artist 1,1352,Nationality 1,70,3,4,4.0,1sTUEdVO85YU8Ymk2jeAls,https://open.spotify.com/track/1sTUEdVO85YU8Ym...,1.476015
651959,198,Superstition - Single Version,Stevie Wonder,2017-01-01,0.650,0.658,0.654930,0.085,0.089,0.004,...,Artist 1,1841,Nationality 1,71,1,3,3.0,5lXcSvHRVjQJ3LB2rLKQog,https://open.spotify.com/track/5lXcSvHRVjQJ3LB...,1.107011
651960,199,Secrets,The Weeknd,2017-01-01,0.665,0.771,0.797466,0.053,0.016,0.000,...,Artist 1,1958,Nationality 1,11,1,2,2.0,3DX4Y0egvc0slLcLl31h2p,https://open.spotify.com/track/3DX4Y0egvc0slLc...,0.738007
651961,200,Ni**as In Paris,"JAY-Z, Kanye West",2017-01-01,0.757,0.882,0.787850,0.248,0.076,0.000,...,Artist 1,832,Nationality 1,71,1,1,1.0,2KpCpk6HjXXLb7nnXoXA5O,https://open.spotify.com/track/2KpCpk6HjXXLb7n...,0.369004


In [13]:
# Write the processed frame next to the notebook; the row index is omitted
# from the file.
spotify_data.to_csv(path_or_buf='Processed_Spotify_Dataset.csv', index=False)