In [2]:
import pandas as pd
data = pd.read_csv('../../data/Spotify_Dataset_V3.csv', delimiter=';')

# Ensure the date field is of datetime type, with the format DD/MM/YYYY
data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)

# First sort by date in descending order, then by Rank in ascending order
sorted_data = data.sort_values(by=['Date', 'Rank'], ascending=[False, True])
sorted_data = sorted_data.set_index('Date', drop=False)

# Save the sorted data to a new CSV file
sorted_data.to_csv('sorted.csv', index=False, sep=';')

data = sorted_data

In [3]:
# Select only the necessary columns for the artist data
artist_data = data[['Artist (Ind.)', 'Date', 'Points (Ind for each Artist/Nat)']]

# Resample the data by month (Month-End frequency), and sum the numeric values for each artist
artist_data = artist_data.groupby(['Artist (Ind.)']).resample('ME').sum(numeric_only=True)

# Fill any missing months with 0, and assign directly to the column
artist_data = artist_data.reset_index()

# Print the aggregated data for inspection
print(artist_data['Artist (Ind.)'])

# Save the aggregated data to a new CSV file
artist_data.to_csv('artist_monthly_scores.csv', sep=';', header=True, index=True)

0              $NOT
1              $NOT
2              $NOT
3              $NOT
4              $NOT
            ...    
40382    Ñengo Flow
40383    Ñengo Flow
40384    Ñengo Flow
40385    Ñengo Flow
40386    Ñengo Flow
Name: Artist (Ind.), Length: 40387, dtype: object


In [4]:
import pandas as pd
import pmdarima as pm
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Dictionary to store the forecasts of each artist
artist_forecasts = {}
final = {}
unique_artists = artist_data['Artist (Ind.)'].unique()

for artist in unique_artists:
    artist_ts = artist_data[artist_data['Artist (Ind.)'] == artist]
    artist_ts['Date'] = pd.to_datetime(artist_ts['Date'])  # Ensure Date is in datetime format
    artist_ts.set_index('Date', inplace=True)
    
    # Ensure each artist has at least 24 months of data
    if len(artist_ts) < 24:
        continue  # Skip artists with insufficient data points
    
    # Perform ADF test and differencing if necessary to ensure stationarity
    result = adfuller(artist_ts['Points (Ind for each Artist/Nat)'])
    if result[1] > 0.05:  # If not stationary, perform differencing
        artist_ts['Points (Ind for each Artist/Nat)'] = artist_ts['Points (Ind for each Artist/Nat)'].diff().dropna()
    
    # Fill any remaining NaN values after differencing (this part can be improved further)
    artist_ts['Points (Ind for each Artist/Nat)'] = artist_ts['Points (Ind for each Artist/Nat)'].fillna(0)

    if artist_ts['Points (Ind for each Artist/Nat)'].isnull().sum() > 0:
        continue  # Skip artists with remaining missing values after processing
    
    # Fit ARIMA model
    try:
        model = pm.auto_arima(artist_ts['Points (Ind for each Artist/Nat)'], seasonal=False, stepwise=True, trace=False, error_action="ignore")
        forecast_values = model.predict(n_periods=7)  # Predict the next 7 months
        artist_forecasts[artist] = forecast_values
        final[artist] = forecast_values[6]  # Get the forecast value for the 7th month (January 2024)
        
    except ValueError as e:
        print(f"Error fitting model for artist {artist}: {e}")
        continue  # Skip artists that cause errors during model fitting

# Create a DataFrame for the 7th month forecasts of each artist
ranked_artists = pd.DataFrame.from_dict(final, orient='index', columns=['Forecast_7_Months'])

# Sort the artists based on their 7th month forecast scores
ranked_artists = ranked_artists.sort_values(by='Forecast_7_Months', ascending=False)

# Output the rankings
print(ranked_artists)

# Save the results to a CSV file
ranked_artists.to_csv('artist_forecasts_2024.csv', index=True)

# Visualize the top 10 artists based on their forecast scores
top_10_artists = ranked_artists.head(10)
top_10_artists.plot(kind='bar', figsize=(10, 6), legend=False)
plt.title('Top 10 Artists Predicted for January 2024')
plt.xlabel('Artist')
plt.ylabel('Predicted Score for January 2024')
plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_ts['Date'] = pd.to_datetime(artist_ts['Date'])  # Ensure Date is in datetime format
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_ts['Date'] = pd.to_datetime(artist_ts['Date'])  # Ensure Date is in datetime format
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_ts['Date'] = pd.

KeyboardInterrupt: 

In [None]:
import pandas as pd
import pmdarima as pm
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Store the forecast results and error metrics for each artist
artist_forecasts = {}
final = {}
error_metrics = []  # To store error metrics

unique_artists = artist_data['Artist (Ind.)'].unique()

# Check for stationarity, and difference if non-stationary
# Only select artists with at least 2 years (24 months) of data
for artist in unique_artists:
    # Get the time series data for the artist
    artist_ts = artist_data[artist_data['Artist (Ind.)'] == artist]
    
    # Ensure 'Date' column is of datetime type
    artist_ts.loc[:, 'Date'] = pd.to_datetime(artist_ts['Date'])
    artist_ts.set_index('Date', inplace=True)
    
    # Ensure the artist has at least 24 months of data
    if len(artist_ts) < 24:
        continue    
    
    # Split into training and testing sets
    train_size = int(len(artist_ts) * 0.8)  # 80% for training, 20% for testing
    train_data, test_data = artist_ts[:train_size], artist_ts[train_size:]

    # Perform ADF test and difference to ensure stationarity
    result = adfuller(train_data['Points (Ind for each Artist/Nat)'])
    if result[1] > 0.05:  # If non-stationary, differencing is applied
        train_data.loc[:, 'Points (Ind for each Artist/Nat)'] = train_data['Points (Ind for each Artist/Nat)'].diff().dropna()
    train_data.loc[:, 'Points (Ind for each Artist/Nat)'] = train_data['Points (Ind for each Artist/Nat)'].fillna(0)
    
    # If there are missing values after differencing, skip the artist
    if train_data['Points (Ind for each Artist/Nat)'].isnull().sum() > 0:
        print(f"Artist {artist} has missing values after differencing, skipping this artist.")
        continue
    
    # Fit the ARIMA model
    try:
        model = pm.auto_arima(train_data['Points (Ind for each Artist/Nat)'], seasonal=False, stepwise=True, trace=False, error_action="ignore")
        
        # Fit the model with training data
        model.fit(train_data['Points (Ind for each Artist/Nat)'])
        
        # Forecast on the test set
        forecast = model.predict(n_periods=len(test_data))
        
        # Store forecast results
        artist_forecasts[artist] = forecast
        final[artist] = forecast[-1]  # The last forecast value is the prediction for the 7th month
        
        # Calculate error metrics
        mse = mean_squared_error(test_data['Points (Ind for each Artist/Nat)'], forecast)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(test_data['Points (Ind for each Artist/Nat)'], forecast)
        r2 = r2_score(test_data['Points (Ind for each Artist/Nat)'], forecast)

        # Store error metrics in the list
        error_metrics.append({
            'Artist': artist,
            'MSE': mse,
            'RMSE': rmse,
            'MAE': mae,
            'R2': r2
        })
        
        # Visualization of forecast and actual values
        
        
    except ValueError as e:
        print(f"Error while fitting model: {e}, skipping artist {artist}")
        continue

# Save all error metrics into a DataFrame
error_metrics_df = pd.DataFrame(error_metrics)

# Save the error metrics to a CSV file
error_metrics_df.to_csv('artist_error_metrics.csv', index=False, sep=';', header=True)

# Output the error metrics DataFrame
print(error_metrics_df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Assuming ranked_artists is a DataFrame containing the predicted results for the top 5 artists
top_10_artists = ranked_artists.head(5)

# Create a figure
plt.figure(figsize=(14, 8))

# Set different colors, each artist gets a different color
colors = plt.cm.get_cmap('tab10', len(top_10_artists))  # Use colormap to get multiple colors

# Iterate over each Top 10 artist, plot their time series trends, and annotate the predicted values for the next 7 months
for i, artist in enumerate(top_10_artists.index):
    # Extract the data for this artist
    artist_ts = artist_data[artist_data['Artist (Ind.)'] == artist]
    artist_ts.set_index('Date', inplace=True)  # Set 'Date' as the index
    artist_ts = artist_ts.resample('M').sum()  # Aggregate by month
    
    # Plot historical data (using blue for historical data)
    plt.plot(artist_ts.index, artist_ts['Points (Ind for each Artist/Nat)'], 
             label=f'{artist} - Historical', color=colors(i), linestyle='-', linewidth=2)
    
    # Get the predicted values for the next 7 months
    forecast_values = artist_forecasts[artist]
    forecast_dates = [artist_ts.index[-1] + pd.DateOffset(months=i+1) for i in range(7)]  # Dates for the next 7 months
    
    # Plot forecast data (using red for forecast data)
    plt.plot(forecast_dates, forecast_values, 
             label=f'{artist} - Forecast', color=colors(i), linestyle='--', linewidth=2)

# Set title and labels
plt.title('Top 5 Artists Predicted for 2024 - Historical and Forecasted Trends', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Points (Ind for each Artist/Nat)', fontsize=12)

# Display the legend
plt.legend(loc='upper left')

# Show the plot
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [16]:
error_metrics_path = 'artist_error_metrics.csv'
error_metrics_df = pd.read_csv(error_metrics_path, delimiter=';')

# Display the first few rows to check its structure
error_metrics_df.head()    
print(error_metrics_df.info())
print(error_metrics_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630 entries, 0 to 629
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Artist  630 non-null    object 
 1   MSE     630 non-null    float64
 2   RMSE    630 non-null    float64
 3   MAE     630 non-null    float64
 4   R2      630 non-null    float64
dtypes: float64(4), object(1)
memory usage: 24.7+ KB
None
                MSE          RMSE           MAE             R2
count  6.300000e+02    630.000000    630.000000     630.000000
mean   9.106380e+06   1271.365214    974.599840    -308.852700
std    6.380577e+07   2738.963087   2250.495710    5594.642566
min    2.857143e-01      0.534522      0.142857 -137084.302424
25%    4.214263e+03     64.917351     33.214344      -1.371272
50%    1.556809e+05    394.563668    210.369048      -0.345411
75%    1.837383e+06   1355.500738    933.405698      -0.142857
max    1.272387e+09  35670.529767  24682.299778       0.695607
