In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import os
import warnings
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from enum import Enum
warnings.filterwarnings("ignore")

In [None]:
# Load the raw Spotify chart export; the file uses ';' as its field separator.
song_data = pd.read_csv('datasets/Spotify_Dataset_V3.csv', delimiter=';')

# Work on a copy so the raw frame stays available unchanged.
spotify_song_data = song_data.copy()

# info() prints its summary directly. describe() only returns a frame, and
# Jupyter renders just the last expression of a cell, so it has to come last
# (previously it sat mid-cell and its result was silently discarded).
song_data.info()
song_data.describe()

In [None]:
# Parse the chart date (day/month/year) and derive the month period it falls in.
spotify_song_data['Date'] = pd.to_datetime(spotify_song_data['Date'], format='%d/%m/%Y')
spotify_song_data['Data_Month'] = spotify_song_data['Date'].dt.to_period('M')

# Evaluated but not rendered: only the final expression of a cell is displayed.
spotify_song_data.head()

# Mean of 'Points (Total)' for every (id, month) pair.
average_monthly_points = (
    spotify_song_data
    .groupby(['id', 'Data_Month'])['Points (Total)']
    .mean()
    .reset_index()
    .rename(columns={'Points (Total)': 'Average_Points'})
)

# Attach the monthly average to every row, then drop the per-day columns so
# rows that differed only by day collapse under drop_duplicates().
columns_to_drop = ['Points (Total)', 'Points (Ind for each Artist/Nat)', 'Date', 'Rank']
monthly_data = (
    spotify_song_data
    .merge(average_monthly_points, on=['id', 'Data_Month'], how='left')
    .drop(columns=columns_to_drop, errors='ignore')
    .drop_duplicates()
)

monthly_data.head(500)

In [None]:
# Compute the correlation matrix over numeric (and boolean) columns only.
# monthly_data also holds non-numeric columns (e.g. the 'Data_Month' period and
# 'Artist (Ind.)'); since pandas 2.0 DataFrame.corr() no longer drops those
# silently and raises instead, so select the numeric columns explicitly.
numeric_monthly_data = monthly_data.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_monthly_data.corr()

# Drop the first row and the last column: after masking the upper triangle
# they would be entirely empty.
corr_matrix = corr_matrix.iloc[1:, :-1]

# Hide everything strictly above the diagonal of the trimmed matrix.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

plt.figure(figsize=(12, 8))

cmap = sns.diverging_palette(230, 20, as_cmap=True)

sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True)

plt.title("Numeric Feature Correlation Heatmap", fontsize=15)
plt.show()

In [None]:
# Extracting year from the 'Data_Month' column
monthly_data['Year'] = monthly_data['Data_Month'].dt.year

# Mean of the monthly averages per artist and year (long format).
yearly_avg_points = (
    monthly_data
    .groupby(['Artist (Ind.)', 'Year'])['Average_Points']
    .mean()
    .reset_index()
)

# Wide format: one row per artist, one column per year.
yearly_avg_points_pivot = yearly_avg_points.pivot_table(
    index='Artist (Ind.)',
    columns='Year',
    values='Average_Points',
).reset_index()

# Year-over-year percentage growth for 2018..2023, each relative to the year before.
for year in range(2017, 2023):
    previous_avg = yearly_avg_points_pivot[year]
    current_avg = yearly_avg_points_pivot[year + 1]
    yearly_avg_points_pivot[f'Growth_{year+1}'] = current_avg.sub(previous_avg).div(previous_avg).mul(100)

# Missing years and undefined growth values become 0.
yearly_avg_points_pivot = yearly_avg_points_pivot.fillna(0)
yearly_avg_points_pivot.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map every artist name to an integer code.
label_encoder = LabelEncoder()
artist_names = yearly_avg_points_pivot['Artist (Ind.)']
yearly_avg_points_pivot['Artist_Encoded'] = label_encoder.fit_transform(artist_names)

# Preview the name -> code mapping.
yearly_avg_points_pivot.loc[:, ['Artist (Ind.)', 'Artist_Encoded']].head()

In [None]:
# An artist counts as "popular" in a year when their yearly average exceeds
# the 75th percentile of that year's column.
threshold_75th_percentile = yearly_avg_points_pivot[2023].quantile(0.75)
yearly_avg_points_pivot['Popular'] = (
    yearly_avg_points_pivot[2023].gt(threshold_75th_percentile).astype(int)
)
# Evaluated but not rendered: only the final expression of a cell is displayed.
yearly_avg_points_pivot[['Artist (Ind.)', 2023, 'Popular']].head()

# Same rule applied to 2021 and 2022.
threshold_75th_percentile_2021 = yearly_avg_points_pivot[2021].quantile(0.75)
threshold_75th_percentile_2022 = yearly_avg_points_pivot[2022].quantile(0.75)
yearly_avg_points_pivot['Popular_2021'] = (
    yearly_avg_points_pivot[2021].gt(threshold_75th_percentile_2021).astype(int)
)
yearly_avg_points_pivot['Popular_2022'] = (
    yearly_avg_points_pivot[2022].gt(threshold_75th_percentile_2022).astype(int)
)

preview_columns = ['Artist (Ind.)', 2021, 'Popular_2021', 2022, 'Popular_2022']
yearly_avg_points_pivot[preview_columns].head()

In [None]:
# Restrict to rows from 2022.
data_2022 = monthly_data.loc[monthly_data['Year'] == 2022]

# Mean points per artist and month, ordered chronologically within each artist.
monthly_avg_points_2022 = (
    data_2022
    .groupby(['Artist (Ind.)', 'Data_Month'])['Average_Points']
    .mean()
    .reset_index()
    .sort_values(by=['Artist (Ind.)', 'Data_Month'])
)

# Month-over-month percentage change per artist; an artist's first month has
# no predecessor, so its NaN is replaced by 0.
month_over_month = monthly_avg_points_2022.groupby('Artist (Ind.)')['Average_Points'].pct_change()
monthly_avg_points_2022['Monthly_Growth_Rate'] = month_over_month.fillna(0) * 100

# Average of the monthly growth rates per artist.
avg_monthly_growth_rate_2022 = (
    monthly_avg_points_2022
    .groupby('Artist (Ind.)')['Monthly_Growth_Rate']
    .mean()
    .reset_index()
)

avg_monthly_growth_rate_2022.head()

In [None]:
from sklearn.model_selection import train_test_split

# Attach the 'Popular' flag to each artist's average 2022 monthly growth rate.
merged_data = avg_monthly_growth_rate_2022.merge(
    yearly_avg_points_pivot[['Artist (Ind.)', 'Popular']],
    on='Artist (Ind.)',
    how='left',
)

# Artists without a match in the pivot table are labelled 0.
# The result is assigned back rather than calling
# merged_data['Popular'].fillna(0, inplace=True): that chained in-place call
# acts on a temporary Series and, under pandas Copy-on-Write, leaves
# merged_data unchanged, so NaN labels would reach the classifier.
merged_data['Popular'] = merged_data['Popular'].fillna(0)

# Single feature (average monthly growth rate) and binary target.
X = merged_data[['Monthly_Growth_Rate']]
y = merged_data['Popular']

# 70/30 split with a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train.shape, X_test.shape

In [None]:
# Calculating the total average points for each artist in 2022
total_avg_points_2022 = (
    data_2022
    .groupby('Artist (Ind.)')['Average_Points']
    .mean()
    .reset_index()
    .rename(columns={'Average_Points': 'Total_Avg_Points_2022'})
)

# Drop any 'Total_Avg_Points_2022' left over from a previous run of this cell
# before merging. Without this, re-running the cell merges the column a second
# time, producing '_x'/'_y' suffixed columns and a KeyError on the line below.
merged_data = (
    merged_data
    .drop(columns='Total_Avg_Points_2022', errors='ignore')
    .merge(total_avg_points_2022, on='Artist (Ind.)', how='left')
)

# Assign the filled column back instead of a chained fillna(..., inplace=True),
# which does not update merged_data under pandas Copy-on-Write.
merged_data['Total_Avg_Points_2022'] = merged_data['Total_Avg_Points_2022'].fillna(0)

merged_data.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Fit a logistic regression on the training split (fit() returns the estimator itself).
log_reg = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred = log_reg.predict(X_test)

In [None]:
# Evaluate the held-out predictions: text report, confusion matrix and accuracy.
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Show all three as the cell output.
(accuracy, conf_matrix, class_report)

In [None]:
# Rebuild the 2022 per-artist monthly growth tables from monthly_data.
in_2022 = monthly_data['Year'] == 2022
data_2022 = monthly_data[in_2022]

# Average points per (artist, month), sorted so months are in order within each artist.
artist_month_keys = ['Artist (Ind.)', 'Data_Month']
monthly_avg_points_2022 = data_2022.groupby(artist_month_keys)['Average_Points'].mean().reset_index()
monthly_avg_points_2022 = monthly_avg_points_2022.sort_values(by=artist_month_keys)

# Percentage change versus the previous month of the same artist; the first
# month of every artist has nothing to compare against and is set to 0.
points_by_artist = monthly_avg_points_2022.groupby('Artist (Ind.)')['Average_Points']
monthly_avg_points_2022['Monthly_Growth_Rate'] = points_by_artist.pct_change().fillna(0).mul(100)

# One row per artist: the mean of their monthly growth rates.
growth_by_artist = monthly_avg_points_2022.groupby('Artist (Ind.)')['Monthly_Growth_Rate']
avg_monthly_growth_rate_2022 = growth_by_artist.mean().reset_index()

avg_monthly_growth_rate_2022.head()

In [None]:
# Second model: predict 'Popular' from the 2022 average points alone.
y_new = merged_data['Popular']
X_new = merged_data[['Total_Avg_Points_2022']]

# Same 70/30 split and seed as the first model.
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new,
    y_new,
    test_size=0.3,
    random_state=42,
)

# Fit and predict on the held-out split (fit() returns the estimator itself).
log_reg_new = LogisticRegression(random_state=42).fit(X_train_new, y_train_new)
y_pred_new = log_reg_new.predict(X_test_new)

# Evaluation on the held-out split.
class_report_new = classification_report(y_test_new, y_pred_new)
conf_matrix_new = confusion_matrix(y_test_new, y_pred_new)
accuracy_new = accuracy_score(y_test_new, y_pred_new)

(accuracy_new, conf_matrix_new, class_report_new)

In [None]:
# Distinct artist names appearing in the 2022 and the 2023 rows, respectively.
artists_2022 = set(data_2022['Artist (Ind.)'].unique())
in_2023 = monthly_data['Year'] == 2023
artists_2023 = set(monthly_data.loc[in_2023, 'Artist (Ind.)'].unique())

In [None]:
# Artists present in 2022 that are absent from the 2023 rows, and how many there are.
artists_only_in_2022 = artists_2022.difference(artists_2023)
num_artists_only_in_2022 = len(artists_only_in_2022)

num_artists_only_in_2022





In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Keep only artists that appear in both the 2022 and the 2023 rows.
artists_both_years = artists_2022 & artists_2023
in_both_years = monthly_data['Artist (Ind.)'].isin(artists_both_years)
data_both_years = monthly_data.loc[in_both_years]

# Mean points per artist and year for those artists.
yearly_avg_points_both_years = (
    data_both_years
    .groupby(['Artist (Ind.)', 'Year'])['Average_Points']
    .mean()
    .reset_index()
)

# Integer code for every artist name.
encoder = LabelEncoder()
yearly_avg_points_both_years['Artist_Code'] = encoder.fit_transform(
    yearly_avg_points_both_years['Artist (Ind.)'])

# Label: 1 for 2022 rows whose average exceeds the 75th percentile of the
# 2022 averages, 0 for every other row.
is_2022_row = yearly_avg_points_both_years['Year'] == 2022
threshold_2022_both_years = (
    yearly_avg_points_both_years.loc[is_2022_row, 'Average_Points'].quantile(0.75)
)
above_threshold = yearly_avg_points_both_years['Average_Points'] > threshold_2022_both_years
yearly_avg_points_both_years['Popular'] = np.where(is_2022_row & above_threshold, 1, 0)

# Add the per-artist 2022 growth rate and 2022 average points; inner joins
# drop artists missing from either table.
data_for_model_both_years = yearly_avg_points_both_years.merge(
    avg_monthly_growth_rate_2022, on='Artist (Ind.)', how='inner')
data_for_model_both_years = data_for_model_both_years.merge(
    total_avg_points_2022, on='Artist (Ind.)', how='inner')

# Only the 2022 rows are used for modelling.
data_train_both_years = data_for_model_both_years[data_for_model_both_years['Year'] == 2022]

# NOTE(review): 'Popular' is thresholded on the 2022 'Average_Points', and
# 'Total_Avg_Points_2022' looks like the same per-artist 2022 mean, so the
# target may be leaking into the features - confirm before trusting the scores.
X_final_both_years = data_train_both_years[['Artist_Code', 'Monthly_Growth_Rate', 'Total_Avg_Points_2022']]
y_final_both_years = data_train_both_years['Popular']

# Stratified 70/30 split so both classes keep their proportions.
X_train_final_both, X_test_final_both, y_train_final_both, y_test_final_both = train_test_split(
    X_final_both_years,
    y_final_both_years,
    test_size=0.3,
    random_state=42,
    stratify=y_final_both_years,
)

# Fit with a higher iteration cap, then predict the held-out split.
log_reg_final_both = LogisticRegression(random_state=42, max_iter=1000)
log_reg_final_both.fit(X_train_final_both, y_train_final_both)
y_pred_final_both = log_reg_final_both.predict(X_test_final_both)

# Evaluation on the held-out split.
class_report_final_both = classification_report(y_test_final_both, y_pred_final_both)
conf_matrix_final_both = confusion_matrix(y_test_final_both, y_pred_final_both)
accuracy_final_both = accuracy_score(y_test_final_both, y_pred_final_both)

(accuracy_final_both, conf_matrix_final_both, class_report_final_both)