In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator
from matplotlib.colors import ListedColormap
import seaborn as sns
import os
import warnings
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, silhouette_score, davies_bouldin_score, f1_score, roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
from enum import Enum
from kneed import KneeLocator
warnings.filterwarnings("ignore")

In [None]:
# Load the merged Spotify chart data; the path is relative to the notebook's working directory.
song_data = pd.read_csv('datasets/merged_spotify_dataset.csv')

# Keep the raw frame untouched and do all further processing on a copy.
spotify_song_data = song_data.copy()

# info() prints its report itself, whereas describe() only renders when it is
# the final expression of the cell, so it has to come last.
song_data.info()
song_data.describe()

In [None]:
# Parse the chart date (day/month/year) and derive the calendar month used for grouping.
spotify_song_data['Date'] = pd.to_datetime(spotify_song_data['Date'], format='%d/%m/%Y')
spotify_song_data['Data_Month'] = spotify_song_data['Date'].dt.to_period('M')

# Mean of 'Points (Total)' for every (song id, month) pair.
average_monthly_points = (
    spotify_song_data
    .groupby(['id', 'Data_Month'])['Points (Total)']
    .mean()
    .reset_index()
    .rename(columns={'Points (Total)': 'Average_Points'})
)

# Attach the monthly average to every chart row, then remove the per-day
# columns and any rows that became exact duplicates as a result.
columns_to_drop = ['Points (Total)', 'Points (Ind for each Artist/Nat)', 'Date', 'Rank']
monthly_data = (
    pd.merge(spotify_song_data, average_monthly_points, on=['id', 'Data_Month'], how='left')
    .drop(columns=columns_to_drop, errors='ignore')
    .drop_duplicates()
)

# Keep one row per (song id, month) and require a value in every column except 'genres'.
monthly_unique_songs = monthly_data.drop_duplicates(subset=['id', 'Data_Month'])
required_columns = monthly_unique_songs.columns.difference(['genres'])
# Explicit copy: later cells add columns to this frame, so it must not be a
# view-like slice of monthly_data.
monthly_unique_songs = monthly_unique_songs.dropna(subset=required_columns).copy()

monthly_unique_songs.head(500)

In [None]:
# Mean of Average_Points for each distinct value of the 'Artists' column.
# NOTE(review): the mean is taken over every month present in the frame,
# including months that may later be held out for evaluation - confirm this is intended.
artist_average_points = monthly_unique_songs.groupby('Artists')['Average_Points'].agg('mean')

# Look up each row's artist mean and store it as a new feature column.
artist_labels = monthly_unique_songs['Artists']
monthly_unique_songs['Artist_Average_Points'] = artist_labels.map(artist_average_points)

monthly_unique_songs.head()

In [None]:
# Broad genre labels in priority order: the first label found in a song's genres wins.
main_genres = ['pop', 'rock', 'hip hop', 'rap', 'r&b', 'country', 'jazz', 'classical', 'electronic', 'dance', 'latin', 'reggae', 'blues', 'soul', 'funk', 'metal', 'punk', 'folk', 'world', 'indie', 'corrido']

def categorize_genre(sub_genres):
    """Map a comma-separated sub-genre string to a single broad genre.

    The value is converted with ``str()`` and lower-cased, then searched for
    each entry of ``main_genres`` in list order; the first entry that occurs
    as a substring is returned. Matching is by substring, so e.g. 'trap'
    matches 'rap'.

    Parameters
    ----------
    sub_genres : object
        Usually a comma-separated string of sub-genres; any other value
        (such as NaN) is stringified first.

    Returns
    -------
    str
        The matching entry of ``main_genres``, or 'other' when none matches.
    """
    genre_text = str(sub_genres).lower()
    # No main genre contains a comma, so a hit anywhere in the whole string is
    # always a hit inside exactly one comma-separated sub-genre.
    return next((label for label in main_genres if label in genre_text), 'other')

# Derive one broad genre label per row from the raw 'genres' value.
general_genre = monthly_unique_songs['genres'].map(categorize_genre)
monthly_unique_songs['General_Genre'] = general_genre

# Columns that are not used from here on; names missing from the frame are skipped.
columns_to_drop = [
    '# of Artist', 'Artist (Ind.)', '# of Nationality', 'Nationality',
    'Points (Ind for each Artist/Nat)', 'Points (Total)', 'Song URL', 'genres',
]
monthly_unique_songs = monthly_unique_songs.drop(columns=columns_to_drop, errors='ignore')

monthly_unique_songs.head(500)

In [None]:
# Threshold for the binary target: the mean of 'popularity' over every row of the frame.
mean_popularity = monthly_unique_songs['popularity'].mean()

# 1 when a row's popularity is at or above the mean, 0 otherwise.
at_or_above_mean = monthly_unique_songs['popularity'].ge(mean_popularity)
monthly_unique_songs['is_popular'] = at_or_above_mean.astype(int)
mean_popularity

In [None]:
from textblob import TextBlob


def title_polarity(title):
    """Return TextBlob's sentiment polarity score for one title string."""
    return TextBlob(title).sentiment.polarity


# Numeric polarity score of every song title.
monthly_unique_songs['sentiment'] = monthly_unique_songs['Title'].map(title_polarity)

# Bucket the score into three labels; intervals are closed on the right, so the
# neutral band is (-0.01, 0.01].
sentiment_bins = [-np.inf, -.01, .01, np.inf]
sentiment_labels = ['negative', 'neutral', 'positive']
cut = pd.cut(monthly_unique_songs['sentiment'], bins=sentiment_bins, labels=sentiment_labels)

monthly_unique_songs['polarity'] = cut.values

monthly_unique_songs[['polarity', 'sentiment']].head()

In [None]:
# Source column -> prefix of the generated indicator columns, in the order
# the indicator blocks are appended to the frame.
one_hot_sources = {
    'Continent': 'Continent',
    'key': 'Key',
    'mode': 'Mode',
    'time_signature': 'Time_Signature',
    'polarity': 'Polarity',
    'General_Genre': 'General_Genre',
}

# One indicator frame per categorical column.
one_hot_frames = [
    pd.get_dummies(monthly_unique_songs[source], prefix=prefix)
    for source, prefix in one_hot_sources.items()
]
(one_hot_continent, one_hot_key, one_hot_mode,
 one_hot_time_signature, one_hot_polarity, one_hot_genre) = one_hot_frames

# Append all indicator columns to the right of the existing columns.
monthly_unique_songs = pd.concat([monthly_unique_songs, *one_hot_frames], axis=1)
monthly_unique_songs.head(500)

In [None]:
# Disabled alternative: predict popularity with a random split of this dataset (75% train, 25% test)
# from sklearn.model_selection import train_test_split

# # Selecting the features for the model
# feature_columns = ['Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 
#                            'Instrumentalness', 'Valence', 'tempo', 'Artist_Average_Points', 'duration_ms'] + \
#                   [col for col in monthly_unique_songs.columns if col.startswith('Key_')] + \
#                   [col for col in monthly_unique_songs.columns if col.startswith('Continent_')] + \
#                   [col for col in monthly_unique_songs.columns if col.startswith('Mode_')] + \
#                   [col for col in monthly_unique_songs.columns if col.startswith('Time_Signature_')] + \
#                   [col for col in monthly_unique_songs.columns if col.startswith('Polarity_')] + \
#                   [col for col in monthly_unique_songs.columns if col.startswith('General_Genre_')]


# target_column = 'is_popular'


# X = monthly_unique_songs[feature_columns]
# y = monthly_unique_songs[target_column]

# X = X.fillna(X.mean())

# # Splitting the data into training and testing sets (75% training, 25% testing)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:

# Use the musical features of the 2017-2022 chart months to predict which songs are popular in 2023.

# Data_Month is created as a monthly Period; convert it to a timestamp only
# while it still is one, so that re-running this cell does not raise.
if isinstance(monthly_unique_songs['Data_Month'].dtype, pd.PeriodDtype):
    monthly_unique_songs['Data_Month'] = monthly_unique_songs['Data_Month'].dt.to_timestamp()

# Training window is [2017-01, 2023-01); everything from 2023-01 onwards is the test set.
train_start = pd.Timestamp('2017-01-01')
test_start = pd.Timestamp('2023-01-01')
data_month = monthly_unique_songs['Data_Month']
train_data = monthly_unique_songs[(data_month >= train_start) & (data_month < test_start)]
test_data = monthly_unique_songs[data_month >= test_start]

target_column = 'is_popular'

# Model inputs: numeric columns first, then every one-hot indicator column,
# grouped by prefix in the order listed here.
base_features = ['Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness',
                 'Instrumentalness', 'Valence', 'tempo', 'Artist_Average_Points', 'duration_ms']
one_hot_prefixes = ['Key_', 'Continent_', 'Mode_', 'Time_Signature_', 'Polarity_', 'General_Genre_']
feature_columns = base_features + [
    col
    for prefix in one_hot_prefixes
    for col in monthly_unique_songs.columns
    if col.startswith(prefix)
]

# Impute missing feature values with statistics learned from the training
# period only; the test set must not contribute to its own imputation values.
train_feature_means = train_data[feature_columns].mean()

X_train = train_data[feature_columns].fillna(train_feature_means)
y_train = train_data[target_column]

X_test = test_data[feature_columns].fillna(train_feature_means)
y_test = test_data[target_column]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# SVC is not scale invariant, and the feature set mixes large-valued columns
# (e.g. duration_ms, tempo) with one-hot indicator columns. Standardise inside
# a pipeline so the scaler is fitted on the training data only and is applied
# automatically on every predict() call.
svm_model = make_pipeline(StandardScaler(), SVC(random_state=42))
svm_model.fit(X_train, y_train)

# Class predictions for the held-out period.
y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

accuracy, conf_matrix, class_report

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Random forest with a fixed seed; fit() returns the fitted estimator itself.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard class labels and the predicted probability of the second class (column 1).
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Evaluation on the held-out period.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
class_report_rf = classification_report(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_prob_rf)

accuracy_rf, conf_matrix_rf, class_report_rf, roc_auc_rf


In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot

# Gradient-boosted classifier with library defaults, fitted on the training period.
model = XGBClassifier()
model.fit(X_train, y_train)

# Draw the feature-importance chart on a 20x15 inch figure.
fig, ax = pyplot.subplots(figsize=(20, 15))
plot_importance(model, ax=ax)
pyplot.show()

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed and a raised iteration cap; fit() returns the estimator.
log_reg_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Hard class labels and the predicted probability of the second class (column 1).
y_pred_log_reg = log_reg_model.predict(X_test)
y_prob_log_reg = log_reg_model.predict_proba(X_test)[:, 1]

# Evaluation on the held-out period.
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
class_report_log_reg = classification_report(y_test, y_pred_log_reg)
roc_auc_log_reg = roc_auc_score(y_test, y_prob_log_reg)

accuracy_log_reg, conf_matrix_log_reg, class_report_log_reg, roc_auc_log_reg

In [None]:
import matplotlib.pyplot as plt

# Fit a second logistic regression on only the first 1000 training rows.
log_reg_model_simplified = LogisticRegression(random_state=42, max_iter=1000)
log_reg_model_simplified.fit(X_train.head(1000), y_train.head(1000))  # Using a subset of the data

# Score the first 1000 test rows and build the ROC curve from those scores.
X_test_subset = X_test.head(1000)
y_test_subset = y_test.head(1000)
y_prob_simplified = log_reg_model_simplified.predict_proba(X_test_subset)[:, 1]
fpr_simplified, tpr_simplified, thresholds_simplified = roc_curve(y_test_subset, y_prob_simplified)
roc_auc_simplified = auc(fpr_simplified, tpr_simplified)

# ROC curve in blue, with the diagonal as a dashed red reference line.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr_simplified, tpr_simplified, color='blue', lw=2,
            label='ROC curve (area = %0.2f)' % roc_auc_simplified)
roc_ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc="lower right")
plt.show()