In [146]:
'''
IMPORTS
'''

#Imports the pandas library to read and process data tables
import pandas as pd

#Imports numpy library to perform mathematical operations on Arrays
import numpy as np

#Imports StandardScaler from sklearn.preprocessing to normalize (scale) the features
from sklearn.preprocessing import StandardScaler

#Imports the train_test_split function from sklearn.model_selection to split the dataset
from sklearn.model_selection import train_test_split

#Imports the Decision Tree Classifier from sklearn tree
from sklearn.tree import DecisionTreeClassifier

#Imports the accuracy_score function from sklearn.metrics to measure accuracy between actual and predicted labels
from sklearn.metrics import accuracy_score

In [178]:
#Loads the list-of-songs dataset from a local CSV file into a pandas DataFrame.
#Bytes that cannot be decoded as UTF-8 are dropped silently (encoding_errors='ignore').
#NOTE(review): if the file is not really UTF-8, this strips characters from text
#columns such as track and artist names -- confirm the file's actual encoding.
df = pd.read_csv(
    'Most Streamed Spotify Songs 2024.csv',
    encoding='utf-8',
    encoding_errors='ignore',
)

In [180]:
#Shows the first five rows of the dataset as a quick sanity check
df.head()

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,...,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725.4,390470936,30716,196631588,...,684,62.0,17598718,114.0,18004655,22931,4818457.0,2669262,,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545.9,323703884,28113,174597137,...,3,67.0,10422430,111.0,7780028,28444,6623075.0,1118279,,1
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,QZJ842400387,3,538.4,601309283,54331,211607669,...,536,136.0,36321847,172.0,5022621,5639,7208651.0,5285340,,0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,USSM12209777,4,444.9,2031280633,269802,136569078,...,2182,264.0,24684248,210.0,190260277,203384,,11822942,,0
4,Houdini,Houdini,Eminem,5/31/2024,USUG12403398,5,423.3,107034922,7223,151469874,...,1,82.0,17660624,105.0,4493884,7006,207179.0,457017,,1


In [204]:
#Several count columns are stored as text with thousands separators (e.g. '1,234'),
#so the commas are removed and the values converted to real numbers before the data
#is used for training and testing

change_col = ['Spotify Streams', 'Spotify Playlist Count', 'Spotify Playlist Reach', 
              'YouTube Views', 'YouTube Likes', 'TikTok Posts', 'TikTok Likes', 
              'TikTok Views', 'YouTube Playlist Reach', 'AirPlay Spins', 'SiriusXM Spins',
              'Deezer Playlist Reach', 'Pandora Streams', 'Pandora Track Stations', 
              'Soundcloud Streams', 'Shazam Counts']

for column_name in change_col:
    column = df[column_name]

    #Only text columns have a .str accessor; skipping already-numeric columns
    #lets this cell be re-run without raising an AttributeError
    if not pd.api.types.is_numeric_dtype(column):
        #regex=False treats ',' as a literal character rather than a pattern
        column = column.str.replace(',', '', regex=False)

    #Converts the cleaned text to a numeric dtype. Missing values stay NaN, and a value
    #that is still not a number raises here instead of surfacing later in the scaler
    df[column_name] = pd.to_numeric(column)

df.head(2)

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,...,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725.4,390470936,30716,196631588,...,684,62.0,17598718,114.0,18004655,22931,4818457,2669262,,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545.9,323703884,28113,174597137,...,3,67.0,10422430,111.0,7780028,28444,6623075,1118279,,1


In [206]:
#Defines the model inputs (feature matrix) and the prediction target (label vector)

#Names of the columns used as features, grouped roughly by platform
feature_columns = [
    'Track Score',
    'Spotify Streams', 'Spotify Playlist Count', 'Spotify Playlist Reach', 'Spotify Popularity',
    'YouTube Views', 'YouTube Likes',
    'TikTok Posts', 'TikTok Likes', 'TikTok Views',
    'YouTube Playlist Reach',
    'Apple Music Playlist Count',
    'AirPlay Spins', 'SiriusXM Spins',
    'Deezer Playlist Count', 'Deezer Playlist Reach',
    'Amazon Playlist Count',
    'Pandora Streams', 'Pandora Track Stations',
    'Soundcloud Streams', 'Shazam Counts',
    'Explicit Track',
]

#X is the feature matrix: a DataFrame holding every row of the selected feature columns
X = df.loc[:, feature_columns]

#y is the label vector: a Series holding the 'All Time Rank' value of every row
y = df.loc[:, 'All Time Rank']

In [208]:
#Creates an StandardScalar object to normalize (scale) the X train and test values
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [210]:
#Splits up the dataset into testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.24, random_state=9)

In [216]:
#Prints the scaled training feature matrix as a sanity check (NumPy abbreviates large
#arrays with '...'). The recorded output contains nan entries, i.e. some feature
#values are missing in the dataset.
print(X_train)

[[-0.43186888 -0.36552623 -0.47079803 ...         nan -0.24201808
  -0.74823178]
 [-0.50192673 -0.82015566 -0.82392999 ...         nan -0.40266156
  -0.74823178]
 [-0.44743729 -0.03246209 -0.37904221 ...         nan -0.3481902
   1.33648426]
 ...
 [ 0.70203047 -0.78103276 -0.82488607 ...         nan -0.41345663
  -0.74823178]
 [-0.56938985 -0.02147902 -0.34774457 ...         nan -0.29735578
  -0.74823178]
 [ 0.94334085  1.67545976  2.25819941 ...  0.14733092  2.27796952
  -0.74823178]]


In [None]:
'''
KEYVAN'S SECTION
'''

#Builds a Decision Tree Classifier (entropy criterion, no depth limit, fixed seed)
#and trains it on the training split in one step; fit() returns the fitted estimator
#NOTE(review): the label is 'All Time Rank'; if each rank value occurs only once,
#exact-match classification accuracy is presumably near zero -- confirm whether a
#regressor or a binned target is what is intended here.
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=None,
    random_state=5,
).fit(X_train, y_train)

#Predicts a label for every row of the held-out test features
y_predict = decision_tree.predict(X_test)

#Fraction of test rows whose predicted label exactly equals the actual label
dt_accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)

In [None]:
#Prints the decision tree's accuracy on the test set, as computed by accuracy_score above
print(dt_accuracy)