# **PREDICTING HIT SONGS**
**THERE ARE 3 DATASETS EXTRACTED FROM THE SPOTIFY API:**
* TRAINING DATASET - ONLY HIT SONGS
* TESTING DATASET - RANDOM SONGS
* EVALUATING DATASET - SONGS WITH VIEW COUNT
|

**THE WHOLE GOAL WAS TO TRAIN A MODEL ON METADATA OF HIT SONGS LIKE:**

'tempo','key','mode','danceability','valence','energy','acousticness','instrumentalness','liveness','speechiness'

**PREDICT USING RANDOM SONGS AND MATCH THE RESULT WITH THEIR VIEWS TO CHECK THE ACCURACY OF THE MODEL'S PREDICTION**

# **KAGGLE START**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **CONCATENATING THE TRAINING DATASET**

In [None]:
# Load one CSV per decade, strip the columns that are not used later, and
# keep the resulting frames in a dict keyed by file stem.
hits_dataset_filenames = [
    'dataset-of-10s',
    'dataset-of-00s',
    'dataset-of-90s',
    'dataset-of-80s',
    'dataset-of-70s',
    'dataset-of-60s',
]
hits_dataset_dict = {}
for name in hits_dataset_filenames:
    frame = pd.read_csv(f'/kaggle/input/the-spotify-hit-predictor-dataset/{name}.csv')
    # Columns at positions 14-17 are removed by position; 'uri' and
    # 'loudness' are removed by name.
    unwanted = [*frame.columns[14:18], 'uri', 'loudness']
    frame = frame.drop(columns=unwanted)

    print(frame.shape)
    hits_dataset_dict[name] = frame


# **FILTERING THE FEATURES AND SAVING THE TRACK AND ARTIST**

In [None]:
# Stack the per-decade frames into a single training frame with a fresh index.
full_hits_dataset = pd.concat(hits_dataset_dict.values(), ignore_index=True, axis=0)
# Pull the two text columns out of the feature frame and keep them as one
# "track artist" label. The space separator matches how the other datasets'
# labels are built, so the title and the artist name do not run together.
track_artist_fhd = full_hits_dataset.pop('track') + " " + full_hits_dataset.pop('artist')
track_artist_fhd.to_csv('/kaggle/working/track_artist_fhd.csv')
full_hits_dataset.info()

# **FILTERING THE VIEWS DATASET**

In [None]:
# Read the top-songs CSV, split off the 'streams' column as the view counts,
# and remove the columns at positions 2-12 of what remains.
top_songs_dataset = pd.read_csv(
    '/kaggle/input/top-spotify-songs-2023/spotify-2023.csv',
    encoding='latin-1',
)
top_songs_views_dataset = top_songs_dataset.pop('streams')
print(top_songs_dataset.columns)
top_songs_dataset = top_songs_dataset.drop(columns=top_songs_dataset.columns[2:13])

top_songs_dataset.columns

In [None]:
# Display the 'streams' column that was popped from the top-songs frame.
top_songs_views_dataset

# **FILTERING THE TESTING DATASET**

In [None]:
# Read the test-set CSV and remove the columns that are not model inputs:
# positions 3-10 by position, plus three columns by name.
most_songs_dataset = pd.read_csv('/kaggle/input/30000-spotify-songs/spotify_songs.csv')

dropped_columns = [*most_songs_dataset.columns[3:11], 'track_id', 'duration_ms', 'loudness']
most_songs_dataset = most_songs_dataset.drop(columns=dropped_columns)

most_songs_dataset

# **SORTING THE FEATURES OF THE TESTING DATASET AND SAVING THEIR TRACK AND ARTIST**

In [None]:
# Print every test-set column that the training frame does not have.
for col in most_songs_dataset.columns:
    if col in full_hits_dataset.columns:
        continue
    print(col)

# Use the training frame's names for the title and artist columns, then
# repeat the same check.
most_songs_dataset = most_songs_dataset.rename(
    columns={"track_name": "track", "track_artist": "artist"}
)
for col in most_songs_dataset.columns:
    if col in full_hits_dataset.columns:
        continue
    print(col)

# Remove the two text columns from the feature frame and save them as a
# single "track artist" label series.
track_titles = most_songs_dataset.pop('track')
artist_names = most_songs_dataset.pop('artist')
track_artist_msd = track_titles + " " + artist_names
track_artist_msd.to_csv('/kaggle/working/track_artist_msd.csv')

track_artist_msd
 

# **KEY FEATURES THE MODELS WILL BE BASED ON**

In [None]:
# Names of the audio-feature columns listed as the basis for the models.
# This list is not referenced by any other code visible in this notebook.
features = [
    'tempo',
    'key',
    'mode',
    'danceability',
    'valence',
    'energy',
    'acousticness',
    'instrumentalness',
    'liveness',
    'speechiness',
]

# **EXTRACTING THE VIEWS AND THEIR TRACK**

In [None]:
# no loudness in this
# Print the top-songs columns that the training frame lacks (original names).
for col in top_songs_dataset.columns:
    if col in full_hits_dataset.columns:
        continue
    print(col)

# Relabel all twelve columns positionally with the training frame's names.
renamed_columns = [
    'track', 'artist', 'tempo', 'key', 'mode', 'danceability',
    'valence', 'energy', 'acousticness', 'instrumentalness',
    'liveness', 'speechiness',
]
top_songs_dataset = top_songs_dataset.set_axis(renamed_columns, axis="columns")
for col in top_songs_dataset.columns:
    if col in full_hits_dataset.columns:
        continue
    print(col)

# Build the "track artist" label, save it, and pair it with the view counts.
track_artist_tsd = top_songs_dataset.pop('track') + " " + top_songs_dataset.pop('artist')
track_artist_tsd.to_csv('/kaggle/working/track_artist_tsd.csv')
top_songs_views_dataset = pd.concat(
    [track_artist_tsd, top_songs_views_dataset],
    axis=1,
    join="inner",
    ignore_index=True,
).set_axis(['track - artist', 'views'], axis='columns')
top_songs_views_dataset.to_csv('/kaggle/working/top_songs_views_dataset.csv')

top_songs_views_dataset
# top_songs_views_dataset.columns

# **SPOTIFY'S CHORDS FROM THEIR API**

In [None]:
# Key names in order; a name's position in this list is used later as its
# integer key code.
chords = [
    'C', 'C#', 'D', 'D#',
    'E', 'F', 'F#', 'G',
    'G#', 'A', 'A#', 'B',
]

# **PREPROCESSING THE VIEWS DATASET**

In [None]:
# Replace the textual mode with a numeric flag: Major -> 1, Minor -> 0.
for mode_name, mode_code in (('Major', 1), ('Minor', 0)):
    top_songs_dataset.loc[top_songs_dataset['mode'] == mode_name, 'mode'] = mode_code

# Missing keys become -1; named keys become their position in `chords`.
top_songs_dataset['key'] = top_songs_dataset['key'].fillna(-1)
for key_code, chord in enumerate(chords):
    top_songs_dataset.loc[top_songs_dataset['key'] == chord, 'key'] = key_code

# Print the distinct values of every column as a sanity check.
for lab in top_songs_dataset.columns:
    print(top_songs_dataset[lab].unique())

# **SORTING THE FEATURES THROUGH ALL DATASETS**

In [None]:
# Per-column scale factors: 'target', 'tempo', 'key' and 'mode' keep their
# values (factor 1); every other column is multiplied by 100.
feat_dict = {
    lab: 1 if lab in ('target', 'tempo', 'key', 'mode') else 100
    for lab in full_hits_dataset.columns
}
# DataFrame.mul returns a new frame instead of modifying in place, so the
# result must be assigned back for the scaling to take effect.
full_hits_dataset = full_hits_dataset.mul(feat_dict)
# The factor for 'target' is removed before the dict is reused on the
# frames that serve as prediction inputs.
feat_dict.pop('target')
print(feat_dict)
most_songs_dataset = most_songs_dataset.mul(feat_dict)
# NOTE(review): top_songs_dataset is left unscaled here. Presumably its
# feature columns are already percentages on a 0-100 scale - TODO confirm.
# If they turn out to be 0-1 fractions, scale this frame with feat_dict too.
# Put the top-songs columns in the same order as the test-set columns.
top_songs_dataset = top_songs_dataset[most_songs_dataset.columns]
full_hits_dataset

# **SPLITTING THE TRAINING DATASET**

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the features and save both to the working
# directory (the label file is written once).
y = full_hits_dataset.pop('target')
# full_hits_dataset.drop(['mode','key'], inplace=True, axis=1)
y.to_csv('/kaggle/working/target.csv')
full_hits_dataset.to_csv('/kaggle/working/full_hits_dataset.csv')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(full_hits_dataset, y, test_size=0.2, random_state=42)


# **MODEL AND METRICS**

In [None]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.linear_model import SGDClassifier

from sklearn.ensemble import AdaBoostClassifier

from sklearn import svm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier

from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# **WORST MODELS**

In [None]:
# GaussianProcessClassifier() too much memory can't run
# KNeighborsClassifier(),RadiusNeighborsClassifier() no time to try

# Fit each candidate on the training split and report how it does on the
# held-out split: classification report, confusion matrix, flattened matrix.
models = [svm.SVC(), svm.NuSVC()]
for estimator in models:
    held_out_pred = estimator.fit(X_train, y_train).predict(X_test)

    print(classification_report(y_test, held_out_pred))
    matrix = confusion_matrix(y_test, held_out_pred)
    print(matrix)
    print(matrix.ravel())

# **BEST MODELS**

In [None]:
# HistGradientBoostingClassifier(),RandomForestClassifier(),ExtraTreesClassifier(),GradientBoostingClassifier(),AdaBoostClassifier() best
# svm.LinearSVC() very good not consistent
# tree.DecisionTreeClassifier(),SGDClassifier() +

# Same evaluation loop as for the other candidates: fit on the training
# split, predict the held-out split, print the report and confusion matrix.
models = [
    HistGradientBoostingClassifier(),
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    svm.LinearSVC(),
    SGDClassifier(),
    tree.DecisionTreeClassifier(),
]
for estimator in models:
    held_out_pred = estimator.fit(X_train, y_train).predict(X_test)

    print(classification_report(y_test, held_out_pred))
    matrix = confusion_matrix(y_test, held_out_pred)
    print(matrix)
    print(matrix.ravel())

# **PREDICTING**

In [None]:
# Fit the two chosen classifiers on the training split and collect their
# predictions for the test-set songs, one array per model, in list order.
models = [HistGradientBoostingClassifier(), RandomForestClassifier()]
pred = [
    estimator.fit(X_train, y_train).predict(most_songs_dataset)
    for estimator in models
]

# full_msd_hgbc=track_artist_msd+most_songs_dataset+y_pred[0]
# full_msd_rfc=track_artist_msd+most_songs_dataset+y_pred[1]

In [None]:
# Wrap each model's prediction array in a one-column frame named 'hit'.
pred = [pd.DataFrame(labels, columns=['hit']) for labels in pred]


In [None]:
# Fit the two chosen classifiers on the training split and collect their
# predictions for the top-songs frame, one array per model, in list order.
models = [HistGradientBoostingClassifier(), RandomForestClassifier()]
preds = [
    estimator.fit(X_train, y_train).predict(top_songs_dataset)
    for estimator in models
]

In [None]:
# Wrap each model's top-songs prediction array in a one-column frame named
# 'hit'. The source must be `preds` (top-songs predictions); reading from
# `pred` would silently substitute the test-set predictions.
preds = [pd.DataFrame(labels, columns=['hit']) for labels in preds]

# **FINAL PREDICTIONS DATASET**

In [None]:
# Pair every test-set "track artist" label with each model's prediction and
# name the two resulting columns.
result_columns = ['track - artist', 'hit']
full_msd_hgbc = pd.concat(
    [track_artist_msd, pred[0]], axis=1, join="inner", ignore_index=True
)
full_msd_rfc = pd.concat(
    [track_artist_msd, pred[1]], axis=1, join="inner", ignore_index=True
)
print(full_msd_hgbc.shape)
full_msd_hgbc = full_msd_hgbc.set_axis(result_columns, axis='columns')
full_msd_rfc = full_msd_rfc.set_axis(result_columns, axis='columns')

# Show which membership outcomes occur when the labels are looked up among
# the labels of the views frame.
known_labels = top_songs_views_dataset['track - artist'].values
print(
    full_msd_hgbc['track - artist'].isin(known_labels).unique(),
    full_msd_rfc['track - artist'].isin(known_labels).unique(),
)

In [None]:
# Pair every top-songs "track artist" label with each entry of `preds` and
# name the two resulting columns.
result_columns = ['track - artist', 'hit']
full_tsd_hgbc = pd.concat(
    [track_artist_tsd, preds[0]], axis=1, join="inner", ignore_index=True
)
full_tsd_rfc = pd.concat(
    [track_artist_tsd, preds[1]], axis=1, join="inner", ignore_index=True
)
print(full_tsd_hgbc.shape)
full_tsd_hgbc = full_tsd_hgbc.set_axis(result_columns, axis='columns')
full_tsd_rfc = full_tsd_rfc.set_axis(result_columns, axis='columns')

# Show which membership outcomes occur when the labels are looked up among
# the labels of the views frame.
known_labels = top_songs_views_dataset['track - artist'].values
print(
    full_tsd_hgbc['track - artist'].isin(known_labels).unique(),
    full_tsd_rfc['track - artist'].isin(known_labels).unique(),
)

# **COMPARING THE PREDICTIONS WITH THEIR VIEW COUNT**

In [None]:
# set_axis returns a new frame, so the result is assigned back; a bare call
# would leave the frames untouched.
full_msd_rfc = full_msd_rfc.set_axis(['track - artist', 'hit'], axis='columns')
full_msd_hgbc = full_msd_hgbc.set_axis(['track - artist', 'hit'], axis='columns')
print(top_songs_views_dataset.shape, full_msd_hgbc.shape, full_msd_hgbc['hit'].value_counts(), full_msd_rfc.shape, full_msd_rfc['hit'].value_counts())
# Keep only the test-set songs whose label also appears in the views frame.
# The join key is spelled out so the merge does not depend on which column
# names the two frames happen to share.
merged_msd_hgbc = full_msd_hgbc.merge(top_songs_views_dataset, how="inner", on='track - artist')
merged_msd_rfc = full_msd_rfc.merge(top_songs_views_dataset, how="inner", on='track - artist')
print(merged_msd_hgbc['hit'].value_counts(), merged_msd_hgbc.shape,
      merged_msd_rfc['hit'].value_counts(), merged_msd_rfc.shape)

In [None]:
# set_axis returns a new frame, so the result is assigned back; a bare call
# would leave the frames untouched.
full_tsd_rfc = full_tsd_rfc.set_axis(['track - artist', 'hit'], axis='columns')
full_tsd_hgbc = full_tsd_hgbc.set_axis(['track - artist', 'hit'], axis='columns')
print(top_songs_views_dataset.shape, full_tsd_hgbc.shape, full_tsd_hgbc['hit'].value_counts(), full_tsd_rfc.shape, full_tsd_rfc['hit'].value_counts())
# Join the top-songs prediction frames with the views frame on the shared
# label. The join key is spelled out so the merge does not depend on which
# column names the two frames happen to share.
merged_tsd_hgbc = full_tsd_hgbc.merge(top_songs_views_dataset, how="inner", on='track - artist')
merged_tsd_rfc = full_tsd_rfc.merge(top_songs_views_dataset, how="inner", on='track - artist')
print(merged_tsd_hgbc['hit'].value_counts(), merged_tsd_hgbc.shape,
      merged_tsd_rfc['hit'].value_counts(), merged_tsd_rfc.shape)

# **FINAL RESULTS**

In [None]:
# Display merged_msd_rfc: the full_msd_rfc frame inner-merged with
# top_songs_views_dataset.
merged_msd_rfc

In [None]:
# Display merged_tsd_rfc: the full_tsd_rfc frame inner-merged with
# top_songs_views_dataset.
merged_tsd_rfc

# **AFTER THIS:**
* locate songs better
* add another test dataset to check more precisely
* explore more nominal data not just numerical data
* add matplotlib charts for better visualization
* **and finally publish the blog**

# **IN THE FUTURE**
* play around with the hyperparameters of the two best models
* maybe try it on the Million Song Dataset