## Notebook for cleaning & exploring the dataset, and then run 6 different models: Decision Tree, Logistic Regression, Nearest Neighbor, Random Forest, XGBoost and SVM. Scores are given at the last cell

## Please scroll down to test our single song prediction! 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_selection import RFE
import seaborn as sns

### First Model: Decision Tree 

In [3]:
df = pd.read_csv('compiled_cleaned_songs.csv', sep = ',')
train = df.drop(['Unnamed: 0','Artist', 'Track', 'Year','Key'], axis=1)
x = train.drop("Hot", axis=1)
y = train['Hot']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20) 


In [5]:
classifier = DecisionTreeClassifier()

classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [6]:
y_pred = classifier.predict(X_test)  

In [7]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))  

[[1855   96]
 [  99  116]]
             precision    recall  f1-score   support

          0       0.95      0.95      0.95      1951
          1       0.55      0.54      0.54       215

avg / total       0.91      0.91      0.91      2166



In [8]:
# Cross Validation Score 
from sklearn.model_selection import cross_val_score

scores = cross_val_score(DecisionTreeClassifier(), X_train, y_train, cv=10, scoring = "roc_auc")
print(scores)
DTscore=scores.mean()
DTf1score=f1_score(y_test, y_pred, average=None)
print("Mean:", DTscore)

[0.76239073 0.71280925 0.7691411  0.71580866 0.74890443 0.7441459
 0.73994806 0.77551532 0.72171071 0.75629676]
Mean: 0.7446670928309218


### Second Model: Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
LR=LogisticRegression()
LR.fit(X_train, y_train)
y_pred2 = LR.predict(X_test)  
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=10, scoring = "roc_auc")
print(scores)
LRscore=scores.mean()
LRf1score=f1_score(y_test, y_pred2, average=None)
print("Mean:", LRscore)
print(confusion_matrix(y_test, y_pred2))  
print(classification_report(y_test, y_pred2))  

[0.85262238 0.84675866 0.84484972 0.83443259 0.84784501 0.86581677
 0.76642616 0.85711124 0.79379694 0.79238045]
Mean: 0.8302039918530998
[[1926   25]
 [ 183   32]]
             precision    recall  f1-score   support

          0       0.91      0.99      0.95      1951
          1       0.56      0.15      0.24       215

avg / total       0.88      0.90      0.88      2166



### Third Model: Nearest Neighbor Classifier

In [10]:
from sklearn.neighbors import KNeighborsClassifier

nnc = KNeighborsClassifier(n_neighbors = 7)
nnc.fit(X_train, y_train)
y_pred3 = nnc.predict(X_test)  
scores = cross_val_score(nnc, X_train, y_train, cv=10, scoring = "roc_auc")
print(scores)
nncscore=scores.mean()
nncf1score=f1_score(y_test, y_pred3, average=None)
print("Mean:", nncscore)
print(confusion_matrix(y_test, y_pred3))
print(classification_report(y_test, y_pred3))

[0.57956731 0.62512399 0.6060378  0.62240863 0.5814705  0.65925221
 0.63197734 0.61031679 0.64332404 0.67364585]
Mean: 0.6233124453002252
[[1945    6]
 [ 211    4]]
             precision    recall  f1-score   support

          0       0.90      1.00      0.95      1951
          1       0.40      0.02      0.04       215

avg / total       0.85      0.90      0.86      2166



### Fourth Model: Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier

rfe = RandomForestClassifier()
rfe.fit(X_train, y_train)
y_pred4 = rfe.predict(X_test) 
scores = cross_val_score(rfe, X_train, y_train, cv=10, scoring = "roc_auc")
print(scores)
rfescore=scores.mean()
rfef1score=f1_score(y_test, y_pred4, average=None)
print("Mean:", rfescore)
print(confusion_matrix(y_test, y_pred4))  
print(classification_report(y_test, y_pred4))

[0.87918124 0.89484625 0.90013722 0.90843699 0.89789444 0.87601257
 0.8803432  0.94981777 0.86280672 0.90528677]
Mean: 0.8954763174739326
[[1918   33]
 [ 124   91]]
             precision    recall  f1-score   support

          0       0.94      0.98      0.96      1951
          1       0.73      0.42      0.54       215

avg / total       0.92      0.93      0.92      2166



### Fifth Model: XGBOOST CLASSIFIER

In [13]:
from xgboost.sklearn import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred5 = xgb.predict(X_test) 
scores = cross_val_score(xgb, X_train, y_train, cv=10, scoring = "roc_auc")
print(scores)
xgbscore=scores.mean()
xgbf1score=f1_score(y_test, y_pred5, average=None)
print("Mean:", xgbscore)
print(confusion_matrix(y_test, y_pred5))  
print(classification_report(y_test, y_pred5))

  if diff:


[0.95909091 0.94452387 0.93034099 0.94860785 0.95244419 0.94398949
 0.92883597 0.96504508 0.93646437 0.94609948]
Mean: 0.9455442198086393
[[1912   39]
 [  96  119]]
             precision    recall  f1-score   support

          0       0.95      0.98      0.97      1951
          1       0.75      0.55      0.64       215

avg / total       0.93      0.94      0.93      2166



### Sixth Model: SUPPORT VECTOR MACHINE

In [15]:
from sklearn.svm import SVC, LinearSVC

svc = LinearSVC()
svc.fit(X_train, y_train)
y_pred6 = svc.predict(X_test) 
scores = cross_val_score(svc, X_train, y_train, cv=10, scoring = "roc_auc")
print(scores)
svcscore=scores.mean()
svcf1score=f1_score(y_test, y_pred6, average=None)
print("Mean:", svcscore)
print(confusion_matrix(y_test, y_pred6))  
print(classification_report(y_test, y_pred6))

[0.50317599 0.44895845 0.49807445 0.42921222 0.46443274 0.43265017
 0.39616071 0.40665162 0.42754489 0.42001977]
Mean: 0.44268810260682023
[[  22 1929]
 [   0  215]]
             precision    recall  f1-score   support

          0       1.00      0.01      0.02      1951
          1       0.10      1.00      0.18       215

avg / total       0.91      0.11      0.04      2166



# Scores 

In [16]:
pd.DataFrame({'Mean Score': [DTscore, LRscore, nncscore, rfescore, xgbscore, svcscore] ,'F-1 Score': [DTf1score[1], LRf1score[1], nncf1score[1], rfef1score[1], xgbf1score[1],svcf1score[1]], 'Model': ['Decision Tree', 'Logistic Regression', 
              'Nearest Neighbor Classifier','Random Forest Classifier','XGBOOST CLASSIFIER','SUPPORT VECTOR MACHINE']}) 

Unnamed: 0,Mean Score,F-1 Score,Model
0,0.744667,0.543326,Decision Tree
1,0.830204,0.235294,Logistic Regression
2,0.623312,0.035556,Nearest Neighbor Classifier
3,0.895476,0.536873,Random Forest Classifier
4,0.945544,0.63807,XGBOOST CLASSIFIER
5,0.442688,0.182281,SUPPORT VECTOR MACHINE


## Single Song Prediction based on xgboost Model 

In [17]:
# Make sure you run the logistic regression model on top first
# If you dont have songs in mind: Please try: Ed Sheeran - Shape Of You, and Duke Dumont - Ocean Drive
artist = input("What is the artist name? ")
track = input("What is the track name? ")
print('Please run the next cell to see whether the inserted song will be in Billboard or not!')

What is the artist name? Martin Solveig
What is the track name? All stars
Please run the next cell to see whether the inserted song will be in Billboard or not!


In [18]:
import spotipy
import numpy as np
from spotipy.oauth2 import SpotifyClientCredentials

songs_features=[]
client_credentials_manager = SpotifyClientCredentials(client_id="a31e2c1446fd4e6aa11476d8532fc939",client_secret="b52f884f9d4f464eba58daf0d5180ea4")
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

from collections import OrderedDict
try:
    songs=sp.search(q="track:"+str(track)+" "+'artist:'+str(artist)+'*' , type='track')
    items = songs['tracks']['items']
    track2 = items[0]
    song_id = str(track2["id"])
    track_features=sp.audio_features(song_id)
    features = np.array(track_features)[0]
    danceability, energy, key, loudness, mode,speechiness,acousticness, instrumentalness, liveness, valence, tempo, type_, id_,uri, track_href,analysis_url,duration_ms, time_signature=features.values()
    track_info = sp.track(uri)
    track_pop=track_info['popularity']
    artists=sp.search('artist:'+str(artist)+'*' , type='artist')
    try: 
        artist_popularity = artists['artists']['items'][0]['popularity']
    except: 
        artist_popularity = 0 
    songs_features.append((danceability, energy, loudness, mode, speechiness,acousticness, instrumentalness, liveness, valence, tempo,duration_ms,time_signature,track_pop, artist_popularity))
except:
    print('Opps... it seems that the track or artist is incorrect!')

columns_ = ["Danceability","Energy", "Loudness", "Mode", "Speechiness", "Acousticness",
          "Instrumentalness", "Liveness", "Valence", "Tempo","Duration_ms", "Time_Signature",'Track_Popularity',"Artist_Popularity"]
xnew =pd.DataFrame(songs_features, columns=columns_)

ynew = xgb.predict(xnew)

print("Predicted=%s" % (ynew))

if ynew == 0.0: 
    print('We are sorry to tell you that the song you chose, most probably, wont be in Billboard')
else: 
    print('It seems like the song you chose will most probably appear in Billboard')

Predicted=[0]
We are sorry to tell you that the song you chose, most probably, wont be in Billboard


  if diff:
