## Notebook for cleaning & exploring the dataset, and then run 3 different models: Decision Tree, Logistic Regression and Nearest Neighbor. Scores are given at the last cell

## Please scroll down to test our single song prediction! 

In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_selection import RFE
import seaborn as sns

### First Model: Decision Tree 

In [159]:
df = pd.read_csv('compiled_cleaned_songs.csv', sep = ',')
train = df.drop(['Unnamed: 0','Artist', 'Track', 'Year','Key'], axis=1)
x = train.drop("Hot", axis=1)
y = train['Hot']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20) 
X_test

Unnamed: 0,Danceability,Energy,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_ms,Time_Signature,Track_Popularity,Artist_Popularity
3649,0.406,0.7910,-3.804,1,0.0434,0.169000,0.000000,0.1670,0.4930,186.059,189467,4,40,78
5692,0.408,0.5320,-10.427,1,0.5490,0.407000,0.000277,0.1420,0.5560,83.800,304547,4,47,67
6770,0.800,0.6400,-8.671,1,0.1240,0.080400,0.000000,0.1410,0.4880,130.026,166220,4,43,57
1268,0.661,0.3660,-8.176,1,0.0415,0.588000,0.000000,0.1340,0.3570,129.744,262757,4,43,61
3761,0.373,0.2970,-10.456,1,0.0298,0.731000,0.000005,0.1290,0.2120,177.985,267507,4,24,40
8689,0.354,0.3870,-7.066,0,0.0373,0.359000,0.000000,0.2480,0.2080,80.915,203907,4,57,85
6719,0.684,0.6710,-4.639,1,0.0278,0.478000,0.000004,0.0821,0.7260,121.275,168693,4,25,60
10016,0.721,0.4740,-9.365,1,0.0517,0.441000,0.000000,0.0497,0.7330,112.398,181987,3,4,4
6168,0.064,0.0732,-19.996,0,0.0474,0.992000,0.577000,0.0860,0.0557,68.553,679547,3,37,62
6275,0.609,0.8500,-7.957,1,0.0505,0.092700,0.000054,0.6920,0.6170,87.703,219107,4,30,45


In [130]:
classifier2 = DecisionTreeClassifier()

classifier.fit(X_train, y_train)

RFE(estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
  n_features_to_select=None, step=1, verbose=0)

In [131]:
y_pred = classifier.predict(X_test)  

In [132]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))  

[[1850  115]
 [  80  121]]
             precision    recall  f1-score   support

          0       0.96      0.94      0.95      1965
          1       0.51      0.60      0.55       201

avg / total       0.92      0.91      0.91      2166



In [133]:
# Cross Validation Score 
from sklearn.model_selection import cross_val_score

scores = cross_val_score(DecisionTreeClassifier(), X_train, y_train, cv=10, scoring = "roc_auc")
print(scores)
DTscore=scores.mean()
DTf1score=f1_score(y_test, y_pred, average=None)
print("Mean:", DTscore)

[0.74683718 0.72648826 0.7132954  0.7333627  0.73431588 0.71875451
 0.71501811 0.74435474 0.76980227 0.73843161]
Mean: 0.7340660666104399


### Second Model: Logistic Regression

In [134]:
from sklearn.linear_model import LogisticRegression
LR=LogisticRegression()
LR.fit(X_train, y_train)
y_pred2 = LR.predict(X_test)  
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=10, scoring = "roc_auc")
print(scores)
LRscore=scores.mean()
LRf1score=f1_score(y_test, y_pred2, average=None)
print("Mean:", LRscore)
print(confusion_matrix(y_test, y_pred2))  
print(classification_report(y_test, y_pred2))  

[0.82031137 0.81049074 0.81573323 0.87727824 0.83794518 0.81027411
 0.84103324 0.85417398 0.82428045 0.83899321]
Mean: 0.8330513756419456
[[1909   56]
 [ 163   38]]
             precision    recall  f1-score   support

          0       0.92      0.97      0.95      1965
          1       0.40      0.19      0.26       201

avg / total       0.87      0.90      0.88      2166



### Third Model: Nearest Neighbor Classifier

In [135]:
from sklearn.neighbors import KNeighborsClassifier

nnc = KNeighborsClassifier(n_neighbors = 7)
nnc.fit(X_train, y_train)
y_pred3 = nnc.predict(X_test)  
scores = cross_val_score(nnc, X_train, y_train, cv=10, scoring = "roc_auc")
print(scores)
nncscore=scores.mean()
nncf1score=f1_score(y_test, y_pred3, average=None)
print("Mean:", nncscore)
print(confusion_matrix(y_test, y_pred3))
print(classification_report(y_test, y_pred3))

[0.58839288 0.59077583 0.61250397 0.64780047 0.54468386 0.62129921
 0.59589711 0.64358108 0.59667866 0.58812303]
Mean: 0.602973610568441
[[1954   11]
 [ 195    6]]
             precision    recall  f1-score   support

          0       0.91      0.99      0.95      1965
          1       0.35      0.03      0.06       201

avg / total       0.86      0.90      0.87      2166



### Fourth Model: Random Forest Classifier

In [141]:
from sklearn.ensemble import RandomForestClassifier

rfe = RandomForestClassifier()
rfe.fit(X_train, y_train)
y_pred4 = rfe.predict(X_test) 
scores = cross_val_score(rf, X_train, y_train, cv=10, scoring = "roc_auc")
print(scores)
rfescore=scores.mean()
rfef1score=f1_score(y_test, y_pred4, average=None)
print("Mean:", rfescore)
print(confusion_matrix(y_test, y_pred4))  
print(classification_report(y_test, y_pred4))

[0.88883914 0.85669103 0.90513706 0.90905809 0.91495768 0.91180209
 0.91854843 0.89198695 0.88208582 0.88525214]
Mean: 0.8964358424847543
[[1918   47]
 [  99  102]]
             precision    recall  f1-score   support

          0       0.95      0.98      0.96      1965
          1       0.68      0.51      0.58       201

avg / total       0.93      0.93      0.93      2166



### Fifth Model: XGBOOST CLASSIFIER

In [169]:
from xgboost.sklearn import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred5 = xgb.predict(X_test) 
scores = cross_val_score(xgb, X_train, y_train, cv=10, scoring = "roc_auc")
print(scores)
xgbscore=scores.mean()
xgbf1score=f1_score(y_test, y_pred5, average=None)
print("Mean:", xgbscore)
print(confusion_matrix(y_test, y_pred5))  
print(classification_report(y_test, y_pred5))

  if diff:


[0.94796037 0.94111406 0.93646437 0.95489354 0.95265076 0.92966225
 0.9309312  0.93771856 0.95812492 0.95374264]
Mean: 0.94432626773725
[[1912   38]
 [  89  127]]
             precision    recall  f1-score   support

          0       0.96      0.98      0.97      1950
          1       0.77      0.59      0.67       216

avg / total       0.94      0.94      0.94      2166



### Sixth Model: SUPPORT VECTOR MACHINE

In [140]:
from sklearn.svm import SVC, LinearSVC

svc = LinearSVC()
svc.fit(X_train, y_train)
y_pred6 = svc.predict(X_test) 
scores = cross_val_score(svc2, X_train, y_train, cv=10, scoring = "roc_auc")
print(scores)
svcscore=scores.mean()
svcf1score=f1_score(y_test, y_pred6, average=None)
print("Mean:", svcscore)
print(confusion_matrix(y_test, y_pred6))  
print(classification_report(y_test, y_pred6))

[0.41503712 0.46252275 0.42495884 0.46261662 0.41991855 0.42562318
 0.41832934 0.47108635 0.46400784 0.49428162]
Mean: 0.4458382191067013
[[1965    0]
 [ 201    0]]
             precision    recall  f1-score   support

          0       0.91      1.00      0.95      1965
          1       0.00      0.00      0.00       201

avg / total       0.82      0.91      0.86      2166



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Scores 

In [209]:
pd.DataFrame({'Mean Score': [DTscore, LRscore, nncscore, rfescore, xgbscore, svcscore] ,'F-1 Score': [DTf1score[1], LRf1score[1], nncf1score[1], rfef1score[1], xgbf1score[1],svcf1score[1]], 'Model': ['Decision Tree', 'Logistic Regression', 
              'Nearest Neighbor Classifier','Random Forest Classifier','XGBOOST CLASSIFIER','SUPPORT VECTOR MACHINE']}) 

Unnamed: 0,Mean Score,F-1 Score,Model
0,0.734066,0.553776,Decision Tree
1,0.833051,0.257627,Logistic Regression
2,0.602974,0.055046,Nearest Neighbor Classifier
3,0.896436,0.582857,Random Forest Classifier
4,0.944326,0.666667,XGBOOST CLASSIFIER
5,0.445838,0.0,SUPPORT VECTOR MACHINE


## Single Song Prediction based on xgboost Model 

In [217]:
# Make sure you run the logistic regression model on top first
# If you dont have songs in mind: Please try: Ed Sheeran - Shape Of You, and Duke Dumont - Ocean Drive
artist = input("What is the artist name? ")
track = input("What is the track name? ")
print('Please run the next cell to see whether the inserted song will be in Billboard or not!')

What is the artist name? Martin Solveig
What is the track name? All Stars
Please run the next cell to see whether the inserted song will be in Billboard or not!


In [218]:
import spotipy
import numpy as np
from spotipy.oauth2 import SpotifyClientCredentials

songs_features=[]
client_credentials_manager = SpotifyClientCredentials(client_id="a31e2c1446fd4e6aa11476d8532fc939",client_secret="b52f884f9d4f464eba58daf0d5180ea4")
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

from collections import OrderedDict
try:
    songs=sp.search(q="track:"+str(track)+" "+'artist:'+str(artist)+'*' , type='track')
    items = songs['tracks']['items']
    track2 = items[0]
    song_id = str(track2["id"])
    track_features=sp.audio_features(song_id)
    features = np.array(track_features)[0]
    danceability, energy, key, loudness, mode,speechiness,acousticness, instrumentalness, liveness, valence, tempo, type_, id_,uri, track_href,analysis_url,duration_ms, time_signature=features.values()
    track_info = sp.track(uri)
    track_pop=track_info['popularity']
    artists=sp.search('artist:'+str(artist)+'*' , type='artist')
    try: 
        artist_popularity = artists['artists']['items'][0]['popularity']
    except: 
        artist_popularity = 0 
    songs_features.append((danceability, energy, loudness, mode, speechiness,acousticness, instrumentalness, liveness, valence, tempo,duration_ms,time_signature,track_pop, artist_popularity))
except:
    print('Opps... it seems that the track or artist is incorrect!')

columns_ = ["Danceability","Energy", "Loudness", "Mode", "Speechiness", "Acousticness",
          "Instrumentalness", "Liveness", "Valence", "Tempo","Duration_ms", "Time_Signature",'Track_Popularity',"Artist_Popularity"]
xnew =pd.DataFrame(songs_features, columns=columns_)

ynew = xgb.predict(xnew)

print("Predicted=%s" % (ynew))

if ynew == 0.0: 
    print('We are sorry to tell you that the song you chose, most probably, wont be in Billboard')
else: 
    print('It seems like the song you chose will most probably appear in Billboard')

Predicted=[0]
We are sorry to tell you that the song you chose, most probably, wont be in Billboard


  if diff:
