In [1]:
# packages to store and manipulate data
import numpy as np
import pandas as pd

# visualization packages
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# ml modeling packages
import copy
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from xgboost.sklearn import XGBClassifier

from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.svm import SVC, LinearSVC

In [2]:
# Import Dataset
df = pd.read_csv('C:/Users/607791/Desktop/DS/Practicum/MSD20k_and_BB.csv')
df.head()

Unnamed: 0,artist_id,artist_latitude,artist_familiarity,artist_hotness,artist_location,artist_longitude,artist_name,duration,end_of_fade_in,key,...,mode_confidence,release,song_hotness,start_of_fade_out,tempo,time_signature,time_signature_confidence,title,year,bb_hotsong
0,ARMQHX71187B9890D3,,0.780462,0.574275,"Atlanta, GA",,Mastodon,280.21506,0.238,5,...,0.5,Call of the Mastodon,0.597641,275.528,173.205,5,0.12,Deep Sea Creature,2001,0
1,ARD7TVE1187B99BFB1,,0.581794,0.401998,California - LA,,Casual,218.93179,0.247,1,...,0.636,Fear Itself,0.60212,218.932,92.198,4,0.778,I Didn't Mean To,0,0
2,ARMJAGH1187FB546F3,35.14968,0.63063,0.4175,"Memphis, TN",-90.04892,The Box Tops,148.03546,0.148,6,...,0.43,Dimensions,,137.915,121.274,4,0.384,Soul Deep,1969,1
3,ARKRRTF1187B9984DA,,0.487357,0.343428,,,Sonora Santanera,177.47546,0.282,8,...,0.565,Las Numero 1 De La Sonora Santanera,,172.304,100.07,1,0.0,Amor De Cabaret,0,0
4,AR7G5I41187FB4CE6C,,0.630382,0.454231,"London, England",,Adam Ant,233.40363,0.0,0,...,0.749,Friend Or Foe,,217.124,119.293,4,0.0,Something Girls,1982,0


In [3]:
df_clean=df.drop(['artist_id'], axis=1)
df_clean=df_clean.drop(['artist_latitude'], axis=1)
df_clean=df_clean.drop(['artist_longitude'], axis=1)
df_clean=df_clean.drop(['artist_name'], axis=1)
df_clean=df_clean.drop(['artist_location'], axis=1)
df_clean=df_clean.drop(['end_of_fade_in'], axis=1)
df_clean=df_clean.drop(['start_of_fade_out'], axis=1)
df_clean=df_clean.drop(['release'], axis=1)
df_clean=df_clean.drop(['title'], axis=1)
df_clean=df_clean.drop(['year'], axis=1)
df_clean=df_clean.drop(['key_confidence'], axis=1)
df_clean=df_clean.drop(['mode_confidence'], axis=1)
df_clean=df_clean.drop(['time_signature_confidence'], axis=1)

In [4]:
df.shape

(10001, 23)

In [5]:
df_clean=df_clean.dropna()
df_clean.shape

(5648, 10)

In [6]:
df_clean.head()

Unnamed: 0,artist_familiarity,artist_hotness,duration,key,loudness,mode,song_hotness,tempo,time_signature,bb_hotsong
0,0.780462,0.574275,280.21506,5,-3.306,1,0.597641,173.205,5,0
1,0.581794,0.401998,218.93179,1,-11.197,0,0.60212,92.198,4,0
5,0.651046,0.401724,209.60608,2,-4.501,1,0.604501,129.738,4,0
9,0.426668,0.332276,269.81832,4,-13.496,1,0.265861,86.643,4,0
11,0.360031,0.296269,218.77506,5,-10.021,0,0.0,146.765,1,0


In [7]:
hotness = copy.deepcopy(df_clean.bb_hotsong)
df_hot = df_clean.drop("bb_hotsong", axis=1)

In [8]:
# training/test data, test prediction XGB accuracy
x_train, x_test, y_train, y_test = train_test_split(df_hot, hotness, test_size=0.35, random_state=2)
model = XGBClassifier()
model.fit(x_train, y_train)

y_predict = model.predict(x_test)
accuracy = model.score(x_test, y_test)

print("Prediction Accuracy: %.1f%%" % (accuracy * 100.0))

Prediction Accuracy: 96.0%


In [9]:
# cross validation for each model
def modeling(model, x_train, y_train):
    scores = cross_val_score(model, x_train, y_train, cv=10, scoring = "roc_auc")
    print("Cross Validation Scores:", scores)
    print("Cross Validation Mean:", scores.mean())
    print("Cross Validation Standard Deviation:", scores.std())
    print("Model as Percentage: ", scores.mean()*100)
    return scores.mean()

In [10]:
#vlogistic regression classifier
log_reg = LogisticRegression()
log_reg.fit(df_hot, hotness)
log_reg_res = modeling(log_reg, df_hot, hotness)



Cross Validation Scores: [0.46857671 0.62307455 0.61899261 0.63709181 0.5767098  0.55275724
 0.51994763 0.57763401 0.46983025 0.68117284]
Cross Validation Mean: 0.5725787436388946
Cross Validation Standard Deviation: 0.0672591336101417
Model as Percentage:  57.257874363889464




In [11]:
# knn classifier, best neighbors = 8
k_near_neigh = KNeighborsClassifier(n_neighbors = 10)
k_near_neigh.fit(df_hot, hotness)
k_near_neigh_res = modeling(k_near_neigh, df_hot, hotness)

Cross Validation Scores: [0.53384935 0.56276956 0.59681146 0.57008626 0.42995225 0.43029883
 0.54097351 0.60097043 0.44266975 0.47631173]
Cross Validation Mean: 0.5184693126583145
Cross Validation Standard Deviation: 0.06437846786303919
Model as Percentage:  51.84693126583145


In [12]:
# xgboost classifier
x_grad_boost = XGBClassifier(learning_rate =0.2, n_estimators=80, max_depth=6, min_child_weight=1, gamma=0, subsample=0.7,
    colsample_bytree=0.7, objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=12)
x_grad_boost.fit(df_hot, hotness)
x_grad_boost_res = modeling(x_grad_boost, df_hot, hotness)

Cross Validation Scores: [0.44200555 0.57293592 0.48875539 0.5866451  0.5521411  0.55468269
 0.61960875 0.45964264 0.57114198 0.55432099]
Cross Validation Mean: 0.5401880091736838
Cross Validation Standard Deviation: 0.054609403555114296
Model as Percentage:  54.01880091736838


In [13]:
# random forest classifier
random_forest = RandomForestClassifier(n_estimators=100, oob_score = True)
random_forest.fit(df_hot, hotness)
random_forest_res = modeling(random_forest, df_hot, hotness)

Cross Validation Scores: [0.43037585 0.61856901 0.51209181 0.5526032  0.55048521 0.54513247
 0.59885243 0.61598891 0.58923611 0.47426698]
Cross Validation Mean: 0.5487601977362451
Cross Validation Standard Deviation: 0.058719734128873824
Model as Percentage:  54.876019773624506


In [14]:
# decision tree classifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(df_hot, hotness)
decision_tree_res = modeling(decision_tree, df_hot, hotness)

Cross Validation Scores: [0.4995764  0.51856131 0.51394023 0.4995764  0.51948552 0.49865219
 0.54124307 0.50747073 0.51574074 0.49212963]
Cross Validation Mean: 0.5106376223728349
Cross Validation Standard Deviation: 0.013622185204860682
Model as Percentage:  51.06376223728349


In [15]:
# linear SVC classifier
linear_svc = LinearSVC()
linear_svc.fit(df_hot, hotness)
linear_svc_res = modeling(linear_svc, df_hot, hotness)



Cross Validation Scores: [0.4762785  0.50123229 0.47989834 0.52487677 0.53943315 0.46233826
 0.53165434 0.5801756  0.43433642 0.43819444]
Cross Validation Mean: 0.49684181100385655
Cross Validation Standard Deviation: 0.04474656845643524
Model as Percentage:  49.68418110038566




In [16]:
# compare scores
comparison = pd.DataFrame({'Model': ['Logistic Regression','K-Nearest Neighbors','Extreme Gradient Boosting',
            'Random Forest','Decision Tree','Linear SVC'],
    'Score': [log_reg_res,k_near_neigh_res,x_grad_boost_res,random_forest_res,decision_tree_res,linear_svc_res]})
df_comparison = comparison.sort_values(by='Score', ascending=False)
df_comparison = df_comparison.set_index('Score')
df_comparison

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.572579,Logistic Regression
0.54876,Random Forest
0.540188,Extreme Gradient Boosting
0.518469,K-Nearest Neighbors
0.510638,Decision Tree
0.496842,Linear SVC
