In [2]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import seaborn as sns
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

import pandas as pd

In [3]:
target = 'popularity'

features =  ['valence',
 'acousticness',
 'artists',
 'danceability',
 'duration_ms',
 'energy',
 'explicit',
 'instrumentalness',
 'liveness',
 'loudness',
 'mode',
 'speechiness',
 'tempo']

data_old_path = "../data/old_VS_new/old_era_data.csv"
data_new_path = "../data/old_VS_new/new_era_data.csv"

In [4]:
def check_answers(answer, actual):
    answer = list(answer)
    actual = list(actual)
    ok = 0
    diff = 15
    ans_list = []
    for i in range(len(answer)):
        if(actual[i] < answer[i]+diff and actual[i] > answer[i]-diff ):
            ok+=1
            ans_list.append(1)
        else:
            ans_list.append(0)
    return ok/len(answer)

def get_accuracy(classifier, test):
    X = test["x"]
    Y = test["y"]
    ans = classifier.predict(X)

    return check_answers(ans, Y)
    

def print_features(classifier):
    feature_importances_df = pd.DataFrame(classifier.feature_importances_, columns=['importance'], 
        index = features).sort_values('importance', ascending=False)

    print(feature_importances_df)

In [5]:
def best_tree_crit(features, df):
    best_options = {"val" : 0, "crit" : "gini", "depth" : 10}
    for cr in ["gini", "entropy"] :
        for dp in range(3,13):
            myTree = tree.DecisionTreeClassifier(criterion=cr, max_depth=dp)

            X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], 
                                                    test_size=0.2, random_state=1)

            myTree.fit(X_train, y_train)

            acc = get_accuracy(myTree, { "x" : X_test, "y" : y_test })

            if acc > best_options["val"] :
                best_options["val"] = acc
                best_options["crit"] = cr
                best_options["depth"] = dp
    return best_options
            

def create_tree(features, path):
    df = pd.read_csv(str(path))
    
    #best_options = best_tree_crit(features, df)

    best_options = {'val': 0.7090627553873284, 'crit': 'gini', 'depth': 7} #best for new data

    myTree = tree.DecisionTreeClassifier(criterion=best_options['crit'], max_depth=best_options['depth'])
    X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], 
                                                    test_size=0.2, random_state=1)

    myTree.fit(X_train, y_train)
    acc = get_accuracy(myTree, { "x" : X_test, "y" : y_test })
    best_options['val']=acc

    print(best_options)
    print_features(myTree)

create_tree(features, data_new_path)

{'val': 0.7089767301819433, 'crit': 'gini', 'depth': 7}
                  importance
loudness            0.316678
duration_ms         0.124791
energy              0.086880
artists             0.072809
instrumentalness    0.071317
valence             0.066864
danceability        0.065826
explicit            0.064932
speechiness         0.039694
acousticness        0.036959
liveness            0.036476
tempo               0.014720
mode                0.002053


In [6]:
def best_ada_crit(features, df):
    best_options = {"val" : 0, "ests" : 10, "alg":"SAMME.R"}
    for al in ["SAMME"] :
        for es in range(148,155,1):
            adb = AdaBoostClassifier(n_estimators=es, algorithm=al)

            X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], 
                                                    test_size=0.2, random_state=1)

            adb.fit(X_train, y_train)

            acc = get_accuracy(adb, { "x" : X_test, "y" : y_test })

            if acc > best_options["val"] :
                best_options["val"] = acc
                best_options["ests"] = es
                best_options["alg"] = al
            print("ACC: ",acc, " Estimators: ",es, " Algorithm: ",al)
    return best_options

def create_ada(features, path):
    df = pd.read_csv(str(path))
    #best_options = best_ada_crit(features, df)

    #best_options = {"val" : 0, "ests" : 51, "alg":"SAMME.R"} #best for old data
    best_options = {"val" : 0.7041593186803734, "ests" : 149,  "alg":  "SAMME"} #best for new data

    adb = AdaBoostClassifier(n_estimators=best_options['ests'], algorithm=best_options['alg'])
    X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], 
                                            test_size=0.2, random_state=1)
    adb.fit(X_train, y_train)
    acc = get_accuracy(adb, { "x" : X_test, "y" : y_test })
    
    best_options['val']=acc

    print(best_options)
    print_features(adb)

create_ada(features, data_new_path)

{'val': 0.7041593186803734, 'ests': 149, 'alg': 'SAMME'}
                  importance
loudness            0.806097
explicit            0.112523
duration_ms         0.051014
acousticness        0.030367
valence             0.000000
artists             0.000000
danceability        0.000000
energy              0.000000
instrumentalness    0.000000
liveness            0.000000
mode                0.000000
speechiness         0.000000
tempo               0.000000


In [7]:
def best_rdf_crit(features, df):
    best_options = {"val" : 0, "ests" : 10, "crit":"gini", "depth":10}

    X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], 
                                                        test_size=0.2, random_state=1)

    for cr in ["gini", "entropy"] :
        for dp in range(7,15):
            for es in range(150,171,2):
                rdf = RandomForestClassifier(n_estimators=es, max_depth=dp, criterion=cr)

                rdf.fit(X_train, y_train)
                acc = get_accuracy(rdf, { "x" : X_test, "y" : y_test })

                if acc > best_options["val"] :
                    best_options["val"] = acc
                    best_options["ests"] = es
                    best_options["crit"] = cr
                    best_options["depth"] = dp

                print("ACC: ",acc, " Estimators: ",es, " Criterion: ",cr, " Depth: ", dp)
    return best_options

def create_rdf(features, path):
    df = pd.read_csv(str(path))
    #best_options = best_rdf_crit(features, df)
    best_options = { "val" :  0.7354724934405781,  "ests" : 166,  "crit" : "gini", "depth" :  10 }

    rdf = RandomForestClassifier(n_estimators=best_options['ests'], max_depth=best_options['depth'],                                    criterion=best_options['crit'])
    X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], 
                                            test_size=0.2, random_state=1)

    rdf.fit(X_train, y_train)
    acc = get_accuracy(rdf, { "x" : X_test, "y" : y_test })

    best_options['val']=acc

    print(best_options)
    print_features(rdf)

create_rdf(features, data_new_path)

{'val': 0.7343971783732633, 'ests': 166, 'crit': 'gini', 'depth': 10}
                  importance
loudness            0.146652
duration_ms         0.102415
danceability        0.090231
valence             0.086262
acousticness        0.083579
energy              0.082736
artists             0.081595
instrumentalness    0.075973
speechiness         0.072249
liveness            0.071319
tempo               0.070042
explicit            0.028550
mode                0.008395


In [11]:
data_10s_path = "../data/decades_new/data_from_10s.csv"
create_rdf(features, data_10s_path)

{'val': 0.8342123366200412, 'ests': 166, 'crit': 'gini', 'depth': 10}
                  importance
duration_ms         0.125359
instrumentalness    0.100613
danceability        0.094266
acousticness        0.090097
tempo               0.086893
energy              0.084578
loudness            0.083023
speechiness         0.082396
liveness            0.077497
valence             0.076002
artists             0.075350
explicit            0.012635
mode                0.011294
