In [1]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier
import seaborn as sns
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

import pandas as pd

In [20]:
# Name of the column the models are asked to predict.
target = 'popularity'

# Predictor columns; this order is also the row order used when labelling
# feature importances.
features = [
    'valence', 'acousticness', 'artists', 'danceability', 'duration_ms',
    'energy', 'explicit', 'instrumentalness', 'liveness', 'loudness',
    'mode', 'speechiness', 'tempo',
]

# Relative paths of the two CSV datasets.
data_old_path = "data/old_VS_new/old_era_data.csv"
data_new_path = "data/old_VS_new/new_era_data.csv"

In [23]:
def check_answers(answer, actual, diff=15):
    """Return the share of predictions that land within ``diff`` of the truth.

    A prediction is counted as correct when
    ``answer - diff < actual < answer + diff`` (both bounds exclusive).

    Parameters
    ----------
    answer : iterable of numbers
        Predicted values.
    actual : iterable of numbers
        True values, matched to ``answer`` by position.
    diff : number, default 15
        Half-width of the tolerance window around each prediction.

    Returns
    -------
    float
        Fraction of correct predictions, in ``[0, 1]``.  ``0.0`` is returned
        for empty input instead of dividing by zero.

    Raises
    ------
    ValueError
        If ``answer`` and ``actual`` do not have the same length.
    """
    # Materialise as plain lists so the pairing below is strictly positional
    # (a pandas Series would otherwise be indexed by label).
    answer = list(answer)
    actual = list(actual)

    if len(answer) != len(actual):
        raise ValueError(
            "answer and actual must have the same length "
            "(got %d and %d)" % (len(answer), len(actual))
        )
    if not answer:
        return 0.0

    hits = sum(
        1 for predicted, truth in zip(answer, actual)
        if predicted - diff < truth < predicted + diff
    )
    return hits / len(answer)

def get_accuracy(classifier, test):
    """Score ``classifier`` on a held-out set via ``check_answers``.

    ``test`` is a mapping holding the model inputs under ``"x"`` and the
    expected targets under ``"y"``.  The classifier's predictions for the
    inputs are compared with the targets and the result of
    ``check_answers`` is returned unchanged.
    """
    inputs, expected = test["x"], test["y"]
    return check_answers(classifier.predict(inputs), expected)
    

def print_features(classifier, feature_names=None):
    """Print the model's feature importances, most important first.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose ``feature_importances_``.
    feature_names : sequence of str, optional
        Row labels for the importances, in the column order the model was
        trained on.  When omitted, the estimator's ``feature_names_in_`` is
        used if it has one; otherwise the module-level ``features`` list is
        used, as before.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    if feature_names is None:
        # Prefer the names recorded on the estimator itself so the labels stay
        # correct even when it was trained on a column list other than the
        # module-level ``features``.
        feature_names = getattr(classifier, "feature_names_in_", None)
        if feature_names is None:
            feature_names = features

    feature_importances_df = pd.DataFrame(
        classifier.feature_importances_,
        columns=['importance'],
        index=list(feature_names),
    ).sort_values('importance', ascending=False)

    print(feature_importances_df)

In [28]:
def best_tree_crit(features, df):
    """Grid-search split criterion and depth for a decision tree.

    Every combination of criterion (``"gini"``, ``"entropy"``) and
    ``max_depth`` 1..14 is fitted on an 80% training split of ``df`` and
    scored with ``get_accuracy`` on the remaining 20%.

    Parameters
    ----------
    features : list of str
        Predictor columns of ``df``.
    df : pandas.DataFrame
        Data containing ``features`` and the module-level ``target`` column.

    Returns
    -------
    dict
        ``{"val": best score, "crit": criterion, "depth": max_depth}``.
        Ties keep the first combination found.  If no combination scores
        above 0 the initial defaults (``"gini"``, depth 10) are returned.

    Notes
    -----
    NOTE(review): the hyper-parameters are selected on the held-out split
    itself, so the winning score is an optimistic estimate; consider a
    separate validation set or cross-validation.
    """
    # The split does not depend on the hyper-parameters: compute it once
    # instead of once per candidate.
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=0.2, random_state=1)
    held_out = {"x": X_test, "y": y_test}

    best_options = {"val": 0, "crit": "gini", "depth": 10}
    for cr in ["gini", "entropy"]:
        for dp in range(1, 15):
            # Seeded so repeated runs pick the same winner.
            myTree = tree.DecisionTreeClassifier(
                criterion=cr, max_depth=dp, random_state=1)
            myTree.fit(X_train, y_train)

            acc = get_accuracy(myTree, held_out)

            if acc > best_options["val"]:
                best_options["val"] = acc
                best_options["crit"] = cr
                best_options["depth"] = dp
    return best_options
            

def create_tree(features, path):
    """Train a decision tree on the CSV at ``path`` and print its results.

    The data is split 80/20 (``random_state=1``), the tree is fitted on the
    training part, and the accuracy on the held-out part is printed together
    with the hyper-parameters and the feature-importance table.

    Parameters
    ----------
    features : list of str
        Predictor columns to read from the CSV.
    path : str or path-like
        Location of the CSV file; it must also contain the module-level
        ``target`` column.

    Returns
    -------
    None
    """
    df = pd.read_csv(str(path))

    # Hard-coded result of an earlier hyper-parameter search, recorded in the
    # notebook as the best setting for the old-era data.  To search again,
    # use best_tree_crit(features, df) here instead.
    best_options = {"val": None, "crit": "entropy", "depth": 13}

    # Seeded so that re-running the cell reproduces the same tree.
    myTree = tree.DecisionTreeClassifier(
        criterion=best_options['crit'],
        max_depth=best_options['depth'],
        random_state=1,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=0.2, random_state=1)

    myTree.fit(X_train, y_train)
    acc = get_accuracy(myTree, {"x": X_test, "y": y_test})

    print("ACC: ",acc, " Depth: ",best_options['depth'], " Criterion: ",best_options['crit'])
    print_features(myTree)

# Run the decision-tree pipeline on the old-era dataset.
create_tree(features, data_old_path)

ACC:  0.8596894238720941  Depth:  13  Criterion:  entropy
                  importance
artists             0.193919
acousticness        0.132311
speechiness         0.115604
duration_ms         0.086969
loudness            0.084677
danceability        0.074824
instrumentalness    0.069613
valence             0.066156
energy              0.062243
liveness            0.056045
tempo               0.051040
mode                0.006520
explicit            0.000078


In [32]:
def best_ada_crit(features, df):
    """Grid-search estimator count and algorithm for AdaBoost.

    Every combination of algorithm (``"SAMME.R"``, ``"SAMME"``) and odd
    ``n_estimators`` from 1 to 199 is fitted on an 80% training split of
    ``df`` and scored with ``get_accuracy`` on the remaining 20%.

    Parameters
    ----------
    features : list of str
        Predictor columns of ``df``.
    df : pandas.DataFrame
        Data containing ``features`` and the module-level ``target`` column.

    Returns
    -------
    dict
        ``{"val": best score, "ests": n_estimators, "alg": algorithm}``.
        Ties keep the first combination found.

    Notes
    -----
    NOTE(review): recent scikit-learn releases deprecate and then remove the
    ``"SAMME.R"`` option; confirm against the installed version before
    upgrading.
    NOTE(review): the hyper-parameters are selected on the held-out split
    itself, so the winning score is an optimistic estimate.
    """
    # The split is identical for every candidate: compute it once rather
    # than 200 times.
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=0.2, random_state=1)
    held_out = {"x": X_test, "y": y_test}

    best_options = {"val": 0, "ests": 10, "alg": "SAMME.R"}
    for al in ["SAMME.R", "SAMME"]:
        for es in range(1, 201, 2):
            # Seeded so repeated runs pick the same winner.
            adb = AdaBoostClassifier(
                n_estimators=es, algorithm=al, random_state=1)
            adb.fit(X_train, y_train)

            acc = get_accuracy(adb, held_out)

            if acc > best_options["val"]:
                best_options["val"] = acc
                best_options["ests"] = es
                best_options["alg"] = al
    return best_options

def create_ada(features, path):
    """Tune and train AdaBoost on the CSV at ``path`` and print its results.

    ``best_ada_crit`` chooses the estimator count and algorithm; a model with
    those settings is then fitted on an 80% training split
    (``random_state=1``) and its accuracy on the held-out 20% is printed
    together with the settings and the feature-importance table.

    Parameters
    ----------
    features : list of str
        Predictor columns to read from the CSV.
    path : str or path-like
        Location of the CSV file; it must also contain the module-level
        ``target`` column.

    Returns
    -------
    None
    """
    df = pd.read_csv(str(path))
    best_options = best_ada_crit(features, df)

    # Seeded so that re-running the cell reproduces the same model and score.
    adb = AdaBoostClassifier(
        n_estimators=best_options['ests'],
        algorithm=best_options['alg'],
        random_state=1,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=0.2, random_state=1)
    adb.fit(X_train, y_train)
    acc = get_accuracy(adb, {"x": X_test, "y": y_test})

    print("ACC: ",acc, " Estimators: ",best_options['ests'], " Algorithm: ",best_options['alg'])
    print_features(adb)

# Run the AdaBoost pipeline (hyper-parameter search included) on the old-era dataset.
create_ada(features, data_old_path)

ACC:  0.8283561517963797  Estimators:  1  Algorithm:  SAMME.R
                  importance
acousticness             1.0
valence                  0.0
artists                  0.0
danceability             0.0
duration_ms              0.0
energy                   0.0
explicit                 0.0
instrumentalness         0.0
liveness                 0.0
loudness                 0.0
mode                     0.0
speechiness              0.0
tempo                    0.0


In [None]:
def best_rdf_crit(features, df):
    """Grid-search criterion, depth and tree count for a random forest.

    Every combination of criterion (``"gini"``, ``"entropy"``), ``max_depth``
    1..14 and odd ``n_estimators`` from 1 to 199 is fitted on an 80% training
    split of ``df`` and scored with ``get_accuracy`` on the remaining 20%.
    That is 2 * 14 * 100 = 2800 forest fits, so expect a long run time.

    Parameters
    ----------
    features : list of str
        Predictor columns of ``df``.
    df : pandas.DataFrame
        Data containing ``features`` and the module-level ``target`` column.

    Returns
    -------
    dict
        ``{"val": best score, "ests": n_estimators, "crit": criterion,
        "depth": max_depth}``.  Ties keep the first combination found.

    Notes
    -----
    NOTE(review): the hyper-parameters are selected on the held-out split
    itself, so the winning score is an optimistic estimate.
    """
    # The split is identical for every candidate: compute it once rather
    # than once per fit.
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=0.2, random_state=1)
    held_out = {"x": X_test, "y": y_test}

    best_options = {"val": 0, "ests": 10, "crit": "gini", "depth": 10}
    for cr in ["gini", "entropy"]:
        for dp in range(1, 15):
            for es in range(1, 201, 2):
                # Seeded so repeated runs pick the same winner.
                rdf = RandomForestClassifier(
                    n_estimators=es, max_depth=dp, criterion=cr,
                    random_state=1)
                rdf.fit(X_train, y_train)

                # Score the forest that was just fitted (the original scored
                # an undefined name ``adb``).
                acc = get_accuracy(rdf, held_out)

                if acc > best_options["val"]:
                    best_options["val"] = acc
                    best_options["ests"] = es
                    best_options["crit"] = cr
                    best_options["depth"] = dp
    return best_options

def create_rdf(features, path):
    """Tune and train a random forest on the CSV at ``path`` and print results.

    ``best_rdf_crit`` chooses the tree count, criterion and depth; a forest
    with those settings is then fitted on an 80% training split
    (``random_state=1``) and its accuracy on the held-out 20% is printed
    together with the settings and the feature-importance table.

    Parameters
    ----------
    features : list of str
        Predictor columns to read from the CSV.
    path : str or path-like
        Location of the CSV file; it must also contain the module-level
        ``target`` column.

    Returns
    -------
    None
    """
    df = pd.read_csv(str(path))
    # Use the random-forest search: the AdaBoost search returns a dict
    # without the "crit" and "depth" keys needed below.
    best_options = best_rdf_crit(features, df)

    # Build the forest from the selected options; seeded so the reported
    # score is reproducible.
    rdf = RandomForestClassifier(
        n_estimators=best_options['ests'],
        max_depth=best_options['depth'],
        criterion=best_options['crit'],
        random_state=1,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=0.2, random_state=1)

    rdf.fit(X_train, y_train)
    acc = get_accuracy(rdf, {"x": X_test, "y": y_test})

    print("ACC: ",acc, " Estimators: ",best_options['ests'], " Criterion: ",best_options['crit'], " Depth: ", best_options['depth'])
    print_features(rdf)

# Run the random-forest pipeline on the old-era dataset.
create_rdf(features, data_old_path)