In [1]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier
import seaborn as sns
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

import pandas as pd

In [20]:
# Column the models are trained to predict.
target = 'popularity'

# Columns fed to the estimators, in the order they are selected from the frame.
features = [
    'valence',
    'acousticness',
    'artists',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
]

# CSV inputs for the old-era and new-era datasets (relative paths).
data_old_path = "data/old_VS_new/old_era_data.csv"
data_new_path = "data/old_VS_new/new_era_data.csv"

In [23]:
def check_answers(answer, actual, *, diff=15):
    """Return the fraction of predictions that land within ``diff`` of the truth.

    A prediction counts as a hit when the matching actual value lies strictly
    between ``answer - diff`` and ``answer + diff``.

    Args:
        answer: Iterable of predicted values.
        actual: Iterable of true values, positionally aligned with ``answer``;
            only the first ``len(answer)`` entries are read.
        diff: Keyword-only tolerance. Defaults to 15, the value that used to
            be hard-coded.

    Returns:
        A float in [0, 1]; 0.0 when ``answer`` is empty.

    Raises:
        IndexError: If ``actual`` has fewer entries than ``answer``.
    """
    answer = list(answer)
    actual = list(actual)
    if not answer:
        # Nothing to score; dividing by len(answer) would raise ZeroDivisionError.
        return 0.0
    hits = sum(
        1
        for i, predicted in enumerate(answer)
        if predicted - diff < actual[i] < predicted + diff
    )
    return hits / len(answer)

def get_accuracy(classifier, test):
    """Score ``classifier`` on a held-out set using ``check_answers``.

    Args:
        classifier: Object exposing ``predict``.
        test: Mapping holding the model inputs under ``"x"`` and the expected
            values under ``"y"``.

    Returns:
        Whatever ``check_answers`` returns for the predictions versus ``test["y"]``.
    """
    inputs, expected = test["x"], test["y"]
    return check_answers(classifier.predict(inputs), expected)
    

def print_features(classifier, feature_names=None):
    """Print the model's feature importances, highest first.

    Args:
        classifier: Fitted estimator exposing ``feature_importances_``.
        feature_names: Row labels for the importances, in the same order as
            the training columns. Defaults to the module-level ``features``
            list, which is only correct when the model was trained on exactly
            those columns.
    """
    names = features if feature_names is None else list(feature_names)
    feature_importances_df = pd.DataFrame(
        classifier.feature_importances_,
        columns=['importance'],
        index=names,
    ).sort_values('importance', ascending=False)

    print(feature_importances_df)

In [None]:
def best_tree_crit(features, df):
    """Search split criterion and depth for a decision tree on ``df``.

    Tries both ``"gini"`` and ``"entropy"`` with ``max_depth`` from 1 to 14,
    fitting on an 80% split and scoring on the remaining 20% with
    ``get_accuracy``.

    Args:
        features: Column names of ``df`` used as model inputs.
        df: DataFrame containing ``features`` and the module-level ``target`` column.

    Returns:
        dict with keys ``"val"`` (best score seen), ``"crit"`` and ``"depth"``.
        If no candidate scores above 0 the initial values
        (``"gini"``, depth 10, val 0) are returned.
    """
    # The split is seeded (random_state=1) and therefore identical on every
    # iteration, so compute it once instead of once per candidate.
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=0.2, random_state=1)
    held_out = {"x": X_test, "y": y_test}

    # NOTE(review): candidates are ranked on the same split that create_tree
    # later reports accuracy on, so that accuracy is likely optimistic.
    best_options = {"val": 0, "crit": "gini", "depth": 10}
    for cr in ["gini", "entropy"]:
        for dp in range(1, 15):
            candidate = tree.DecisionTreeClassifier(criterion=cr, max_depth=dp)
            candidate.fit(X_train, y_train)

            acc = get_accuracy(candidate, held_out)

            # Strict '>' keeps the earliest candidate on ties.
            if acc > best_options["val"]:
                best_options.update(val=acc, crit=cr, depth=dp)
    return best_options
            

def create_tree(features, path):
    """Fit the best decision tree found by ``best_tree_crit`` and report on it.

    Reads the CSV at ``path``, searches criterion/depth, refits a tree with the
    winning settings on an 80/20 split (``random_state=1``), then prints the
    held-out score and the feature importances.

    Args:
        features: Column names used as model inputs.
        path: Location of the CSV file; passed through ``str()`` before reading.
    """
    df = pd.read_csv(str(path))
    best_options = best_tree_crit(features, df)

    myTree = tree.DecisionTreeClassifier(
        criterion=best_options['crit'], max_depth=best_options['depth'])

    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=0.2, random_state=1)

    myTree.fit(X_train, y_train)

    acc = get_accuracy(myTree, {"x": X_test, "y": y_test})

    # Pass the values as separate print arguments: concatenating a str with
    # the float score or the int depth raises TypeError.
    print("ACC:", acc, " Depth:", best_options['depth'],
          " Criterion:", best_options['crit'])

    # print_features labels its rows with the module-level `features` list.
    print_features(myTree)

# Run the decision-tree search and report for the CSV at data_old_path.
create_tree(features, data_old_path)

In [25]:
def create_boost(features, data_path):
    """Fit an AdaBoost classifier on the CSV at ``data_path`` and print its score.

    Uses 50 estimators with learning rate 1, an 80/20 train/test split
    (``random_state=1``), and scores the held-out predictions with
    ``check_answers``.

    Args:
        features: Column names used as model inputs.
        data_path: Location of the CSV file; passed through ``str()`` before reading.
    """
    abc = AdaBoostClassifier(n_estimators=50, learning_rate=1)

    df = pd.read_csv(str(data_path))

    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=0.2, random_state=1)

    abc.fit(X_train, y_train)

    ans = abc.predict(X_test)
    # check_answers takes (predictions, actual) only; the extra X_test
    # argument that used to be passed here raised TypeError.
    print("Accuracy: ", check_answers(ans, y_test))

def create_forest(features, data_path):
    """Fit a random-forest classifier and regressor and print score + importances.

    Both models use 95 estimators, max depth 9 and ``random_state=0``; the data
    is split 80/20 with ``random_state=1``. For each model the held-out
    predictions are scored with ``check_answers`` and the feature importances
    are printed in descending order.

    Args:
        features: Column names used as model inputs.
        data_path: Location of the CSV file; passed through ``str()`` before reading.
    """
    dp = 9
    est = 95

    df = pd.read_csv(str(data_path))

    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=0.2, random_state=1)

    # Original author's note: the best num, crit & depth (decade 10s).
    forestClassifier = RandomForestClassifier(
        n_estimators=est, max_depth=dp, random_state=0, criterion="gini")
    forestRegressor = RandomForestRegressor(
        n_estimators=est, max_depth=dp, random_state=0)

    forestClassifier.fit(X_train, y_train)
    ansClassifier = forestClassifier.predict(X_test)
    forestRegressor.fit(X_train, y_train)
    ansRegressor = forestRegressor.predict(X_test)

    # Report the classifier first, then the regressor, in the same format.
    for label, predictions, model in (
        ("Classifier", ansClassifier, forestClassifier),
        ("Regressor", ansRegressor, forestRegressor),
    ):
        # check_answers takes (predictions, actual) only; passing X_test as a
        # third argument raised TypeError.
        print(label + " Num:", est, " Depth:", dp,
              " Accuracy: ", check_answers(predictions, y_test))

        importances_df = pd.DataFrame(
            {"feature": features, "importance": model.feature_importances_}
        ).sort_values("importance", ascending=False)

        # Display
        print(importances_df)



"""
create_forest(feature_2, data_path_2)
create_forest(feature, data_path_all)
print("BOOST")
print("feature_2")
#create_boost(feature_2, data_path_2)
#create_boost(feature, data_path_all)

#create_forest(feature_2, data_path_3)
#create_forest(feature_2, data_path_all)

"""


SyntaxError: invalid syntax (<ipython-input-25-f5cb4e4ee0fd>, line 34)