## HW4


In [116]:
import pandas as pd
import numpy as np
from utils import ml_utils

from tqdm.contrib.itertools import product

In [117]:
test = pd.read_csv('data/spotify_test.csv')
train = pd.read_csv('data/spotify_train.csv')
print(test)


     popularity  duration_ms  explicit  danceability  energy  key  loudness  \
0             0       169756     False         0.601   0.713    4    -3.758   
1            66        87133     False         0.638   0.324    0    -7.787   
2             1       242640     False         0.660   0.752    6    -5.839   
3            71       242628     False         0.697   0.663    6    -7.246   
4             0       214733     False         0.552   0.823    2    -5.988   
..          ...          ...       ...           ...     ...  ...       ...   
995          82       229525     False         0.689   0.481   10    -7.503   
996           0       243057     False         0.503   0.582    0    -4.324   
997          68       172960     False         0.361   0.871    8    -4.313   
998          66       320040     False         0.465   0.822    8    -4.826   
999           0       208626     False         0.664   0.783    9    -6.602   

     mode  speechiness  acousticness  instrumentaln

In [120]:
test.drop_duplicates(keep=False, inplace=True,subset=test.columns.difference(['track_genre']))
train.drop_duplicates(keep=False, inplace=True, subset=train.columns.difference(['track_genre']))

789
1462


# Part A
**Decision Tree** \
The decision tree modal implementation is in utils\ml_utils\experimental.py.\
The _DTreeNode class is used for representing the nodes on a decision tree which track left and right subtrees as well as information such as the splitting columns. We can use build_tree function to recursively construct the tree given the dataset, and the predict function after the tree is trained.\
Meanwhile, the DecisionTree class is used for processing parameters described in hw4 pdf PartA, to train and predict the decision tree constructed using _DTreeNode class. \
\
The implementation of gini and entropy are resides in utils\ml_utils\metric.py which will be used by our Decision tree modal

In [29]:
# Given the prediction and actual label, calculate the accuracy of our modal
def calc_accuracy(y_pred, y_actual):
    correct_cnt = 0
    total = len(y_actual)
    #assume that y_pred and y_actual have same length

    for i in range(len(y_actual)):
        pre = y_pred[i]
        act = y_actual[i]
        if pre == act:
            correct_cnt += 1
    
    return correct_cnt/total

# function given by lab, used to perform 10 fold cross validation
def n_folds(folds,train):
    for f in range(folds):
        train_fold = train[train.index % folds != f]
        valid_fold = train[train.index % folds == f]
    return train_fold, valid_fold

def predict_data(data,tree):
    # given a subset dataframe as test data, predict its output alongside with its original answer
    y_actual = data['track_genre'].to_numpy()
    #apply decision tree prediction to each of the row 
    y_pred = data.apply(lambda row: tree.predict(row), axis = 1).to_numpy()

    return y_actual, y_pred

# function for apllying modals with different hyperparameters. 
def apply_DTree(train: pd.DataFrame,
                validation: pd.DataFrame,
                test: pd.DataFrame,
                impurity_func: str,
                discrete_threshold: int = 10,
                max_depth: int = None,
                min_instances: int = 2,
                target_impurity: float = 0.0
                ):
    
    impurity_func = ml_utils.metric.entropy if impurity_func == 'entropy'   else  ml_utils.metric.gini
    tree = ml_utils.experimental.DecisionTree(discrete_threshold=discrete_threshold,
                                            max_depth=max_depth,
                                            min_instances=min_instances,
                                            target_impurity=target_impurity,
                                            impurity_func=impurity_func)
    tree.train(train,'track_genre')
    validation_accuracy = 'N/A'
    test_accuracy = 'N/A'
    if validation is not None:
        validation_accuracy = calc_accuracy(*predict_data(validation,tree))
    if test is not None:
        test_accuracy = calc_accuracy(*predict_data(test,tree))

    return validation_accuracy, test_accuracy




# Part B

In [138]:
#An example of the accuracy in 10 fold cross validation on trained modal, comparing with accuracy in test data.
train_fold, valid_fold = n_folds(10,train)
print('Training with: impurity_func: {}, max_depth: {}, min_instances: {}, target_impurity: {}'
          .format( 'entropy',20,2,0))
validation_accuracy, test_accuracy = apply_DTree(train = train_fold,
            validation = valid_fold,
            test = test,
            impurity_func = 'entropy',
            discrete_threshold = 10,
            max_depth = 10,
            min_instances = 4,
            target_impurity = 0)
print('Result: Validation accuray: {} || Test accuray: {} \n'.format(validation_accuracy,test_accuracy))


Training with: impurity_func: entropy, max_depth: 20, min_instances: 2, target_impurity: 0
Result: Validation accuray: 0.7346938775510204 || Test accuray: 0.6730038022813688 



# Part C

In [132]:
def para_tuning():

    #10 fold cross validation
    train_fold, valid_fold = n_folds(10,train)

    # finding the best hyperparameter combo and record them in a dataframe
    impurity_funcs = ['entorpy','gini']
    records = pd.DataFrame(columns= ['impurity_func', 'max_depth', 'min_instances', 'target_impurity','validation_accuracy','test_accuracy'])
    for max_depth ,min_instances, target_impurity, impurity_func in product(range(5,30,2), range(2,12,2),np.arange(0,0.2,0.05),impurity_funcs):   
                    try:
                        validation_accuracy, test_accuracy =apply_DTree(train = train_fold,
                                    validation = valid_fold,
                                    test = None,
                                    impurity_func = impurity_func,
                                    discrete_threshold = 10,
                                    max_depth = max_depth,
                                    min_instances = min_instances,
                                    target_impurity = target_impurity)
                        row = {'impurity_func': impurity_func, 'max_depth': max_depth, 'min_instances': min_instances, 'target_impurity': target_impurity,'validation_accuracy': validation_accuracy,'test_accuracy': test_accuracy}
                        records = pd.concat([records,pd.DataFrame(row,index=[len(records)])])
                    except:
                        print('Failed with: impurity_func: {}, max_depth: {}, min_instances: {}, target_impurity: {}'.format(impurity_func,max_depth,min_instances,target_impurity))
    return records

In [133]:
records =para_tuning()
records

  0%|          | 0/260 [00:00<?, ?it/s]

Unnamed: 0,impurity_func,max_depth,min_instances,target_impurity,validation_accuracy,test_accuracy
0,entorpy,5,2,0.00,0.680272,
1,entorpy,5,2,0.05,0.687075,
2,entorpy,5,2,0.10,0.687075,
3,entorpy,5,2,0.15,0.680272,
4,entorpy,5,4,0.00,0.687075,
...,...,...,...,...,...,...
255,entorpy,29,8,0.15,0.741497,
256,entorpy,29,10,0.00,0.727891,
257,entorpy,29,10,0.05,0.727891,
258,entorpy,29,10,0.10,0.734694,


# Part D

In [134]:
#Did not use test data in tuning or training (N/A in test_accuracy), here is the best hyperparameter we found on training data.
best=records.loc[records['validation_accuracy'].idxmax()]
print(best)

impurity_func           entorpy
max_depth                    11
min_instances                 6
target_impurity             0.1
validation_accuracy    0.755102
test_accuracy               N/A
Name: 70, dtype: object


In [135]:
#applying the best hyperparameters to find the accuracy on test.
print('Training with: impurity_func: {}, max_depth: {}, min_instances: {}, target_impurity: {}'
          .format( best['impurity_func'],best['max_depth'],best['min_instances'],best['target_impurity']))
validation_accuracy, test_accuracy = apply_DTree(train = train_fold,
            validation = valid_fold,
            test = test,
            impurity_func = best['impurity_func'],
            discrete_threshold = 10,
            max_depth = best['max_depth'],
            min_instances = best['min_instances'],
            target_impurity =  best['target_impurity'])
print('Result: Validation accuray: {} || Test accuray: {} \n'.format(validation_accuracy,test_accuracy))

Training with: impurity_func: entorpy, max_depth: 11, min_instances: 6, target_impurity: 0.1
Result: Validation accuray: 0.7551020408163265 || Test accuray: 0.6806083650190115 

