## HW4


In [116]:
import pandas as pd
import numpy as np
from collections import Counter
from utils import ml_utils
from typing import Callable, Union, Any

from tqdm.contrib.itertools import product


In [117]:
test = pd.read_csv('data/spotify_test.csv')
train = pd.read_csv('data/spotify_train.csv')
print(test)


     popularity  duration_ms  explicit  danceability  energy  key  loudness  \
0             0       169756     False         0.601   0.713    4    -3.758   
1            66        87133     False         0.638   0.324    0    -7.787   
2             1       242640     False         0.660   0.752    6    -5.839   
3            71       242628     False         0.697   0.663    6    -7.246   
4             0       214733     False         0.552   0.823    2    -5.988   
..          ...          ...       ...           ...     ...  ...       ...   
995          82       229525     False         0.689   0.481   10    -7.503   
996           0       243057     False         0.503   0.582    0    -4.324   
997          68       172960     False         0.361   0.871    8    -4.313   
998          66       320040     False         0.465   0.822    8    -4.826   
999           0       208626     False         0.664   0.783    9    -6.602   

     mode  speechiness  acousticness  instrumentaln

In [120]:
test.drop_duplicates(keep=False, inplace=True,subset=test.columns.difference(['track_genre']))
train.drop_duplicates(keep=False, inplace=True, subset=train.columns.difference(['track_genre']))

789
1462


# Part A


In [29]:
def calc_accuracy(y_pred, y_actual):
    correct_cnt = 0
    total = len(y_actual)
    #assume that y_pred and y_actual have same length

    for i in range(len(y_actual)):
        pre = y_pred[i]
        act = y_actual[i]
        if pre == act:
            correct_cnt += 1
    
    return correct_cnt/total

def n_folds(folds,train):
    for f in range(folds):
        train_fold = train[train.index % folds != f]
        valid_fold = train[train.index % folds == f]
    return train_fold, valid_fold

# def tokenize(x):
#     #tokenize the predict output to int
#     dic = {'hip-hop' : 0, 'pop': 1, 'rock': 2}
#     y = dic[x]
#     return y

# tokenized_pred = np.array(list(map(tokenize, y_pred)))
# tokenized_actual = np.array(list(map(tokenize, y_actual)))
# print(tokenized_actual)

def predict_data(data,tree):
    # given a subset dataframe as test data, predict its output alongside with its original answer
    y_actual = data['track_genre'].to_numpy()
    #apply decision tree prediction to each of the row 
    y_pred = data.apply(lambda row: tree.predict(row), axis = 1).to_numpy()

    return y_actual, y_pred

def apply_DTree(train: pd.DataFrame,
                validation: pd.DataFrame,
                test: pd.DataFrame,
                impurity_func: str,
                discrete_threshold: int = 10,
                max_depth: int = None,
                min_instances: int = 2,
                target_impurity: float = 0.0
                ):
    
    impurity_func = ml_utils.metric.entropy if impurity_func == 'entropy'   else  ml_utils.metric.gini
    tree = ml_utils.experimental.DecisionTree(discrete_threshold=discrete_threshold,
                                            max_depth=max_depth,
                                            min_instances=min_instances,
                                            target_impurity=target_impurity,
                                            impurity_func=impurity_func)
    tree.train(train,'track_genre')
    validation_accuracy = 'N/A'
    test_accuracy = 'N/A'
    if validation is not None:
        validation_accuracy = calc_accuracy(*predict_data(validation,tree))
    if test is not None:
        test_accuracy = calc_accuracy(*predict_data(test,tree))

    return validation_accuracy, test_accuracy




# Part B

In [131]:
#An example of the accuracy in 10 fold cross validation on trained modal, comparing with accuracy in test data.


train_fold, valid_fold = n_folds(10,train)
print('Training with: impurity_func: {}, max_depth: {}, min_instances: {}, target_impurity: {}'
          .format( 'entropy',20,2,0))
validation_accuracy, test_accuracy = apply_DTree(train = train_fold,
            validation = valid_fold,
            test = test,
            impurity_func = 'entropy',
            discrete_threshold = 10,
            max_depth = 50,
            min_instances = 2,
            target_impurity = 0)
print('Result: Validation accuray: {} || Test accuray: {} \n'.format(validation_accuracy,test_accuracy))


Training with: impurity_func: entropy, max_depth: 20, min_instances: 2, target_impurity: 0


# Part C

In [51]:
def para_tuning():
    train_fold, valid_fold = n_folds(10,train)
    impurity_funcs = ['entorpy']
    records = pd.DataFrame(columns= ['impurity_func', 'max_depth', 'min_instances', 'target_impurity','validation_accuracy','test_accuracy'])
    for max_depth ,min_instances, target_impurity, impurity_func in product(range(5,30,3), range(2,12,2),np.arange(0,0.2,0.05),impurity_funcs):   
                    try:
                        validation_accuracy, test_accuracy =apply_DTree(train = train_fold,
                                    validation = valid_fold,
                                    test = None,
                                    impurity_func = impurity_func,
                                    discrete_threshold = 10,
                                    max_depth = max_depth,
                                    min_instances = min_instances,
                                    target_impurity = target_impurity)
                        row = {'impurity_func': impurity_func, 'max_depth': max_depth, 'min_instances': min_instances, 'target_impurity': target_impurity,'validation_accuracy': validation_accuracy,'test_accuracy': test_accuracy}
                        records = pd.concat([records,pd.DataFrame(row,index=[len(records)])])
                    except:
                        print('Failed with: impurity_func: {}, max_depth: {}, min_instances: {}, target_impurity: {}'.format(impurity_func,max_depth,min_instances,target_impurity))
    return records

In [52]:
records =para_tuning()
records

  0%|          | 0/8 [00:00<?, ?it/s]

Unnamed: 0,impurity_func,max_depth,min_instances,target_impurity,validation_accuracy,test_accuracy
0,entorpy,2,3,0.0,0.605,
1,entorpy,2,3,0.6,0.51,
2,entorpy,2,4,0.0,0.605,
3,entorpy,2,4,0.6,0.51,
4,entorpy,4,3,0.0,0.68,
5,entorpy,4,3,0.6,0.51,
6,entorpy,4,4,0.0,0.68,
7,entorpy,4,4,0.6,0.51,


In [39]:
best=records.loc[records['validation_accuracy'].idxmax()]
print(best)

impurity_func          entorpy
max_depth                    6
min_instances                3
target_impurity              0
validation_accuracy      0.775
test_accuracy              N/A
Name: 6, dtype: object


In [40]:
print('Training with: impurity_func: {}, max_depth: {}, min_instances: {}, target_impurity: {}'
          .format( best['impurity_func'],best['max_depth'],best['min_instances'],best['target_impurity']))
validation_accuracy, test_accuracy = apply_DTree(train = train_fold,
            validation = valid_fold,
            test = test,
            impurity_func = best['impurity_func'],
            discrete_threshold = 10,
            max_depth = best['max_depth'],
            min_instances = best['min_instances'],
            target_impurity =  best['target_impurity'])
print('Result: Validation accuray: {} || Test accuray: {} \n'.format(validation_accuracy,test_accuracy))

Training with: impurity_func: entorpy, max_depth: 6, min_instances: 3, target_impurity: 0
Result: Validation accuray: 0.775 || Test accuray: 0.729 

