In [None]:
# pip install directly into notebook
%pip install sklearn
%pip install pandas
%pip install imbalanced-learn

In [None]:
# TODO: consider making list of dependencies for TA to install when running this notebook
import sklearn as sk
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, KFold, cross_validate
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Turn off pandas' chained-assignment warning. Not required for correctness;
# it only keeps the cell output of this project tidy.
pd.set_option("mode.chained_assignment", None)

In [None]:
# The next 3 cells are taken from a Kaggle example
class ArtistsTransformer():
    """Encode the string column 'artists' as a float derived from the target.

    Each artist is replaced by the mean target value observed for that
    artist during ``fit``, with these adjustments (applied in this order):

      1. Artists appearing ``MinCnt`` times or fewer get ``y.mean()``.
      2. Artists appearing ``MaxCnt`` times or more get 0.

    Artists that were not seen during ``fit`` are mapped to the 'unknown'
    entry, which is seeded with ``y.mean()`` and a count of 1.

    PARAMETERS:
    ----------
    MinCnt (int): Minimal threshold of artist appearances in the dataset, default = 3
    MaxCnt (int): Maximal threshold of artist appearances in the dataset, default = 600

    RETURN:
    ----------
    A DataFrame with the 'artists' string feature converted to floats
    """

    def __init__(self, MinCnt = 3.0, MaxCnt = 600.0):
        self.MinCnt = MinCnt
        self.MaxCnt = MaxCnt
        # Per-artist lookup table (columns 'mean' and 'count'); built by fit().
        self.artists_df = None

    def fit(self, X, y):
        """Build the per-artist lookup table from X['artists'] and target y.

        Returns self so that calls can be chained.
        """
        overall_mean = y.mean()
        self.artists_df = y.groupby(X.artists).agg(['mean', 'count'])
        # Fallback row used by transform() for artists missing from the table.
        self.artists_df.loc['unknown'] = [overall_mean, 1]
        self.artists_df.loc[self.artists_df['count'] <= self.MinCnt, 'mean'] = overall_mean
        self.artists_df.loc[self.artists_df['count'] >= self.MaxCnt, 'mean'] = 0
        return self

    def transform(self, X, y=None):
        """Return a copy of X whose 'artists' column holds the encoded floats.

        X itself is left untouched, so the same raw frame can be transformed
        again. ``y`` is accepted for call compatibility and ignored.

        Raises AttributeError if called before fit(); this is the exception
        type an unfitted call produced before, now with a readable message.
        """
        if self.artists_df is None:
            raise AttributeError("ArtistsTransformer must be fitted before calling transform()")
        known = X['artists'].isin(self.artists_df.index)
        labels = pd.Series(np.where(known, X['artists'], 'unknown'), index=X.index)
        encoded = X.copy()
        encoded['artists'] = labels.map(self.artists_df['mean'])
        return encoded

In [None]:
def instrumentalness_criteria(X):
    """Bin the 'instrumentalness' column of X into three ordinal levels.

    Levels: 1 where the value is < 0.1, 3 where it is > 0.95, otherwise 2
    (the boundary values 0.1 and 0.95, and NaN, all fall into level 2).

    The column is overwritten in place, and X is also returned so that
    wrappers which use the function's return value receive the data
    rather than None.
    """
    values = X['instrumentalness'].to_numpy()
    X['instrumentalness'] = np.where(values < 0.1, 1, np.where(values > 0.95, 3, 2))
    return X

In [None]:
class ReplaceZeroTransformer():
    """Replace zero values in the 'tempo' column.

    Zeros are replaced with the median or mean of the strictly positive
    'tempo' values of the frame being transformed, as selected by ``method``.
    Default is 'median'.
    """

    def __init__(self, method='median'):
        self.method = method

    def transform(self, X):
        """Return a copy of X with zero tempos replaced; X is not modified.

        Raises ValueError (a subclass of Exception) when ``method`` is
        neither 'median' nor 'mean'.
        """
        if self.method not in ('median', 'mean'):
            raise ValueError("Method can be 'median' or 'mean' only!")
        tempo = X['tempo']
        positive = tempo[tempo > 0]
        fill_value = positive.median() if self.method == 'median' else positive.mean()
        result = X.copy()
        result.loc[tempo == 0, 'tempo'] = fill_value
        return result

In [None]:
# Import track data
usecols = ['acousticness', 'danceability', 'duration_ms', 'energy', 'explicit', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode','popularity', 'speechiness', 'tempo', 'valence', 'artists']
dataset = pd.read_csv("data.csv", header = 0, usecols=usecols)

# Only the columns in `usecols` are loaded, so rows that differed solely in
# the ignored columns are now exact duplicates; keep the first of each.
dataset = dataset[~dataset.duplicated()]

# Normalize columns having values outside [0, 1]
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split below - confirm this is intended.
scaler = MinMaxScaler()
# cols_to_normalize = ['duration_ms', 'key', 'loudness', 'popularity', 'tempo']
cols_to_normalize = ['duration_ms', 'key', 'loudness', 'tempo']
dataset[cols_to_normalize] = scaler.fit_transform(dataset[cols_to_normalize])

y = dataset.pop('popularity') # popularity is our class to predict
X_headers = list(dataset.columns.values)
X = dataset

# Create the under sampler. It is seeded so that the resampled dataset - and
# therefore every score reported below - is the same on each run.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=0)

# apply the transform
X, y = undersample.fit_resample(X, y)

# popularity is rescaled only after resampling, because the under sampler
# has to see the individual popularity values as distinct classes
y = y/100

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Apply ArtistsTransformer (fitted on the training split only)
artists_transformer = ArtistsTransformer(MinCnt=2)
X_train = artists_transformer.fit(X_train, y_train).transform(X_train, y_train)
X_test = artists_transformer.transform(X_test, y_test)

# Instrumentalness Transformer
# instrumentalness_criteria rewrites the column in place, so the return
# value of transform() is deliberately not assigned here.
instrumentalness_tranformer = FunctionTransformer(instrumentalness_criteria)
instrumentalness_tranformer.transform(X_train)
instrumentalness_tranformer.transform(X_test)

# Tempo Transformer
tempo_transformer = ReplaceZeroTransformer()
X_train = tempo_transformer.transform(X_train)
X_test = tempo_transformer.transform(X_test)

In [None]:
# Hyperparameter tuning

# Pruning strengths shared by every grid: 0 (no pruning) plus 50 evenly
# spaced values between 1e-6 and 1e-4.
ccp_alpha_grid = np.append(0, np.linspace(0.000001, 0.0001, 50))

# Three separate grids; each one pairs ccp_alpha with a single size constraint.
tuned_parameters = [
    {
        'max_depth': np.arange(1, 20),
        'ccp_alpha' : ccp_alpha_grid,
    },
    {
        'ccp_alpha' : ccp_alpha_grid,
        'max_leaf_nodes': np.arange(256, 324)
    },
    {
        'ccp_alpha' : ccp_alpha_grid,
        'min_samples_split': np.arange(50, 700)
    }
]

# Available regression metrics are given here: https://scikit-learn.org/stable/modules/classes.html#regression-metric
# https://stackoverflow.com/questions/42228735/scikit-learn-gridsearchcv-with-multiple-repetitions/42230764#42230764
# ensure scikit is >0.18

inner_cv = KFold(n_splits=5, shuffle=True, random_state=0)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
print("Tuning hyper-parameters begin!")
print()

# search = GridSearchCV(tree.DecisionTreeRegressor(), tuned_parameters, cv=inner_cv, scoring='neg_mean_squared_error', verbose=4, n_jobs=3)
search = HalvingGridSearchCV(tree.DecisionTreeRegressor(random_state=1), tuned_parameters, cv=inner_cv, scoring='neg_mean_squared_error', verbose=4, n_jobs=3, random_state=1)
print("Classifiers established, training data")
print()

# Tune on the preprocessed training split only: `X` still contains the raw
# string 'artists' column (which a tree cannot consume) and also includes
# the rows held out in X_test.
search.fit(X_train, y_train)
non_nested_scores = search.best_score_
print("Best parameters found:", search.best_params_)
print("Score (mean squared):", -search.best_score_)


print("Running cross validation")
print()
# Copy the winning parameters instead of editing the fitted search object.
best_params = dict(search.best_params_, random_state=1)
clf = tree.DecisionTreeRegressor(**best_params)
cv_score = cross_val_score(clf, X=X_train, y=y_train, cv=outer_cv, verbose=4,  n_jobs=3,  scoring='neg_mean_squared_error')
print("Cross validation score (mean squared):", -cv_score.mean())


In [None]:
# Report grid search results
# 'criterion' is left at its default (squared error). The old spelling 'mse'
# is rejected by current scikit-learn releases, and the default selects the
# same criterion on every version.
kwargs = {'ccp_alpha': 0.0003, 'max_depth': 9, 'max_leaf_nodes': 260}
clf = tree.DecisionTreeRegressor(**kwargs)
clf.fit(X_train, y_train)
print(f"r2: {clf.score(X_test, y_test)}")
# RMSE is computed as sqrt(MSE) because the `squared=False` flag of
# mean_squared_error is no longer accepted by current scikit-learn releases.
print(f"rmse: {np.sqrt(mean_squared_error(y_test, clf.predict(X_test)))}")


In [None]:
# Criteria that were compared by hand. NOTE: "mse" is the historical name;
# current scikit-learn releases call it "squared_error".
criteria = ["mse", "friedman_mse"] # didn't make a big difference
max_depth = 9 # found to result in best accuracy TODO: test over a range

# The criterion is left at its default (squared error), which is what "mse"
# used to select; passing "mse" explicitly fails on current scikit-learn.
clf = tree.DecisionTreeRegressor(max_depth=max_depth)

# Following a tutorial on Cost Complexity Pruning https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# The last point of the path is left out of the plot, as in the tutorial.
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
plt.show()

In [None]:
# generating different classifiers having different ccp_alpha values
# WARNING: this takes a very long time to run! Below, ccp_alphas is pruned to cut down on computation time.
# clfs = []
# NUM_CCP_ALPHAS = 5
# ccp_alphas = ccp_alphas[:NUM_CCP_ALPHAS]
# for ccp_alpha in ccp_alphas:   
#     print(f"ccp_alpha: {ccp_alpha}")
#     clf = tree.DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha)
#     clf.fit(X_train, y_train)
#     clfs.append(clf)
# print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
#       clfs[-1].tree_.node_count, ccp_alphas[-1]))

# search for optimal ccp_alpha (seems to be somewhere in range 10**-4 to 10**-6)
# 0 (99%/-0.6%), 0.0005 (34.3%/33.4%), 0.0000005 (88.3%/5.49%)
# For each alpha in 1e-2 .. 1e-6: fit a tree, then print the alpha followed by
# the train score and the test score, one value per line.
for x in range(2,7):
    clf = tree.DecisionTreeRegressor(ccp_alpha=10**-x, random_state=0).fit(X_train, y_train)
    print(
        f"alpha: {10**-x}",
        clf.score(X_train, y_train),
        clf.score(X_test, y_test),
        sep="\n",
    )


In [None]:
# search for optimal tree depth (9)
# TODO: plot different depths to demonstrate overfitting as depth increases past 9
# For each depth in 2 .. 24: fit a tree, then print the depth followed by the
# train score and the test score, one value per line.
for md in range(2,25):
    clf = tree.DecisionTreeRegressor(max_depth=md, random_state=0).fit(X_train, y_train)
    print(
        f"max_depth: {md}",
        clf.score(X_train, y_train),
        clf.score(X_test, y_test),
        sep="\n",
    )

In [None]:
# search for optimal max leaf nodes value (maxima somewhere in (256, 324))
# Candidates are the squares 4, 9, ..., 576; for each one fit a tree, then print
# the candidate followed by the train score and the test score.
for ln in range(2,25):
    max_leaf_nodes = ln**2
    clf = tree.DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0).fit(X_train, y_train)
    print(
        f"max_leaf_node: {max_leaf_nodes}",
        clf.score(X_train, y_train),
        clf.score(X_test, y_test),
        sep="\n",
    )

In [None]:
# search for optimal min no. leaf samples
# max_leaf_nodes is held at 298 while min_samples_leaf steps through 1, 6, ..., 96;
# each iteration prints the value followed by the train score and the test score.
for msl in range(1,100,5):
    clf = tree.DecisionTreeRegressor(
        min_samples_leaf=msl,
        max_leaf_nodes=298,
        random_state=0,
    ).fit(X_train, y_train)
    print(
        f"min_samples_leaf: {msl}",
        clf.score(X_train, y_train),
        clf.score(X_test, y_test),
        sep="\n",
    )

In [None]:
# Fit one tree per ccp_alpha taken from the pruning path computed earlier.
# Only the first NUM_CCP_ALPHAS values are used because fitting a tree for
# every alpha on the path takes a very long time. The trees are built in this
# cell so that it no longer depends on a `clfs` list that is never created.
NUM_CCP_ALPHAS = 5
plot_alphas = ccp_alphas[:NUM_CCP_ALPHAS]
clfs = [
    tree.DecisionTreeRegressor(random_state=0, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in plot_alphas
]

# NOTE: for a regressor, score() is the R^2 coefficient rather than a
# classification accuracy; the axis label below keeps the original wording.
train_scores = [model.score(X_train, y_train) for model in clfs]
test_scores = [model.score(X_test, y_test) for model in clfs]
print(train_scores)
print(test_scores)

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
# plot_alphas (not the full ccp_alphas) so x and y have the same length
ax.plot(plot_alphas, train_scores, marker='o', label="train",
        drawstyle="steps-post")
ax.plot(plot_alphas, test_scores, marker='o', label="test",
        drawstyle="steps-post")
ax.legend()
plt.show()

In [None]:
# TODO: perform nested cross-validation (https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html)

In [None]:
# Draw tree
# `clf` is not created in this cell: the tree drawn is whichever estimator an
# earlier cell fitted last.
# Printed explicitly because a bare expression in the middle of a cell is
# evaluated but never displayed.
print(f"tree depth: {clf.get_depth()}")
plt.figure(figsize=(50,12))
# X_headers is the feature-column list captured when the data was loaded
tree.plot_tree(clf,  fontsize=10, feature_names=X_headers)
plt.show()