<a href="https://colab.research.google.com/github/Harshik97/Song-Recommendation/blob/master/Spotify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score
from sklearn.tree import export_graphviz
import graphviz
import pydotplus
import io
import imageio
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from google.colab import files


In [11]:
def load_data(test_size=0.2, random_state=None):
    """Upload the Spotify CSV through Colab and split it into train/test sets.

        Args:
            test_size: Fraction of rows held out for testing (default 0.2).
            random_state: Seed forwarded to train_test_split. The default
                (None) keeps the original behaviour of a different random
                split on every run; pass an int for a reproducible split.

        Returns:
            A pair ``((train_x, test_x, train_y, test_y), features)``:
            training features dataframe, testing features dataframe,
            training labels, testing labels, and the list of feature names.

        Raises:
            KeyError: If no file named 'spotify_data.csv' was uploaded.
    """
    # Each feature is listed exactly once; a repeated name would select the
    # same column twice and split its importance across two identical columns.
    features = [
        'danceability', 'loudness', 'valence', 'energy', 'instrumentalness',
        'acousticness', 'key', 'speechiness', 'duration_ms', 'liveness',
        'mode', 'tempo',
    ]
    # files.upload() returns a mapping from uploaded file name to its bytes.
    uploaded = files.upload()
    if 'spotify_data.csv' not in uploaded:
        raise KeyError(
            "Expected an uploaded file named 'spotify_data.csv', got: "
            + str(sorted(uploaded))
        )
    data = pd.read_csv(io.BytesIO(uploaded['spotify_data.csv']))
    train, test = train_test_split(data, test_size=test_size, random_state=random_state)
    train_x = train[features]
    train_y = train['target']
    test_x = test[features]
    test_y = test['target']
    return (train_x, test_x, train_y, test_y), features

In [12]:
def cv_grid_search(training_table, training_labels, nfolds = 5):
    """ Tune a decision tree with a cross-validated grid search.

      The grid covers both split criteria ('gini', 'entropy'), tree depths
      from 3 up to and including 14, and class weighting off or 'balanced'.

      Args:
            training_table: Training features passed to GridSearchCV.fit.
            training_labels: Training labels passed to GridSearchCV.fit.
            nfolds: Value forwarded as GridSearchCV's ``cv`` argument.

      Returns:
            Dictionary of the best hyperparameters found by the search,
            keyed by 'criterion', 'max_depth' and 'class_weight'.
    """
    search_space = {
        'criterion': ['gini', 'entropy'],
        'max_depth': np.arange(3, 15),
        'class_weight': [None, 'balanced'],
    }
    searcher = GridSearchCV(DecisionTreeClassifier(), search_space, cv=nfolds)
    # fit() returns the fitted search object, so the result can be read directly.
    return searcher.fit(training_table, training_labels).best_params_


In [13]:
def plot_confusion_matrix(test_labels, pred_labels):
    """Display the classifier's confusion matrix as an annotated heat map.

        Args:
            test_labels: True labels.
            pred_labels: Labels predicted by the classifier.

        Returns:
            None. The figure is shown with plt.show(); nothing is written
            to disk.
    """
    matrix = confusion_matrix(test_labels, pred_labels)

    figure = plt.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(matrix)

    # Write each cell's count at its centre: column index on x, row index on y.
    for (row, col), count in np.ndenumerate(matrix):
        axes.text(col, row, count, ha='center', va='center')

    axes.set_title('Confusion matrix of the classifier')
    figure.colorbar(heatmap)
    axes.set_xlabel('Predicted')
    axes.set_ylabel('True')
    plt.show()


In [14]:
def graph_tree(model, training_features, class_names):
    """ Render the trained decision tree to a JPEG image.

        Args:
            model: Fitted decision tree handed to export_graphviz.
            training_features: Feature names used to label the split nodes.
            class_names: Class names used to label the nodes.

        Returns:
            None. Writes the image file 'dtree_out.jpeg'.
    """
    dot_source = export_graphviz(
        model,
        out_file=None,
        feature_names=training_features,
        class_names=class_names,
        filled=True,
    )
    graph = graphviz.Source(dot_source, format='jpeg')
    # render() appends the format extension itself, so pass the bare stem;
    # passing 'dtree_out.jpeg' would produce 'dtree_out.jpeg.jpeg' and save
    # the DOT text under the .jpeg name. cleanup=True removes the
    # intermediate DOT source file once the image has been written.
    graph.render('dtree_out', cleanup=True)
    

In [15]:
def print_results(predictions, test_y):
    """Print classification metrics for a set of predictions.

        Args:
            predictions: Labels predicted by the model.
            test_y: True labels.

        Returns:
            None. Prints macro-averaged F1, precision and recall, followed
            by accuracy.
    """
    # sklearn metrics take (y_true, y_pred), the reverse of this function's
    # own argument order.
    print('F1 Score:', f1_score(test_y, predictions, average="macro"))
    print('Precision Score:', precision_score(test_y, predictions, average="macro"))
    print('Recall Score:', recall_score(test_y, predictions, average="macro"))
    print('Accuracy Score:', accuracy_score(test_y, predictions))

def print_feature_importance(model, features):
    """Print feature importances, most important first.

        Args:
            model: Fitted tree model exposing
                ``tree_.compute_feature_importances``.
            features: Feature names, in the same order as the model's
                input columns.

        Returns:
            None. Prints a list of ``(feature, importance)`` pairs sorted by
            descending importance. Importances are the raw (unnormalized)
            values, so they are not percentages.
    """
    importances = model.tree_.compute_feature_importances(normalize=False)
    # Use a sorted list rather than a set: a set has no defined order, so the
    # ranking would be lost. float() keeps the printed values plain numbers.
    ranked = sorted(
        ((name, float(score)) for name, score in zip(features, importances)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("Feature Importance = ", ranked)

In [16]:
def main():
    """Run the program: load data, tune, train, evaluate and report."""
    # Load data
    (train_x, test_x, train_y, test_y), features = load_data()

    # Cross-validated grid search over 'criterion', 'max_depth' and
    # 'class_weight'.
    params = cv_grid_search(train_x, train_y, 5)

    # Train and test the model using the tuned hyperparameters. The classifier
    # must not be re-created after this point, or the search result is lost.
    classifier = DecisionTreeClassifier(
        criterion=params['criterion'],
        max_depth=params['max_depth'],
        class_weight=params['class_weight'],
    )
    classifier.fit(train_x, train_y)
    predictions = classifier.predict(test_x)

    # Confusion Matrix
    plot_confusion_matrix(test_y, list(predictions))

    # Graph Tree
    # NOTE(review): presumably 'hate'/'love' correspond to the sorted values
    # of the 'target' column -- confirm against the dataset.
    graph_tree(classifier, features, ['hate', 'love'])

    # Accuracy, Precision, Recall, F1
    print_results(predictions, test_y)

    # Feature Importance
    print_feature_importance(classifier, features)


if __name__ == '__main__':
    main()