In [19]:
###############################################
#                                             #
#     TO RUN ->                               #
#     Click "Cell" in the above toolbar       #
#        -> Select "Run All" from the menu    #
#                                             #
###############################################

In [20]:
from IPython.display import HTML

# Inject a script plus a link that toggles the visibility of every notebook
# input cell. The document-ready hook calls code_toggle() once, so the code
# starts out hidden; the link calls it again to flip the state.
# NOTE(review): depends on jQuery (`$`) and on `div.input` cells being present
# in the page — confirm for the front end this notebook is served with.
# The markup contains exactly one <script> element; a second, unmatched
# closing tag after the link text was removed because it made the HTML invalid.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
Toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.
''')

In [21]:
# Standard machine learning libraries
%matplotlib inline
import pandas as pd                # dataframe library
pd.set_option('display.width', 950)
pd.set_option('display.max_colwidth', None)

import numpy as np                 # math library
import matplotlib.pyplot as plt    # figure plotting

# Classification Model libraries
import pickle
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from collections import Counter
import lightgbm as lgb
import shap

# Web Libraries
import urllib
from IPython.display import HTML

# Interactive Dashboard
import functools
import qgrid
import ipywidgets as widgets
from ipywidgets import Button, HBox, VBox, Layout, Label

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [22]:
# Import data set into pandas dataframe
df = pd.read_csv("https://raw.githubusercontent.com/KayleighEarly/MachineLearning/main/notebooks/data/labeled_trail_data.csv")

# Identify features columns and target column.
# 'url' is dropped together with the target, so it is not used as a feature.
# df.drop returns a new frame, so the dtype changes below leave df untouched.
X = df.drop(['Cluster Labels','url'], axis=1)
y = df['Cluster Labels']

# Setting the objects to category 
for c in X.select_dtypes(include='object'):
    X[c] = X[c].astype('category')

# Split the data, keeping 80% for training and 20% for testing.
# The fixed random_state makes the split, and every metric computed from it,
# repeatable across kernel restarts.
# NOTE(review): the pre-trained model loaded later in the notebook was fitted
# elsewhere; it is not visible here which rows it saw during training, so
# confirm before treating the test-set scores as held-out performance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42)

In [23]:
def train_model():
    """Fit a new LightGBM classifier on the training split.

    Reads the module-level ``X_train`` and ``y_train`` and constructs the
    classifier without passing any arguments.

    Returns:
        The fitted ``lgb.LGBMClassifier`` instance.
    """
    # fit() returns the estimator (scikit-learn convention), so construction
    # and training can be chained into a single expression.
    return lgb.LGBMClassifier().fit(X_train, y_train)

def load_existing_model(model_url=None):
    """Download and unpickle the pre-trained classifier.

    Args:
        model_url: Location of the pickled model. When omitted, the copy
            hosted in the project's GitHub repository is used.

    Returns:
        Whatever object the pickle contains (the notebook uses it as a
        fitted classifier).
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    if model_url is None:
        model_url = "https://raw.githubusercontent.com/KayleighEarly/MachineLearning/main/notebooks/models/gbm_model_1.pkl"

    # SECURITY: unpickling can execute arbitrary code embedded in the payload;
    # only point this at a source you trust.
    # The context manager closes the HTTP response once the model is read.
    with urllib.request.urlopen(model_url) as response:
        clf = pickle.load(response)

    return clf

# Default to the pre-trained model; train_model() is only called when the
# "Retrain Model" button handler asks for a fresh fit.
clf = load_existing_model()

In [24]:
HTML('''
<style type="text/css">
#wrap {
   width:950px;
   margin:0px;
}
</style>
<div id="wrap">
<table style="width:950px">
    <tr>
        <td colspan="2" style="text-align:center"><h1>Classification Model Training</h1></td>
    </tr>
    <tr>
        <td style="width:500px; padding-top:10px; padding-right:50px;">
            <p>By default, this dashboard is loaded with a pre-trained model hosted on 
            <a href="https://github.com/KayleighEarly/MachineLearning/blob/main/notebooks/models/gbm_model_1.pkl">
            GitHub</a>. To retrain the model, please click the "Retrain Model" button at the end of the section.</p>
            <p>The dataset being used for the model contains the columns (features) explained in the table to the 
            right.</p>
        </td>
        <td>
            <table>
                <tr>
                    <th>Feature</th>
                    <th>Description</th>
                </tr>
                <tr>
                    <td>url</td>
                    <td>the FQDN the data was scraped from</td>
                </tr>
                <tr>
                    <td>difficulty</td>
                    <td>the avg reported difficulty of the trails</td>
                </tr>
                <tr>
                    <td>dist</td>
                    <td>the length of the trail (miles)</td>
                </tr>
                <tr>
                    <td>type</td>
                    <td>the type of trail (loop, point to point, out and back, lollipop)</td>
                </tr>
                <tr>
                    <td>high_elev</td>
                    <td>the highest elevation reached in the trail (ft)</td>
                </tr>
                <tr>
                    <td>low_elev</td>
                    <td>the lowest elevation reached in the trail (ft)</td>
                </tr>
                <tr>
                    <td>elev_gain</td>
                    <td>the total elevation gained across the trail (ft)</td>
                </tr>
                <tr>
                    <td>elev_lost</td>
                    <td>the total elevation lost across the trail (ft)</td>
                </tr>
                <tr>
                    <td>grade_avg</td>
                    <td>the average grade of the trail (degrees)</td>
                </tr>
                <tr>
                    <td>grade_max</td>
                    <td>the maximum grade of the trail (degrees)</td>
                </tr>
                <tr>
                    <td>Cluster Labels</td>
                    <td>the cluster group the trail belongs to (from 0 to 11)</td>
                </tr>
            </table>
        </td>
    </tr>
</table>
</div>
''')

0,1
Classification Model Training,Classification Model Training
"By default, this dashboard is loaded with a pre-trained model hosted on GitHub. To retrain the model, please click the ""Retrain Model"" button at the end of the section.  The dataset being used for the model contains the columns (features) explained in the table to the right.","Feature  Description  url  the FQDN the data was scraped from  difficulty  the avg reported difficult of the trails  dist  the length of the trail (miles)  type  the type of trail (loop, point to point, out and back, lollipop)  high_elev  the highest elevation reached in the trail (ft)  low_elev  the lowest elevation reached in the trail (ft)  elev_gain  the total elevation gained across the trail (ft)  elev_lost  the total elevation lost across the trail (ft)  grade_avg  the average grade of the trail (degrees)  grade_max  the maximum grade of the trail (degrees)  Cluster Labels  the cluster group the trail belongs to (from 0 to 11)"

Feature,Description
url,the FQDN the data was scraped from
difficulty,the avg reported difficult of the trails
dist,the length of the trail (miles)
type,"the type of trail (loop, point to point, out and back, lollipop)"
high_elev,the highest elevation reached in the trail (ft)
low_elev,the lowest elevation reached in the trail (ft)
elev_gain,the total elevation gained across the trail (ft)
elev_lost,the total elevation lost across the trail (ft)
grade_avg,the average grade of the trail (degrees)
grade_max,the maximum grade of the trail (degrees)


In [25]:
HTML('''
<div id="wrap">
    <center><p><h1>Model Accuracy</h1></p></center>
    <p>The model shows a high level of accuracy, over 99% in default accuracy scoring, as well as
    over a 99% ROC AUC score for both One-vs-One and One-vs-Rest scores. Further, the confusion 
    matrix shows a low level of confusion across the model and a classification report shows high 
    overall precision and accuracy across all of the clusters.</p>
</div>
''')

In [26]:
#def check_model_accuracy(clf, X, y, X_train, X_test, y_train, y_test, accuracy_out):
def check_model_accuracy():
    """Print accuracy and ROC AUC scores for the current model into ``accuracy_out``.

    Reads the module-level ``clf`` and the train/test split
    (``X_train``, ``y_train``, ``X_test``, ``y_test``). The Output widget is
    cleared first, so each call replaces the previous report.
    """
    y_prob = clf.predict_proba(X_test)

    # roc_auc_score returns a fraction; scale each score by 100 so that the
    # '%' suffix in the messages below is accurate and matches the accuracy
    # lines, which are already percentages.
    macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo",
                                      average="macro") * 100
    weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo",
                                         average="weighted") * 100
    macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr",
                                      average="macro") * 100
    weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr",
                                         average="weighted") * 100

    with accuracy_out:
        accuracy_out.clear_output()

        # print the default accuracy scores on training and test set
        print('Training Set Accuracy: {:.2f}%'.format(clf.score(X_train, y_train) * 100))
        print('Test Set Accuracy: {:.2f}%'.format(clf.score(X_test, y_test) * 100))
        print('')

        # Taking too long on binder, can be run locally
        # print the cross validation accuracy score
        #print('Cross-Validated Accuracy: {:.2f}%'.format(np.mean(cross_val_score(clf, X, y)) * 100))
        #print('')

        # print ROC/AUC scores
        print("One-vs-One ROC AUC scores:\n{:.4f}% (macro),\n{:.4f}% "
              "(weighted by prevalence)"
              .format(macro_roc_auc_ovo, weighted_roc_auc_ovo))
        print("One-vs-Rest ROC AUC scores:\n{:.4f}% (macro),\n{:.4f}% "
              "(weighted by prevalence)"
              .format(macro_roc_auc_ovr, weighted_roc_auc_ovr))

#def check_classification_report(X_test, y_test, class_report_out):
def check_classification_report():
    """Print the scikit-learn classification report into ``class_report_out``.

    Predictions come from the module-level ``clf`` applied to ``X_test`` and
    are compared with ``y_test``. The Output widget is cleared before the new
    report is printed.
    """
    predicted_labels = clf.predict(X_test)

    with class_report_out:
        class_report_out.clear_output()
        report_text = classification_report(y_test, predicted_labels)
        print(report_text)

#def check_confusion_matrix(clf, X_test, y_test, confusion_out):
def check_confusion_matrix():
    """Draw the test-set confusion matrix as an annotated heatmap in ``confusion_out``.

    Predictions come from the module-level ``clf`` applied to ``X_test`` and
    are compared with ``y_test``. The Output widget is cleared before the new
    figure is shown.
    """
    predicted_labels = clf.predict(X_test)
    matrix = confusion_matrix(y_test, predicted_labels)

    with confusion_out:
        confusion_out.clear_output()

        # One figure holding a single axes for the heatmap
        fig, ax = plt.subplots(figsize=(8, 7))
        sns.heatmap(matrix, annot=True, ax=ax, fmt='g')

        # Axis captions; the x caption and its ticks sit along the bottom edge
        ax.set_xlabel('Predicted', fontsize=20)
        ax.xaxis.set_label_position('bottom')
        ax.xaxis.tick_bottom()
        ax.set_ylabel('True', fontsize=20)
        plt.yticks(rotation=0)
        plt.title('Confusion Matrix', fontsize=20)
        plt.show()
                
# One Output widget per report; the check_* functions defined above look these
# up by name, so they must exist before those functions are called.
accuracy_out = widgets.Output()
class_report_out = widgets.Output()
confusion_out = widgets.Output()

# Put the three outputs into a tab container; titles are set per tab index.
tabs = widgets.Tab(children=[accuracy_out, class_report_out, confusion_out], 
                  layout=Layout(width="950px"))
tabs.set_title(0, 'Accuracy Scores')
tabs.set_title(1, 'Classification Rpt')
tabs.set_title(2, 'Confusion Matrix')

# Show the (still empty) tabs before filling them in.
display(tabs)

# Fill each tab using the model currently bound to `clf`.
check_model_accuracy()
check_classification_report()
check_confusion_matrix()

Tab(children=(Output(), Output(), Output()), layout=Layout(width='950px'), _titles={'0': 'Accuracy Scores', '1…

In [27]:
# shap_values take too long to process on Binder. Uncomment if running locally
#HTML('''
#<div id="wrap">
#    <center><p><h2>Feature Impact on Model</h2></p></center>
#    <p>As shown in the bar graph below, of the 9 features, 8 had an impact on the classification prediction, 
#    with 7 of them having a high impact. This means that the clusters are both distinguishable and informed 
#    by the majority of input.</p>
#</div>
#''')

In [28]:
#def check_feature_impact():
#    feature_out.clear_output()
#    
#    # Clean Up first
#    explainer = shap.TreeExplainer(clf)
#    shap_values = explainer.shap_values(X)
#
#    with feature_out:
#        shap.summary_plot(shap_values, X, plot_size=(12,7))
#    
#feature_out = widgets.Output(layout={'width': '1000px'})
#display(feature_out)
#
#check_feature_impact()

In [29]:
HTML('''
<div id="wrap">
    <center><p><h2>Model Retraining</h2></p></center>
    <p>The classification model can be retrained using the available labeled data by clicking the "Retrain 
    Model" button below. Please be aware that this will take a while to complete, and will refresh 
    all accuracy and feature outputs.</p>
</div>
''')

In [30]:
def retrain_eventhandler(event):
    """Retrain the classifier and refresh every metric panel.

    Progress messages are written to ``retrain_out``. The button is disabled
    while the handler runs and re-enabled afterwards, including when a step
    raises.

    Args:
        event: The widget that triggered the click (unused).
    """
    # The check_* helpers read the module-level ``clf``; without this
    # declaration the assignment below would only bind a local name and the
    # panels would keep reporting on the previously loaded model.
    global clf

    # The feature-impact panel and its refresh function are optional: the
    # cells that define them are commented out, so look them up defensively
    # instead of raising NameError.
    feature_panel = globals().get('feature_out')
    refresh_feature_impact = globals().get('check_feature_impact')

    # disable button and update output to say training
    retrain.disabled = True
    try:
        with retrain_out:
            retrain_out.clear_output()
            print('Model retraining... please wait.')

        # clear existing accuracy stats
        accuracy_out.clear_output()
        class_report_out.clear_output()
        confusion_out.clear_output()
        if feature_panel is not None:
            feature_panel.clear_output()

        # retrain model
        clf = train_model()
        with retrain_out:
            retrain_out.clear_output()
            print('Model trained... updating accuracy metrics.')

        # update outputs
        check_model_accuracy()
        check_confusion_matrix()
        check_classification_report()
        if callable(refresh_feature_impact):
            refresh_feature_impact()

        with retrain_out:
            retrain_out.clear_output()
            print('Retrain complete.')
    finally:
        # Re-enable the button even if training or a refresh step failed.
        retrain.disabled = False
        
        
# Button that triggers a full retrain via retrain_eventhandler.
retrain = widgets.Button(
    description='Retrain Model',
    disabled=False,
    button_style='',
    tooltip='Retrain Classification model',
    icon='check'
)

# Status area the handler prints its progress messages into.
retrain_out = widgets.Output()
retrain.on_click(retrain_eventhandler)

# Last expression of the cell: renders the button and its status side by side.
HBox([retrain, retrain_out])

HBox(children=(Button(description='Retrain Model', icon='check', style=ButtonStyle(), tooltip='Retrain Classif…

In [31]:
HTML('''
<div id="wrap">
    <center><p><h1>Recommendation Engine</h1></p></center>
    <p>This is where the trained model can be used to make recommendations to users. The user can adjust 
    the various inputs to match the type of trail they have enjoyed previously and the model will attempt to
    match the statistics with a cluster and will update the dataframe with related trails. Due to the fact 
    that there are thousands of trails per cluster, the application will limit output to only displaying 
    a maximum of 50 trails.</p>
    <p>The user then has the additional option of filtering the cluster based on required criteria, for example 
    only trails that are loops, or have distance of less than 2 miles. This will refine the trails displayed in 
    the dataframe, again to a maximum of 50 trails.</p>
</div>
''')

In [32]:
# Input controls for the recommendation form. calculate_eventhandler reads the
# `.value` of each control by its module-level name.

# NOTE(review): the option strings are passed to the model as category values;
# confirm they match the spelling used in the training data.
trail_type = widgets.Dropdown(
    options=['Loop', 'Point to Point', 'Out and Back', 'Lollipop', 'Unknown'],
    value='Loop',
)

difficulty = widgets.Dropdown(
    options=['Easy', 'Easy/Intermediate', 'Intermediate', 'Intermediate/Difficult', 'Difficult', 'Very Difficult'],
    value='Easy',
)

# Distance in miles (whole numbers only; the handler casts it to float).
dist = widgets.IntSlider(
    min=-0,  # -0 evaluates to 0
    max=100,
    step=1,
    value=4
)
# NOTE(review): the lowest-elevation slider allows values up to 19700 while the
# highest-elevation slider stops at 16150 — confirm these bounds against the data.
elev_low = widgets.IntSlider(
    min=-1250,
    max=19700,
    step=10,
    value=2740
)
elev_high = widgets.IntSlider(
    min=0,
    max=16150,
    step=10,
    value=3310
)
elev_gain = widgets.IntSlider(
    min=0,
    max=14500,
    step=10,
    value=590
)
elev_lost = widgets.IntSlider(
    min=0,
    max=14500,
    step=10,
    value=430
)
avg_grade = widgets.IntSlider(
    min=0,
    max=14,
    step=1,
    value=3
)
max_grade = widgets.IntSlider(
    min=0,
    max=42,
    step=1,
    value=10
)
# Button whose click handler is attached in a later cell.
calculate = widgets.Button(
    description='Find Trails',
    disabled=False,
    button_style='',
    tooltip='Click me',
    icon='check'
)

# Bordered area that the matching trails are rendered into.
prediction_out = widgets.Output(layout={'width':'950px', 'border': '1px solid black'})


In [35]:
def calculate_eventhandler(event):
    """Predict a cluster from the form inputs and list that cluster's trails.

    Reads the current value of every input widget, builds a one-row feature
    frame, asks the module-level ``clf`` for a cluster label and shows the
    rows of ``df`` carrying that label (indexed by ``url``) in
    ``prediction_out``.

    Args:
        event: The widget that triggered the click (unused).
    """
    # Column order of the frame handed to the model.
    # NOTE(review): assumed to match the training column order — confirm
    # against X.columns.
    feature_order = ['difficulty', 'dist', 'type', 'high_elev', 'low_elev', 'elev_gain',
                     'elev_lost', 'grade_avg', 'grade_max']
    trail_stats = {'difficulty': difficulty.value, 'dist': dist.value, 'type': trail_type.value,
                   'high_elev': elev_high.value, 'low_elev': elev_low.value, 'elev_gain': elev_gain.value,
                   'elev_lost': elev_lost.value, 'grade_avg': avg_grade.value, 'grade_max': max_grade.value}

    # Build the single row directly: DataFrame.append was removed in pandas 2.0.
    X_new = pd.DataFrame([trail_stats], columns=feature_order)

    # Cast every column explicitly. The two text inputs are named rather than
    # discovered via object-dtype inference, so the result does not depend on
    # how pandas infers string columns.
    X_new = X_new.astype({
        'dist': float,
        'high_elev': float,
        'low_elev': float,
        'elev_gain': float,
        'elev_lost': float,
        'grade_avg': int,
        'grade_max': int,
        'difficulty': 'category',
        'type': 'category',
    })

    pred = clf.predict(X_new)

    # Derive the display frame without in-place edits on a slice of ``df``
    # (which would trigger SettingWithCopyWarning).
    # NOTE(review): the dashboard text promises at most 50 trails, but no
    # limit is applied here — confirm the intended behaviour.
    df_cluster = (
        df.loc[df['Cluster Labels'] == pred[0]]
        .drop(columns=['Cluster Labels'])
        .set_index('url')
    )

    with prediction_out:
        prediction_out.clear_output()
        display(qgrid.show_grid(df_cluster))
        
# Run the prediction whenever the "Find Trails" button is clicked.
calculate.on_click(calculate_eventhandler)

In [36]:
# Lay the form out as four columns: captions and controls for the trail
# attributes, then captions and controls for the elevation figures. Each label
# list is ordered to line up with the widget list next to it.
labels1 = VBox([Label('Trail Type'),Label('Trail Difficulty'),Label('Distance (miles)'),Label('Average Grade (degrees)')
                ,Label('Maximum Grade (degrees)')])
widgets1 = VBox([trail_type, difficulty, dist, avg_grade, max_grade])
labels2 = VBox([Label('Lowest Elevation (ft)'),Label('Highest Elevation (ft)'),Label('Elevation Gained (ft)')
                ,Label('Elevation Lost (ft)')])
widgets2 = VBox([elev_low, elev_high, elev_gain, elev_lost])

display(HBox([labels1, widgets1, labels2, widgets2]))

# The button and the results area go underneath the form.
display(calculate)
display(prediction_out)

HBox(children=(VBox(children=(Label(value='Trail Type'), Label(value='Trail Difficulty'), Label(value='Distanc…

Button(description='Find Trails', icon='check', style=ButtonStyle(), tooltip='Click me')

Output(layout=Layout(border='1px solid black', width='950px'), outputs=({'output_type': 'display_data', 'data'…