In [9]:
import numpy as np
import pandas as pd
import pickle

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [10]:
df = pd.read_csv('../../data/models/df_featurize.csv')
df.head(3)

Unnamed: 0,trail_id,length,elevation_gain,route_type_out_and_back,route_type_point_to_point,features_ada,features_beach,features_cave,features_city_walk,features_dogs,...,activities_scenic_driving,activities_sea_kayaking,activities_skiing,activities_snowboarding,activities_snowshoeing,activities_surfing,activities_trail_running,activities_walking,activities_whitewater_kayaking,difficulty_class
0,10020048,15610.598,1161.8976,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,hard
1,10236086,6920.162,507.7968,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,moderate
2,10267857,2896.812,81.9912,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,easy


# Train random forest model

In [11]:
def train(trail_id, response, test_size, random_state_split, random_state_model, model_path, x_test_path, y_test_path):
    
    # Train, test, split
    X = df.drop(columns=[response, trail_id], axis=1)
    y = df[response]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state_split)

    # Train the model
    clf = RandomForestClassifier(random_state=random_state_model)
    clf = clf.fit(X_train, y_train)

    # Save the model
    pickle.dump(clf, open(model_path, 'wb'))

    # Save the test features and labels
    np.savetxt(x_test_path, X_test, delimiter=",", fmt='%s')
    np.savetxt(y_test_path, y_test, delimiter=",", fmt='%s')

# Evaluate cross validated accuracy

In [12]:
def evaluate(cv, model_path, x_test_path, y_test_path, scoring, cm_labels, output_path):
    # Load model, x_test, and y_test
    clf = pickle.load(open(model_path, 'rb'))
    X_test = pd.read_csv(x_test_path, sep=',')
    y_test = pd.read_csv(y_test_path, sep=',').values.ravel()

    # Get predictions and probabilities
    ypred_proba_test = clf.predict_proba(X_test)[:, 1]
    ypred_bin_test = clf.predict(X_test)

    # Calculate evaluation metrics
    cv_score = cross_val_score(clf, X_test, y_test, cv=cv, scoring=scoring)
    confusion = confusion_matrix(y_test, ypred_bin_test, labels=cm_labels)

    print('Mean CV score: {:.3f}'.format(cv_score.mean()))
    print('Var CV score  {:.3f}'.format(cv_score.std()))
    print()
    print(pd.DataFrame(confusion,
                       index=cm_labels,
                       columns=cm_labels))

    # Save results to a text file
    f = open(output_path, "w+")
    f.write('Mean CV score: {:.3f}'.format(cv_score.mean()))
    f.write('Var CV score  {:.3f}'.format(cv_score.std()))
    f.write('\n')
    f.write(pd.DataFrame(confusion,
                   index=cm_labels,
                   columns=cm_labels).to_string())
    f.close()

In [30]:
input_vector = np.array(df.drop(columns=['trail_id', 'difficulty_class']).iloc[0]).reshape(1, -1)

In [31]:
model = pickle.load(open('../../models/model.pkl', 'rb'))
model.predict(input_vector)

array(['hard'], dtype=object)

# Train and evaluate model

In [13]:
trail_id = 'trail_id'
response = 'difficulty_class'
test_size = 0.2
random_state_split = 0
random_state_model = 0
model_path = '../../models/model.pkl'
x_test_path = '../../data/models/x_test.csv'
y_test_path = '../../data/models/y_test.csv'

cv = 5
model_path = '../../models/model.pkl'
x_test_path = '../../data/models/x_test.csv'
y_test_path = '../../data/models/y_test.csv'
scoring = 'accuracy'
cm_labels = ['easy', 'moderate', 'hard']
output_path = '../../models/evaluate.txt'

In [14]:
train(trail_id, response, test_size, random_state_split, random_state_model, model_path, x_test_path, y_test_path)
evaluate(cv, model_path, x_test_path, y_test_path, scoring, cm_labels, output_path)

Mean CV score: 0.823
Var CV score  0.026

          easy  moderate  hard
easy       141        28     1
moderate    29       241    25
hard         0        33   164
