In [132]:
import numpy as np
import pandas as pd
import pickle

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [116]:
df = pd.read_csv('../../data/models/df_cleaned.csv')
df.head(3)

Unnamed: 0,trail_id,name,area_name,city_name,state_name,country_name,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units,lat,lng
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,24.8931,15610.598,1161.8976,5,out and back,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i,60.18852,-149.63156
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,18.0311,6920.162,507.7968,3,out and back,1.0,4.5,260,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",i,63.73049,-148.91968
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,17.7821,2896.812,81.9912,1,out and back,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i,60.18879,-149.631


In [117]:
df_featurize = pd.read_csv('../../data/models/df_featurize.csv')
df_featurize.head(3)

Unnamed: 0,trail_id,length,elevation_gain,route_type_out_and_back,route_type_point_to_point,features_ada,features_beach,features_cave,features_city_walk,features_dogs,...,activities_scenic_driving,activities_sea_kayaking,activities_skiing,activities_snowboarding,activities_snowshoeing,activities_surfing,activities_trail_running,activities_walking,activities_whitewater_kayaking,difficulty_class
0,10020048,15610.598,1161.8976,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,hard
1,10236086,6920.162,507.7968,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,moderate
2,10267857,2896.812,81.9912,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,easy


In [118]:
with open('../../data/models/features.txt') as f:
    full_features_ls = [line.rstrip() for line in f]

In [119]:
with open('../../data/models/activities.txt') as f:
    full_activities_ls = [line.rstrip() for line in f]

# One hot encode difficulty class

In [120]:
# def one_hot_encode_feature(df, column_name):
    
#     # Get column names of categorical and numerical variables
#     cat_names = df.select_dtypes(include='object').columns
#     num_names = df.select_dtypes(include=np.number).columns

#     # Encode categorical variables
#     enc_columns = pd.get_dummies(df[cat_names], drop_first = True)

#     # Concatenate encoded columns to numerical columns, and tag features
#     df_enc = pd.concat([df[num_names], enc_columns], axis=1)
    
#     return df_enc

# Convert input to feature vector

In [122]:
def input_tag_ohe(feature_name, full_tag_ls, tag_input):
    # feature_name: name of feature to append to column
    # full_tag_ls: entire list of unique tags
    # tag_input: list of tag inputs
    
    split_series = pd.Series([full_tag_ls, tag_input])
    df_split = split_series.str.join('|').str.get_dummies()
    df_split.columns = feature_name + '_' + df_split.columns
    tag_ohe = df_split.iloc[1,:]
    
    return tag_ohe

In [126]:
def column_ohe(df, column_name, col_input):
    # returns series
    full_value_ls = list(df[column_name].unique())
    full_value_ls = [i.replace(' ', '_') for i in full_value_ls]
    split_series = pd.Series([full_value_ls, [col_input]])
    df_split = split_series.str.join('|').str.get_dummies()
    df_split.columns = column_name + '_' + df_split.columns

    # Drop column for first unique value
    ohe_res = df_split.iloc[1, 1:]
    return ohe_res

In [252]:
def create_input_features(features_name, full_features_ls, features_input,
                          activities_name, full_activities_ls, activities_input,
                          df, route_type, route_type_input,
                          length_input,
                          elevation_gain_input):
    # returns pandas.core.series.Series
    
    # One hot encode tags and categorical variable
    features_ohe = input_tag_ohe(features_name, full_features_ls, features_input)
    activities_ohe = input_tag_ohe(activities_name, full_activities_ls, activities_input)
    route_type_ohe = column_ohe(df, route_type, route_type_input)
    
    # Numerical inputs
    num_features = pd.Series({'length': length_input, 'elevation_gain': elevation_gain_input})
    
    # Create input vector
    input_vector = pd.concat([num_features, route_type_ohe, features_ohe, activities_ohe])
    
    return input_vector

In [257]:
def predict_difficulty(input_vector, model_path):
    # Make prediction
    input_arr = np.array(input_vector).reshape(1, -1)
    model = pickle.load(open(model_path, 'rb'))
    #pred = pd.Series({response: model.predict(input_arr)[0]})
    pred = model.predict(input_arr)[0]
    
    return pred

# Cosine similarity

In [273]:
def scaled_cosine_sim(df):
    
    # Standard scaling for numerical data
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    # Calculate distance
    cs = cosine_similarity(df_scaled)

    return cs

# Make Recommendations

In [305]:
def recommend_trails(n, df_features, df, trail_id, response, input_vector, input_matrix, feature_list):
    # df_features: one hot encoded
    # df: includes trail info
    
    # Set trail_id to be the index
    input_matrix = df_featurize.set_index(trail_id)
    input_matrix = input_matrix.drop(columns=[response])

    # Save trail ids to reference for recommendation
    input_matrix_ind = input_matrix.index

    # Insert new input in first row, use this to calculate similarity
    input_dist = pd.concat([pd.DataFrame(input_vector).transpose(), input_matrix])

    # Calculate distance
    cs = scaled_cosine_sim(input_dist)

    # Get similarities for the new input
    sim_vec = cs[0,:]

    # Get the n largest similarities
    sim_ind = sim_vec.argsort()[-(n + 1):][::-1][1:]

    # Get the trail id using the index
    most_sim_id = list(input_matrix_ind[sim_ind])
    most_sim_df = df[df[trail_id].isin(most_sim_id)].reset_index(drop=True)

    # Show a subset of columns
    res = most_sim_df[feature_list]
    
    return res

# Test an input

In [306]:
features_name = 'features'
full_features_ls = full_features_ls
activities_name = 'activities'
full_activities_ls = full_activities_ls

features_input = ['ada', 'beach', 'dogs', 'wild_flowers']
activities_input = ['bike_touring', 'birding', 'walking', 'rock_climbing']
df = df
route_type = 'route_type'
route_type_input = 'point_to_point'
length_input = 5000
elevation_gain_input = 20000

# Create user input vector
features_input = ['beach']
activities_input = ['fishing']
user_input = create_input_features(features_name, full_features_ls, features_input,
                                      activities_name, full_activities_ls, activities_input,
                                      df, route_type, route_type_input,
                                      length_input,
                                      elevation_gain_input)

# Predict difficulty
input_pred = predict_difficulty(user_input, '../../models/model.pkl')
print(input_pred)

# Recommend trails
feature_list = ['name', 'area_name', 'city_name', 'state_name', 'length', 'elevation_gain', 'route_type', 'features', 'activities']
recommend_trails(5, df_featurize, df, 'trail_id', 'difficulty_class', user_input, df_featurize, feature_list)

hard


Unnamed: 0,name,area_name,city_name,state_name,length,elevation_gain,route_type,features,activities
0,Lost Horse Mine Loop Trail,Joshua Tree National Park,Twentynine Palms,California,11104.446,277.9776,loop,"['dogs-no', 'views', 'wild-flowers', 'wildlife']","['camping', 'hiking', 'nature-trips', 'trail-r..."
1,Crest View Trail,Joshua Tree National Park,Joshua Tree,California,4667.086,52.7304,out and back,"['forest', 'views']","['birding', 'hiking', 'nature-trips', 'walking']"
2,McCabe Lakes Trail,Yosemite National Park,Bridgeport,California,47958.332,1737.9696,out and back,"['dogs-no', 'forest', 'lake', 'views', 'wild-f...","['backpacking', 'birding', 'camping', 'hiking'..."
3,Porcupine Wash Ruby Lee Mill Site Trail,Joshua Tree National Park,Blythe,California,12230.984,238.9632,out and back,"['views', 'wild-flowers']","['hiking', 'nature-trips']"
4,Swiftcurrent Mountain via Granite Park Trail,Glacier National Park,West Glacier,Montana,19473.014,1291.7424,out and back,"['forest', 'views', 'wild-flowers', 'wildlife']","['birding', 'hiking', 'nature-trips']"


In [172]:
feature_name = 'features'
full_tag_ls = full_features_ls
tag_input = features_input
features_ohe = input_tag_ohe(feature_name, full_tag_ls, tag_input)

feature_name = 'activities'
full_tag_ls = full_activities_ls
tag_input = activities_input
activities_ohe = input_tag_ohe(feature_name, full_tag_ls, tag_input)

df = df
column_name = 'route_type'
col_input = route_type_input
route_type_ohe = column_ohe(df, column_name, col_input)


num_features = pd.Series({'length': length_input, 'elevation_gain': elevation_gain_input})

# Create input vector
input_vector = pd.concat([num_features, route_type_ohe, features_ohe, activities_ohe])

# Make prediction
input_arr = np.array(input_vector).reshape(1, -1)
model = pickle.load(open('../../models/model.pkl', 'rb'))
pred = pd.Series({'difficulty_class': model.predict(input_arr)[0]})


In [223]:
# Set trail_id to be the index
input_matrix = df_featurize.set_index('trail_id')
input_matrix = input_matrix.drop(columns=['difficulty_class'])

# Save trail ids to reference for recommendation
input_matrix_ind = input_matrix.index

# Insert new input in first row, use this to calculate similarity
input_dist = pd.concat([pd.DataFrame(input_vector).transpose(), input_matrix])

# Calculate distance
cs = scaled_cosine_sim(df_scaled)

# Get similarities for the new input
sim_vec = cs[0,:]

# Get the n largest similarities
n = 5
sim_ind = sim_vec.argsort()[-(n + 1):][::-1][1:]

# Get the trail id using the index
most_sim_id = list(input_matrix_ind[sim_ind])
most_sim_df = df[df['trail_id'].isin(most_sim_id)].reset_index(drop=True)

# Show a subset of columns
most_sim_df[['name', 'area_name', 'city_name', 'state_name', 'length', 'elevation_gain', 'route_type', 'features', 'activities']]

Unnamed: 0,name,area_name,city_name,state_name,length,elevation_gain,route_type,features,activities
0,Wellington Ditch Trail,Great Sand Dunes National Park and Preserve,Mosca,Colorado,2896.812,47.8536,out and back,"['dogs-leash', 'kids', 'views', 'wild-flowers'...","['birding', 'hiking', 'mountain-biking', 'natu..."
1,Biscayne National Park Walk,Biscayne National Park,Miami,Florida,15932.466,0.9144,out and back,"['beach', 'dogs-no', 'kids', 'paved', 'views',...","['birding', 'fishing', 'nature-trips', 'paddle..."
2,Finley Cane Trail,Great Smoky Mountains National Park,Townsend,Tennessee,8368.568,190.8048,out and back,"['dogs-no', 'forest', 'kids', 'views', 'wildli...","['birding', 'hiking', 'nature-trips', 'trail-r..."
3,Cascade Pass Trail,North Cascades National Park,Rockport,Washington,10460.71,542.8488,out and back,"['dogs-no', 'forest', 'views', 'wildlife']","['birding', 'hiking']"
4,Goat Island Mountain Trail,Mount Rainier National Park,Paradise Inn,Washington,14001.258,1178.9664,loop,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'hiking', 'nature-trips', 'rock-cl..."
