In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the cleaned trail table (one row per trail) and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../../data/models/df_cleaned.csv')
df.head(n=3)

Unnamed: 0,trail_id,name,area_name,city_name,state_name,country_name,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units,lat,lng
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,24.8931,15610.598,1161.8976,5,out and back,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i,60.18852,-149.63156
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,18.0311,6920.162,507.7968,3,out and back,1.0,4.5,260,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",i,63.73049,-148.91968
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,17.7821,2896.812,81.9912,1,out and back,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i,60.18879,-149.631


In [3]:
# Read the featurized trail table (numeric columns, 0/1 indicator columns and
# a textual `difficulty_class`) and preview its first rows.
df_featurize = pd.read_csv(filepath_or_buffer='../../data/models/df_featurize.csv')
df_featurize.head(n=3)

Unnamed: 0,trail_id,length,elevation_gain,route_type_out and back,route_type_point to point,features_ada,features_beach,features_cave,features_city_walk,features_dogs,...,activities_scenic_driving,activities_sea_kayaking,activities_skiing,activities_snowboarding,activities_snowshoeing,activities_surfing,activities_trail_running,activities_walking,activities_whitewater_kayaking,difficulty_class
0,10020048,15610.598,1161.8976,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,hard
1,10236086,6920.162,507.7968,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,moderate
2,10267857,2896.812,81.9912,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,easy


# One-hot encode difficulty class

In [4]:
def one_hot_encode_feature(df, column_name):
    """One-hot encode ``column_name`` and every object-dtype column of ``df``.

    The requested column is always encoded, whatever its dtype (object,
    category, integer codes, ...). Any other object-dtype column is encoded
    as well, so the result for a frame whose only non-numeric column is
    ``column_name`` is unchanged from the previous behaviour.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : label or list/tuple of labels
        Column(s) that must be one-hot encoded.

    Returns
    -------
    pandas.DataFrame
        The numeric columns of ``df`` (minus the encoded ones) followed by
        the dummy columns. The first level of each encoded column is dropped
        (``drop_first=True``) to avoid a redundant indicator.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if isinstance(column_name, (list, tuple)):
        requested = list(column_name)
    else:
        requested = [column_name]

    missing = [name for name in requested if name not in df.columns]
    if missing:
        raise KeyError(f"column(s) not found in DataFrame: {missing}")

    # Columns to encode: the requested ones plus every object-dtype column,
    # kept in their original order so the output layout is stable.
    object_names = set(df.select_dtypes(include='object').columns)
    cat_names = [
        name for name in df.columns
        if name in requested or name in object_names
    ]

    # Numeric columns pass through untouched; a requested column that happens
    # to be numeric is encoded instead of being passed through.
    num_names = [
        name for name in df.select_dtypes(include=np.number).columns
        if name not in requested
    ]

    # Listing `columns=` explicitly makes get_dummies encode non-object
    # dtypes too (it would otherwise leave numeric columns as they are).
    enc_columns = pd.get_dummies(df[cat_names], columns=cat_names, drop_first=True)

    # Concatenate encoded columns to numerical columns
    df_enc = pd.concat([df[num_names], enc_columns], axis=1)

    return df_enc

# Convert input to feature vector

In [None]:
# Example user query, expressed with the same vocabulary as the columns of
# `df_featurize` (`features_<name>`, `activities_<name>`, `route_type_<name>`).
length = 1
elevation_gain = 1
# The encoded column is `route_type_out and back` (spaces, not underscores),
# so the value has to be spelled the same way to line up with it.
route_type = 'out and back'
features = ['ada', 'beach']
activities = ['bike_touring', 'birding']

# Cosine similarity

In [5]:
def cosine_sim_matrix(df, trail_id):
    """Return the pairwise cosine-similarity matrix between the rows of ``df``.

    The column(s) named by ``trail_id`` are removed first so the identifier
    does not take part in the comparison. Every remaining column is
    standardized with a ``StandardScaler`` fitted on ``df`` itself, and the
    similarity of each row against every row is returned.
    """
    # The identifier is not a feature: leave it out of the comparison.
    feature_frame = df.drop(columns=trail_id)

    # Standardize all remaining columns (fit and transform on the same data).
    standardized = StandardScaler().fit_transform(feature_frame)

    # Similarity of every row against every row.
    return cosine_similarity(standardized, standardized)

In [8]:
# `column_name` is otherwise only assigned further down, in the
# "Make Recommendations" section, so on a fresh kernel this cell would raise
# NameError. Declare it before first use (same value as below).
column_name = 'difficulty_class'
df_ohe = one_hot_encode_feature(df_featurize, column_name)

In [9]:
# `column_name` and `trail_id` are otherwise only assigned further down, in
# the "Make Recommendations" section, so on a fresh kernel this cell would
# raise NameError. Declare them before first use (same values as below).
column_name = 'difficulty_class'
trail_id = 'trail_id'
df_ohe = one_hot_encode_feature(df_featurize, column_name)
dist = cosine_sim_matrix(df_ohe, trail_id)

In [15]:
# List the column labels of the featurized table (identifier, numeric
# measures, and the route_type_/features_/activities_ indicator columns).
df_featurize.columns

Index(['trail_id', 'length', 'elevation_gain', 'route_type_out and back',
       'route_type_point to point', 'features_ada', 'features_beach',
       'features_cave', 'features_city_walk', 'features_dogs',
       'features_dogs_leash', 'features_dogs_no', 'features_forest',
       'features_historic_site', 'features_hot_springs', 'features_kids',
       'features_lake', 'features_partially_paved', 'features_paved',
       'features_rails_trails', 'features_river', 'features_strollers',
       'features_views', 'features_waterfall', 'features_wild_flowers',
       'features_wildlife', 'activities_backpacking',
       'activities_bike_touring', 'activities_birding', 'activities_camping',
       'activities_canoeing', 'activities_cross_country_skiing',
       'activities_fishing', 'activities_fly_fishing', 'activities_hiking',
       'activities_horseback_riding', 'activities_ice_climbing',
       'activities_mountain_biking', 'activities_nature_trips',
       'activities_off_road_drivin

In [None]:
# Trail names addressed by row position (0..n-1), in the row order of `df`.
# Built from `df` directly: `trail_info` is only created in a later cell, so
# relying on it here would fail when the notebook is run top to bottom.
indices = df['name'].reset_index(drop=True)
  
def recommendations(title, indices, cosine_sim, top_n=10):
    """Return the names of the trails most similar to ``title``.

    Parameters
    ----------
    title : str
        Name of the trail to find neighbours for.
    indices : pandas.Series
        Trail names; the value at position ``i`` is the name of the trail
        described by row ``i`` of ``cosine_sim``.
    cosine_sim : 2-D array-like
        Square similarity matrix; ``cosine_sim[i][j]`` is the similarity
        between trail ``i`` and trail ``j``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` trail names, most similar first. The queried trail
        itself is never part of the result.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``indices``.
    """
    # Row position of the requested trail; the first match wins when a name
    # occurs more than once.
    matches = (indices == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"trail {title!r} not found in indices")
    position = int(matches[0])

    similarity_scores = pd.Series(cosine_sim[position])

    # Remove the query row explicitly rather than assuming it sorts first:
    # another trail with an identical feature vector ties at similarity 1.0.
    # A stable sort keeps tied trails in their original order.
    ranked = (
        similarity_scores
        .drop(position)
        .sort_values(ascending=False, kind='mergesort')
    )
    top_positions = ranked.index[:top_n].tolist()

    # Names come from `indices` itself, so no other global table is needed.
    return indices.iloc[top_positions].tolist()

# Join trail information

In [11]:
# Descriptive columns for each trail, indexed by trail name.
trail_info = (
    df[['name', 'area_name', 'city_name', 'state_name']]
    .set_index('name')
)
trail_info

Unnamed: 0_level_0,area_name,city_name,state_name
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska
Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska
Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska
Horseshoe Lake Trail,Denali National Park,Denali National Park,Alaska
Triple Lakes Trail,Denali National Park,Denali National Park,Alaska
...,...,...,...
Silversword Loop Via Halemau'u Trail,Haleakala National Park,Kula,Maui
Keonehe'ehe'e Trail,Haleakala National Park,Kula,Maui
Red Hill Overlook Summit Trail,Haleakala National Park,Kula,Maui
Kaupo Trail,Haleakala National Park,Kula,Maui


# Make Recommendations

In [7]:
# Column passed to one_hot_encode_feature as the one to one-hot encode
column_name = 'difficulty_class'

# Identifier column that cosine_sim_matrix drops before computing similarity
trail_id = 'trail_id'

In [16]:
# Encode the featurized table, then compare every trail with every other one.
df_ohe = one_hot_encode_feature(df=df_featurize, column_name=column_name)
dist = cosine_sim_matrix(df=df_ohe, trail_id=trail_id)

In [18]:
# Row positions of the five highest similarity scores for row 0, best first
# (in the recorded output row 0 itself comes first).
np.argsort(dist[0])[::-1][:5]

array([   0, 1899,  478,  225,  220])