In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [136]:
hike_data = pd.read_csv('data/clean_hike_data.csv', index_col=0)

In [137]:
hike_data.head()

Unnamed: 0,HikeName,Region,Subregion,TotalDistance,ElevationGain,MaximumElevation,Description,PassNeeded,LatLong,TimeFromSeattle,...,Rivers,Summits,Waterfalls,WildflowersMeadows,Wildlife,Url,Rating,NumVotes,countTripReports,distance
0,Dirty Face Lookout and Peak,Central Cascades,Central Cascades,"9.0 miles, roundtrip",3950,5989,Dirty Face Lookout is one of the earliest high...,Northwest Forest Pass,"47.8379,-120.7976",125.6,...,0,1,0,1,0,https://www.wta.org/go-hiking/hikes/dirtyface-...,3.82,17,217,9.0
1,Perry Creek,North Cascades,North Cascades,"10.5 miles, roundtrip",3400,5250,"Perry Creek proves you can have it all, and yo...",Northwest Forest Pass,"48.0541,-121.4907",93.083333,...,0,1,1,1,0,https://www.wta.org/go-hiking/hikes/perry-creek,4.28,29,617,10.5
2,Hannegan Pass and Peak,North Cascades,North Cascades,"10.4 miles, roundtrip",3100,6200,Begin hiking from the parking area at the end ...,Northwest Forest Pass,"48.9101,-121.5927",163.533333,...,0,1,0,0,1,https://www.wta.org/go-hiking/hikes/hannegan-p...,4.35,23,512,10.4
3,Eruption Trail,South Cascades,South Cascades,"0.5 miles, roundtrip",25,4200,"A barrier-free, paved hike of less than one mi...",National Monument Fee,"46.2765,-122.2165",161.75,...,0,0,0,1,0,https://www.wta.org/go-hiking/hikes/eruption-t...,3.0,5,18,0.5
4,Copper Ridge Loop,North Cascades,North Cascades,"34.0 miles, roundtrip",8600,6260,Experience the variety offered by the North Ca...,Northwest Forest Pass,"48.9102,-121.5917",163.8,...,1,1,0,0,1,https://www.wta.org/go-hiking/hikes/copper-rid...,4.36,11,122,34.0


In [138]:
def norm(df, col):
    """Normalized the give column of the provided dataframe"""
    df[col] = pd.to_numeric(df[col],errors='coerce')
    df[f"{col}_normalized"] = (df[col] - df[col].mean())/(df[col].max() - df[col].min())

In [140]:
norm(hike_data, 'ElevationGain')
norm(hike_data, 'TimeFromSeattle')
norm(hike_data, 'countTripReports')
norm(hike_data, 'distance')

In [156]:

hike_data.replace(to_replace=["None"], value=np.nan, inplace=True)
hike_data.fillna(0, inplace=True)

In [157]:
hike_data[hike_data['HikeName']=="Dirty Harry's Balcony"]

Unnamed: 0,HikeName,Region,Subregion,TotalDistance,ElevationGain,MaximumElevation,Description,PassNeeded,LatLong,TimeFromSeattle,...,Wildlife,Url,Rating,NumVotes,countTripReports,distance,ElevationGain_normalized,TimeFromSeattle_normalized,countTripReports_normalized,distance_normalized
52,Dirty Harry's Balcony,Snoqualmie Region,Snoqualmie Region,"4.4 miles, roundtrip",1300.0,2600,Get a workout and marvel at rock work complete...,Discover Pass,"47.4312,-121.6324",39.866667,...,1,https://www.wta.org/go-hiking/hikes/dirty-harr...,3.47,45,702,4.4,-0.024958,-0.235166,0.252308,-0.002338


In [158]:
hike_data.columns

Index(['HikeName', 'Region', 'Subregion', 'TotalDistance', 'ElevationGain',
       'MaximumElevation', 'Description', 'PassNeeded', 'LatLong',
       'TimeFromSeattle', 'Coast', 'DogsAllowed', 'EstablishedCampsites',
       'FallFoliage', 'GoodForKids', 'Lakes', 'MountainViews', 'OldGrowth',
       'RidgesPasses', 'Rivers', 'Summits', 'Waterfalls', 'WildflowersMeadows',
       'Wildlife', 'Url', 'Rating', 'NumVotes', 'countTripReports', 'distance',
       'ElevationGain_normalized', 'TimeFromSeattle_normalized',
       'countTripReports_normalized', 'distance_normalized'],
      dtype='object')

In [173]:
recommender_cols = ['Coast', 'DogsAllowed', 'EstablishedCampsites',
       'FallFoliage', 'GoodForKids', 'Lakes', 'MountainViews', 'OldGrowth',
       'RidgesPasses', 'Rivers', 'Summits', 'Waterfalls', 'WildflowersMeadows',
       'Wildlife', 
       'ElevationGain_normalized', 'TimeFromSeattle_normalized',
       'countTripReports_normalized', 'distance_normalized']

In [174]:
rec_test_data = hike_data[recommender_cols]

In [175]:
rec_test_data['hike_id'] = rec_test_data.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [176]:
"""DOC strings"""

# import packages
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity



class hikeRecommender(object):

    def __init__(self, hikes, weights=None):
        self.hike_matrix = hikes
        self.feature_weights = weights
        self.hikes_liked = pd.DataFrame(columns=hikes.columns)
        self.hikes_disliked = pd.DataFrame(columns=hikes.columns)
        if not weights:
            self.weights={c:1 for c in hikes.columns}
        else: 
            self.weights=weights
            self.weighted_hike_matrix = self.apply_weights()

    def like_hike(self, hike_id):
        indx = self.hike_matrix[self.hike_matrix['hike_id']==hike_id].index[0]
        self.hikes_liked = self.hikes_liked.append(self.hike_matrix.iloc[indx])
        self.hike_matrix = self.hike_matrix.drop(indx, axis=0).reset_index(drop=True)

    def recommend(self, n=5, apply_weights=True):
        # Calculate similarity to all of the hikes
        # average similarities
        # return top 5
        indx_id = self.hike_matrix['hike_id']
        X = self.hike_matrix.drop('hike_id', axis=1)
        y = self.hikes_liked.drop('hike_id', axis=1)
        cs = cosine_similarity(X, y).mean(axis=1)
        rec_index= np.argsort(cs)[-n:][::-1]
        recommendations = indx_id.iloc[rec_index]
        return recommendations
        
    def apply_weights(self):
        weighted_hike_matrix = pd.DataFrame(columns=self.hike_matrix.columns)
        for col, weight in self.weights.items():
            weighted_hike_matrix[col] = self.hike_matrix[col].apply(lambda x: x*weight) 
        return weighted_hike_matrix
    
    

In [182]:
# import weights
import json
with open('data/weights.json', 'r') as weights_fp:
    weight = json.load(weights_fp)

In [183]:
weight

{'ElevationGain': 0.08088926374096386,
 'TimeFromSeattle': 0.15267880383054017,
 'Coast': 0.002390998999926175,
 'DogsAllowed': 0.0012553190403816,
 'EstablishedCampsites': 0.01015433268815869,
 'FallFoliage': 0.03621965292388979,
 'GoodForKids': 0.0,
 'Lakes': 0.008223032157539938,
 'MountainViews': 0.018512043506946005,
 'OldGrowth': 0.025258605813801176,
 'RidgesPasses': 0.004037668934122245,
 'Rivers': 0.007060788267084692,
 'Summits': 0.010542753598486509,
 'Waterfalls': 0.00019599592911147094,
 'WildflowersMeadows': 0.002537626643713041,
 'Wildlife': 0.017885814526906914,
 'countTripReports': 0.5443232877124791,
 'distance': 0.07783401168594875}

In [213]:
weight = {'ElevationGain_normalized': 0.2088926374096386,
 'TimeFromSeattle_normalized': 0.5267880383054017,
 'Coast': 0.002390998999926175,
 'DogsAllowed': 0.0012553190403816,
 'EstablishedCampsites': 0.01015433268815869,
 'FallFoliage': 0.03621965292388979,
 'GoodForKids': 0.0,
 'Lakes': 0.008223032157539938,
 'MountainViews': 0.018512043506946005,
 'OldGrowth': 0.025258605813801176,
 'RidgesPasses': 0.004037668934122245,
 'Rivers': 0.007060788267084692,
 'Summits': 0.010542753598486509,
 'Waterfalls': 0.00019599592911147094,
 'WildflowersMeadows': 0.002537626643713041,
 'Wildlife': 0.017885814526906914,
 'countTripReports_normalized': 0.5443232877124791,
 'distance_normalized': 0.07783401168594875}


In [242]:
hr=hikeRecommender(rec_test_data, weights=weight)

In [243]:
hike_data[hike_data['HikeName'].str.contains("Mount Townsend")]

Unnamed: 0,HikeName,Region,Subregion,TotalDistance,ElevationGain,MaximumElevation,Description,PassNeeded,LatLong,TimeFromSeattle,...,Wildlife,Url,Rating,NumVotes,countTripReports,distance,ElevationGain_normalized,TimeFromSeattle_normalized,countTripReports_normalized,distance_normalized
1124,Mount Townsend,Olympic Peninsula,Olympic Peninsula,"8.0 miles, roundtrip",3010.0,6260,Is Mount Townsend popular because there are fo...,0,"47.8564,-123.0359",152.233333,...,0,https://www.wta.org/go-hiking/hikes/mount-town...,4.26,53,992,8.0,0.036133,0.024561,0.368262,-0.000837
1125,Mount Townsend - Silver Lakes Traverse,Olympic Peninsula,Olympic Peninsula,"14.0 miles, roundtrip",3200.0,6280,Who doesn’t love a secret passageway? While th...,0,"47.8576,-123.0946",199.966667,...,0,https://www.wta.org/go-hiking/hikes/mount-town...,4.0,4,62,14.0,0.042921,0.134893,-0.00359,0.001663
3707,Mount Townsend Snowshoe,Olympic Peninsula,Olympic Peninsula,"13.5 miles, roundtrip",2500.0,5500,0,0,"47.8559,-123.0360",152.25,...,0,https://www.wta.org/go-hiking/hikes/mount-town...,1.0,2,27,13.5,0.017913,0.0246,-0.017584,0.001454


In [244]:
mailbox = 121
townsend = 1124

In [245]:
hr.like_hike(mailbox)

In [246]:
hr.like_hike(townsend)

In [247]:
recs = hike_data.iloc[list(hr.recommend(10))]

In [248]:
recs[['HikeName', 'Region', 'TotalDistance', 'ElevationGain',
       'TimeFromSeattle', 'countTripReports']].sort_values('countTripReports', ascending=True)

Unnamed: 0,HikeName,Region,TotalDistance,ElevationGain,TimeFromSeattle,countTripReports
1619,Cornell Butte,Eastern Washington,"3.25 miles, roundtrip",600.0,316.616667,1
1738,Summit Springs,South Cascades,"5.6 miles, roundtrip",1738.0,0.0,10
660,West Tiger No. 1 via Dwight's Way,Issaquah Alps,"8.0 miles, roundtrip",2500.0,20.483333,21
1331,Phils Trail - Thrush Gap Loop,Issaquah Alps,"8.2 miles, roundtrip",1606.0,29.0,30
1700,Grassy Knoll,Southwest Washington,"4.4 miles, roundtrip",1048.0,255.066667,33
728,Sunrise Peak,South Cascades,"3.0 miles, roundtrip",1400.0,181.3,42
2447,South Tiger Mountain Loop,Issaquah Alps,"8.6 miles, roundtrip",1550.0,32.533333,60
648,Thorp Mountain via Knox Creek,Snoqualmie Region,"4.4 miles, roundtrip",1734.0,118.283333,99
714,Carne Mountain,Central Cascades,"7.3 miles, roundtrip",3600.0,186.483333,213
406,Granite Mountain,Snoqualmie Region,"8.6 miles, roundtrip",3800.0,46.433333,1818
