In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [7]:
item_data = pd.read_csv('data/itemData.csv')
item_data = item_data.drop(labels=['Unnamed: 0', 'hike_name'], axis=1)

In [8]:
item_data.head()

Unnamed: 0,hike_id,numReports,total_dist,elevation gain,time_from_seattle,Coast,stars,Dogs allowed on leash,Established campsites,Fall foliage,Good for kids,Lakes,Mountain views,Old growth,Ridges/passes,Rivers,Summits,Waterfalls,Wildflowers/Meadows,Wildlife
0,0,212.0,8.0,1100.0,116.533333,0,4.25,0,1,0,0,0,1,0,0,0,0,0,0,1
1,1,2.0,,,,0,2.33,1,0,0,0,0,0,0,0,0,0,0,0,0
2,2,24.0,7.0,2500.0,227.333333,0,3.0,0,0,0,0,1,0,1,0,0,0,0,0,0
3,3,5.0,5.6,1400.0,207.433333,0,3.67,1,0,0,0,0,1,0,0,0,0,0,0,0
4,4,5.0,19.2,2800.0,115.75,0,2.75,1,1,0,1,0,1,0,1,0,0,0,0,1


In [10]:
def norm(df, col):
    """Normalized the give column of the provided dataframe"""
    df[col] = (df[col] - df[col].mean())/(df[col].max() - df[col].min())

In [11]:
norm(item_data, 'elevation gain')
norm(item_data, 'time_from_seattle')
norm(item_data, 'numReports')
norm(item_data, 'total_dist')

In [12]:
item_data.dropna(inplace=True)

In [25]:
hikes_liked = pd.DataFrame(item_data.iloc[65]).transpose()

In [26]:
hikes_liked

Unnamed: 0,hike_id,numReports,total_dist,elevation gain,time_from_seattle,Coast,stars,Dogs allowed on leash,Established campsites,Fall foliage,Good for kids,Lakes,Mountain views,Old growth,Ridges/passes,Rivers,Summits,Waterfalls,Wildflowers/Meadows,Wildlife
73,73.0,-0.015021,-0.095594,-0.044155,0.030779,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
hikes_liked = hikes_liked.append(item_data.iloc[87])

In [28]:
hikes_liked

Unnamed: 0,hike_id,numReports,total_dist,elevation gain,time_from_seattle,Coast,stars,Dogs allowed on leash,Established campsites,Fall foliage,Good for kids,Lakes,Mountain views,Old growth,Ridges/passes,Rivers,Summits,Waterfalls,Wildflowers/Meadows,Wildlife
73,73.0,-0.015021,-0.095594,-0.044155,0.030779,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
99,99.0,-0.025598,0.035247,0.397317,-0.015716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [136]:
hikes_liked.columns

Index([u'hike_id', u'numReports', u'total_dist', u'elevation gain',
       u'time_from_seattle', u'Coast', u'stars', u'Dogs allowed on leash',
       u'Established campsites', u'Fall foliage', u'Good for kids', u'Lakes',
       u'Mountain views', u'Old growth', u'Ridges/passes', u'Rivers',
       u'Summits', u'Waterfalls', u'Wildflowers/Meadows', u'Wildlife'],
      dtype='object')

In [29]:
"""DOC strings"""

# import packages
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity



class hikeRecommender(object):

    def __init__(self, hikes, weights=None):
        self.hike_matrix = hikes
        self.feature_weights = weights
        self.hikes_liked = []
        self.hikes_disliked = []

    def like_hike(self, hike_id):
        indx = self.hike_matrix[self.hike_matrix['hike_id']==hike_id].index[0]
        if len(self.hikes_liked)==0:
            self.hikes_liked = pd.DataFrame(self.hike_matrix.ix[indx]).transpose()
            self.hike_matrix = self.hike_matrix.drop(indx, axis=0).reset_index(drop=True)
        else:
            self.hikes_liked = self.hikes_liked.append(self.hike_matrix.ix[indx])
            self.hike_matrix = self.hike_matrix.drop(indx, axis=0).reset_index(drop=True)

    def recommend(self, n=5):
        # Calculate similarity to all of the hikes
        # average similarities
        # return top 5
        indx_id = self.hike_matrix['hike_id']
        X = self.hike_matrix.drop('hike_id', axis=1)
        y = self.hikes_liked.drop('hike_id', axis=1)
        cs = cosine_similarity(X, y).mean(axis=1)
        rec_index= np.argsort(cs)[-n:][::-1]
        recommendations = indx_id.ix[rec_index]
        return recommendations


In [30]:
hikeRecommender(item_data)

<__main__.hikeRecommender at 0x119571518>

In [31]:
hr=hikeRecommender(item_data)

In [32]:
hr.likeHike(65)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [33]:
hr.likeHike(4)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [135]:
hr.recommend(10)

962     1215
811     1001
1044    1332
371      421
960     1213
1237    1774
1187    1609
479      552
316      359
1031    1311
Name: hike_id, dtype: int64