In [1]:
from __future__ import division
import graphlab
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

In [2]:
df = pd.read_csv(open('../data/tripReports.csv','rU'), encoding='utf-8', engine='c')
df = df.drop(labels=['Unnamed: 0', 'Text', 'Date','Creator'], axis=1)
df.head()

Unnamed: 0,Rating,hike_id,hike_name,author_id
0,3.0,0.0,Grand Park via Lake Eleanor,12474.0
1,3.0,0.0,Grand Park via Lake Eleanor,19586.0
2,1.0,0.0,Grand Park via Lake Eleanor,4317.0
3,3.0,0.0,Grand Park via Lake Eleanor,6676.0
4,3.0,0.0,Grand Park via Lake Eleanor,10392.0


In [3]:
df = df.dropna()

In [4]:
df['hike_id'] = df['hike_id'].fillna(np.nan).astype(int)
df['author_id'] = df['author_id'].fillna(np.nan).astype(int)

In [5]:
df = graphlab.SFrame(df)

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1471209371.log
INFO:graphlab.cython.cy_server:GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1471209371.log


This non-commercial license of GraphLab Create for academic use is assigned to jat257@cornell.edu and will expire on June 30, 2017.


## Factorization Recommender

https://turi.com/products/create/docs/generated/graphlab.recommender.factorization_recommender.FactorizationRecommender.html

With ALS

In [6]:
df.shape

(76966, 4)

In [7]:
# Create training and test set
df_train = df[:60000]
df_test = df[60000:]

In [8]:
df_train.shape

(60000, 4)

In [9]:
model = graphlab.recommender.factorization_recommender.create(df_train, user_id='author_id', item_id='hike_id', target='Rating', solver='als' )

In [10]:
predicted = model.predict(df_test)



In [11]:
df_test.head()

Rating,hike_id,hike_name,author_id
5.0,875,Heliotrope Ridge,15685
5.0,875,Heliotrope Ridge,6105
3.0,875,Heliotrope Ridge,6837
5.0,875,Heliotrope Ridge,5218
5.0,875,Heliotrope Ridge,4770
3.0,875,Heliotrope Ridge,15232
3.0,875,Heliotrope Ridge,5774
5.0,875,Heliotrope Ridge,14435
3.0,875,Heliotrope Ridge,1295
1.0,875,Heliotrope Ridge,18756


In [12]:
from sklearn.metrics import mean_squared_error

In [13]:
print 'MSE: ', mean_squared_error(df_test['Rating'], predicted)
print 'RMSE: ', np.sqrt(mean_squared_error(df_test['Rating'], predicted))

MSE:  1.99959502278
RMSE:  1.41407037406


## Factorization Recommender

https://turi.com/products/create/docs/generated/graphlab.recommender.factorization_recommender.FactorizationRecommender.html

With SGD

In [14]:
model = graphlab.recommender.factorization_recommender.create(df_train, user_id='author_id', item_id='hike_id', target='Rating', solver='sgd' )

In [15]:
predicted = model.predict(df_test)

In [16]:
print 'MSE: ', mean_squared_error(df_test['Rating'], predicted)
print 'RMSE: ', np.sqrt(mean_squared_error(df_test['Rating'], predicted))

MSE:  1.83835298532
RMSE:  1.35585876304


In [17]:
model.recommend(users=[2], k=5)

author_id,hike_id,score,rank
2,238,5.25873636212,1
2,539,5.17415454928,2
2,783,5.11422454323,3
2,515,5.09451678991,4
2,803,5.08860352207,5


## Factorization model incorporating item data


In [19]:
item_data = pd.read_csv('../data/itemData.csv')

item_data = item_data.drop(labels=['Unnamed: 0'], axis=1)
item_sf = item_data.dropna()


In [20]:
item_sf.head()

Unnamed: 0,hike_name,hike_id,numReports,total_dist,elevation gain,time_from_seattle,Coast,stars,Dogs allowed on leash,Established campsites,...,Good for kids,Lakes,Mountain views,Old growth,Ridges/passes,Rivers,Summits,Waterfalls,Wildflowers/Meadows,Wildlife
0,Grand Park via Lake Eleanor,0,212.0,8.0,1100.0,116.533333,0,4.25,0,1,...,0,0,1,0,0,0,0,0,0,1
2,Soda Peaks Lake,2,24.0,7.0,2500.0,227.333333,0,3.0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,Shorthorn,3,5.0,5.6,1400.0,207.433333,0,3.67,1,0,...,0,0,1,0,0,0,0,0,0,0
4,West Fork Teanaway River,4,5.0,19.2,2800.0,115.75,0,2.75,1,1,...,1,0,1,0,1,0,0,0,0,1
5,Fishtrap Lake - North,5,4.0,4.0,126.0,233.516667,0,1.5,0,0,...,0,1,0,0,0,0,0,0,0,1


In [21]:
def norm(df, col):
    df[col] = (df[col] - df[col].mean())/(df[col].max() - df[col].min())

In [22]:
norm(item_sf, 'elevation gain')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [23]:
norm(item_sf, 'elevation gain')
norm(item_sf, 'time_from_seattle')
norm(item_sf, 'numReports')
norm(item_sf, 'total_dist')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [24]:
itemData = graphlab.SFrame(item_sf)

In [25]:
model = graphlab.recommender.ranking_factorization_recommender.create(df_train, user_id='author_id', 
                                                                      item_id='hike_id', target='Rating', 
                                                                      item_data=itemData, ranking_regularization=0,
                                                                        regularization=1e-10,
                                                                      linear_regularization=1e-8)

In [26]:
my_recs = model.get_similar_items(items=[13], k=5)

In [27]:
my_recs

hike_id,similar,score,rank
13,2021,0.553878247738,1
13,1422,0.552374899387,2
13,1936,0.533952891827,3
13,575,0.515299618244,4
13,14,0.484716594219,5


## Item Content recommender
https://turi.com/products/create/docs/generated/graphlab.recommender.item_content_recommender.ItemContentRecommender.html

In [29]:
item_content_model = graphlab.recommender.item_content_recommender.create(itemData, 'hike_id')

('Applying transform:\n', Class             : AutoVectorizer

Model Fields
------------
Features          : ['hike_name', 'numReports', 'total_dist', 'elevation gain', 'time_from_seattle', 'Coast', 'stars', 'Dogs allowed on leash', 'Established campsites', 'Fall foliage', 'Good for kids', 'Lakes', 'Mountain views', 'Old growth', 'Ridges/passes', 'Rivers', 'Summits', 'Waterfalls', 'Wildflowers/Meadows', 'Wildlife']
Excluded Features : ['hike_id']

Column                 Type   Interpretation  Transforms                         Output Type
---------------------  -----  --------------  ---------------------------------  -----------
hike_name              str    short_text      3-Character NGram Counts -> TFIDF  dict       
numReports             float  numerical       None                               float      
total_dist             float  numerical       None                               float      
elevation gain         float  numerical       None                               flo


Defaulting to brute force instead of ball tree because there are multiple distance components.




In [30]:
recs = item_content_model.get_similar_items(items=[87])

In [31]:
for i in recs['similar']:
    print item_data[item_data['hike_id']==i]

                 hike_name  hike_id  numReports  total_dist  elevation gain  \
196  Miners Ridge (Entiat)      196         1.0         4.0           850.0   

     time_from_seattle  Coast  stars  Dogs allowed on leash  \
196             181.35      0    3.8                      1   

     Established campsites    ...     Good for kids  Lakes  Mountain views  \
196                      0    ...                 0      0               1   

     Old growth  Ridges/passes  Rivers  Summits  Waterfalls  \
196           0              0       0        0           0   

     Wildflowers/Meadows  Wildlife  
196                    0         0  

[1 rows x 21 columns]
           hike_name  hike_id  numReports  total_dist  elevation gain  \
1309  Anderson Butte     1309        24.0         3.0          1000.0   

      time_from_seattle  Coast  stars  Dogs allowed on leash  \
1309         149.433333      0   3.29                      1   

      Established campsites    ...     Good for kids  Lak