In [10]:
from __future__ import division
import graphlab
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

In [11]:
df = pd.read_csv(open('data/tripReports.csv','rU'), encoding='utf-8', engine='c')
df = df.drop(labels=['Unnamed: 0', 'Name', 'Text', 'Date','Creator', 'TextBlobSentiment', 'GraphLabSentiment', 'TrainedModelSentiment'], axis=1)
df.head()

Unnamed: 0,author_id,hike_id,Rating
0,12474.0,0.0,2.0
1,19583.0,0.0,3.0
2,4317.0,0.0,1.0
3,6676.0,0.0,4.0
4,10392.0,0.0,4.0


In [12]:
df = df.dropna()

In [13]:
df['hike_id'] = df['hike_id'].fillna(np.nan).astype(int)
df['author_id'] = df['author_id'].fillna(np.nan).astype(int)

In [14]:
df = graphlab.SFrame(df)

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1470253318.log
INFO:graphlab.cython.cy_server:GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1470253318.log


This non-commercial license of GraphLab Create for academic use is assigned to jat257@cornell.edu and will expire on June 30, 2017.


## Factorization Recommender

https://turi.com/products/create/docs/generated/graphlab.recommender.factorization_recommender.FactorizationRecommender.html

With ALS

In [15]:
df.shape

(76962, 3)

In [16]:
# Create training and test set
df_train = df[:60000]
df_test = df[60000:]

In [17]:
df_train.shape

(60000, 3)

In [18]:
model = graphlab.recommender.factorization_recommender.create(df_train, user_id='author_id', item_id='hike_id', target='Rating', solver='als' )

In [19]:
predicted = model.predict(df_test)



In [20]:
df_test.head()

author_id,hike_id,Rating
15682,875,5.0
6105,875,4.0
6837,875,4.0
5218,875,5.0
4770,875,5.0
15229,875,4.0
5774,875,2.0
14435,875,5.0
1295,875,3.0
18753,875,1.0


In [21]:
from sklearn.metrics import mean_squared_error

In [22]:
print 'MSE: ', mean_squared_error(df_test['Rating'], predicted)
print 'RMSE: ', np.sqrt(mean_squared_error(df_test['Rating'], predicted))

MSE:  2.00984223972
RMSE:  1.41768904902


## Factorization Recommender

https://turi.com/products/create/docs/generated/graphlab.recommender.factorization_recommender.FactorizationRecommender.html

With SGD

In [23]:
model = graphlab.recommender.factorization_recommender.create(df_train, user_id='author_id', item_id='hike_id', target='Rating', solver='sgd' )

In [24]:
predicted = model.predict(df_test)

In [25]:
print 'MSE: ', mean_squared_error(df_test['Rating'], predicted)
print 'RMSE: ', np.sqrt(mean_squared_error(df_test['Rating'], predicted))

MSE:  1.84434131097
RMSE:  1.35806528229


In [26]:
model.recommend(users=[2], k=5)

author_id,hike_id,score,rank
2,297,6.19237823283,1
2,164,6.05389792716,2
2,802,6.0152325133,3
2,804,5.92787809168,4
2,329,5.90850181376,5


## Visualizing the difference between the two factorization methods
https://turi.com/products/create/docs/generated/graphlab.show_comparison.html#graphlab.show_comparison

## Factorization model incorporating item data


In [27]:
item_data = pd.read_csv('data/itemData.csv')

item_data = item_data.drop(labels=['Unnamed: 0'], axis=1)
item_sf = item_data.dropna()


In [28]:
item_sf.head()

Unnamed: 0,hike_name,hike_id,numReports,total_dist,elevation gain,time_from_seattle,Wildlife,Rivers,Summits,Good for kids
0,Grand Park via Lake Eleanor,0,212.0,8.0,1100.0,116.533333,1,0,0,0
2,Soda Peaks Lake,2,24.0,7.0,2500.0,227.333333,0,0,0,0
3,Shorthorn,3,5.0,5.6,1400.0,207.433333,0,0,0,0
4,West Fork Teanaway River,4,5.0,19.2,2800.0,115.75,1,0,0,1
5,Fishtrap Lake - North,5,4.0,4.0,126.0,233.516667,1,0,0,0


In [29]:
def norm(df, col):
    df[col] = (df[col] - df[col].mean())/(df[col].max() - df[col].min())

In [30]:
norm(item_sf, 'elevation gain')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [31]:
norm(item_sf, 'elevation gain')
norm(item_sf, 'time_from_seattle')
norm(item_sf, 'numReports')
norm(item_sf, 'total_dist')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [32]:
itemData = graphlab.SFrame(item_sf)

In [33]:
model = graphlab.recommender.ranking_factorization_recommender.create(df_train, user_id='author_id', 
                                                                      item_id='hike_id', target='Rating', 
                                                                      item_data=itemData, ranking_regularization=0,
                                                                        regularization=1e-10,
                                                                      linear_regularization=1e-8)

In [52]:
my_recs = model.get_similar_items(items=[13], k=5)

In [53]:
my_recs

hike_id,similar,score,rank
13,14,0.626574397087,1
13,1394,0.464548319578,2
13,310,0.451806217432,3
13,368,0.444441497326,4
13,1294,0.435032874346,5


In [40]:
my_recs=list(sim['similar'])

In [41]:
from pymongo import MongoClient
client = MongoClient()
db = client['wta']
hikes = db['hikes']

In [54]:
recs = db.hikes.find({"hike_id": {"$in": list(my_recs['similar'])}})

In [62]:
recs = db.hikes.find({"hike_id": {"$in": list(my_recs['similar'])}})

In [66]:
for r in recs:
    print r.elevation

AttributeError: 'dict' object has no attribute 'elevation'

In [67]:
my_recs = model.recommend(users=[13],  k=5)

In [70]:
recs = db.hikes.find({"hike_id": {"$in": list(my_recs['hike_id'])}})

In [73]:
for r in recs:
    print r['name']

McCormick Forest Park
Mount Spokane State Park - Mount Kit Carson Loop Road
Burnt Bridge Creek - Eastern Section


In [154]:
def printRecs(recs):
    for h in [f for f in recs['hike_id']]:
        row = item_data[item_data['hike_id']==h]
        print '**', row['hike_name'].values[0], '**'
        print 'Total Distance: ', row['total_dist'].values[0]
        print 'Elevation Gain: ', row['elevation gain'].values[0]
        print 'Driving time from Seattle (minutes): ', row['time_from_seattle'].values[0]
        print '----------------------------------------------------------------'

In [155]:
item_data[item_data['hike_name']=='Mailbox Peak']

Unnamed: 0,hike_name,hike_id,numReports,total_dist,elevation gain,time_from_seattle,Wildlife,Rivers,Summits,Good for kids
83,Mailbox Peak,83,220.0,9.4,4000.0,42.816667,0,0,1,0


In [156]:
new_instance = pd.DataFrame.from_dict({'hike_id': [87], 'author_id': [2222], 'Rating':[5] } )
sf = graphlab.SFrame(new_instance)
recs = model.recommend(users=[2222], new_observation_data=sf)
print recs

+-----------+---------+---------------+------+
| author_id | hike_id |     score     | rank |
+-----------+---------+---------------+------+
|    2222   |   238   | 4.97114866371 |  1   |
|    2222   |   525   | 4.91055590831 |  2   |
|    2222   |   642   | 4.88502812808 |  3   |
|    2222   |   710   | 4.51497164788 |  4   |
|    2222   |   499   | 4.51125926319 |  5   |
|    2222   |   515   | 4.49509553859 |  6   |
|    2222   |   242   | 4.47599709247 |  7   |
|    2222   |   121   | 4.47233295856 |  8   |
|    2222   |   295   | 4.43095226274 |  9   |
|    2222   |   643   | 4.40443453403 |  10  |
+-----------+---------+---------------+------+
[10 rows x 4 columns]



In [157]:
printRecs(recs)

** Lincoln Park **
Total Distance:  1.85
Elevation Gain:  160.0
Driving time from Seattle (minutes):  18.9833333333
----------------------------------------------------------------
** Dosewallips State Park - Steam Donkey Trail **
Total Distance:  3.5
Elevation Gain:  400.0
Driving time from Seattle (minutes):  125.5
----------------------------------------------------------------
** Jungle Hill Loop **
Total Distance:  12.0
Elevation Gain:  2400.0
Driving time from Seattle (minutes):  331.316666667
----------------------------------------------------------------
** Damfino Lakes - Excelsior Peak **
Total Distance:  5.6
Elevation Gain:  1500.0
Driving time from Seattle (minutes):  187.716666667
----------------------------------------------------------------
** Big Rock **
Total Distance:  2.5
Elevation Gain:  600.0
Driving time from Seattle (minutes):  270.4
----------------------------------------------------------------
** Watershed Park **
Total Distance:  1.36
Elevation Gain:  150

In [160]:
recs = model.recommend(users=[670000000])

In [161]:
printRecs(recs)

** Lincoln Park **
Total Distance:  1.85
Elevation Gain:  160.0
Driving time from Seattle (minutes):  18.9833333333
----------------------------------------------------------------
** Jungle Hill Loop **
Total Distance:  12.0
Elevation Gain:  2400.0
Driving time from Seattle (minutes):  331.316666667
----------------------------------------------------------------
** Watershed Park **
Total Distance:  1.36
Elevation Gain:  150.0
Driving time from Seattle (minutes):  63.0166666667
----------------------------------------------------------------
** Rowena Plateau and Tom McCall Point **
Total Distance:  3.4
Elevation Gain:  1200.0
Driving time from Seattle (minutes):  229.583333333
----------------------------------------------------------------
** Big Rock **
Total Distance:  2.5
Elevation Gain:  600.0
Driving time from Seattle (minutes):  270.4
----------------------------------------------------------------
** McCormick Forest Park **
Total Distance:  3.0
Elevation Gain:  200.0
Drivin

## Item Content recommender
https://turi.com/products/create/docs/generated/graphlab.recommender.item_content_recommender.ItemContentRecommender.html

In [8]:
item_content_model = graphlab.recommender.item_content_recommender.create(itemData, 'hike_id')

NameError: name 'itemData' is not defined

In [166]:
recs = item_content_model.get_similar_items(items=[87])

In [169]:
for i in recs['similar']:
    print item_data[item_data['hike_id']==i]

      hike_name  hike_id  numReports  total_dist  elevation gain  \
424  Mount Zion      424       120.0         4.6          1300.0   

     time_from_seattle  Wildlife  Rivers  Summits  Good for kids  
424              141.3         0       0        0              0  
         hike_name  hike_id  numReports  total_dist  elevation gain  \
1521  Red Mountain     1521        96.0         2.0           700.0   

      time_from_seattle  Wildlife  Rivers  Summits  Good for kids  
1521         142.266667         0       0        0              0  
          hike_name  hike_id  numReports  total_dist  elevation gain  \
846  Table Mountain      846        81.0         2.6           725.0   

     time_from_seattle  Wildlife  Rivers  Summits  Good for kids  
846              171.7         0       0        0              0  
            hike_name  hike_id  numReports  total_dist  elevation gain  \
2093  Tongue Mountain     2093        19.0         3.0          1200.0   

      time_from_seattl