In [1]:
from __future__ import division
import graphlab
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

In [153]:
df = pd.read_csv(open('data/tripReports.csv','rU'), encoding='utf-8', engine='c')
df = df.drop(labels=['Unnamed: 0', 'Name', 'Text', 'Data','Author', 'TextBlobSentiment'], axis=1)
df.head()

Unnamed: 0,HikeId,AuthorId,Rating
0,0.0,10377.0,4.0
1,0.0,2487.0,4.0
2,0.0,1948.0,2.0
3,0.0,492.0,4.0
4,0.0,12210.0,4.0


In [81]:
df = df.dropna()

In [82]:
df['HikeId'] = df['HikeId'].fillna(np.nan).astype(int)
df['AuthorId'] = df['AuthorId'].fillna(np.nan).astype(int)

In [83]:
df = graphlab.SFrame(df)

## Factorization Recommender

https://turi.com/products/create/docs/generated/graphlab.recommender.factorization_recommender.FactorizationRecommender.html

With ALS

In [94]:
df.shape

(77296, 3)

In [95]:
# Create training and test set
df_train = df[:60000]
df_test = df[60000:]

In [9]:
df_train.shape

(60000, 9)

In [11]:
model = graphlab.recommender.factorization_recommender.create(df_train, user_id='AuthorId', item_id='HikeId', target='Rating', solver='als' )

In [12]:
predicted = model.predict(df_test)



In [13]:
df_test.head()

Unnamed: 0,Name,Text,Data,HikeId,Author,AuthorId
60604,Heliotrope Ridge,"On Saturday (6/21), I hiked up Heliotrope R ...","Jun 21, 2008",899,Guy M,4144
60605,Heliotrope Ridge,We were looking for good views of Mt. Baker ...,"Nov 23, 2007",899,GregWhitePhoto,4092
60606,Heliotrope Ridge,Six of us hiked the Heliotrope Ridge trai ...,"Aug 24, 2007",899,Joel Askey,5523
60607,Heliotrope Ridge,"Road is in fairly good shape, a few deep ...","Aug 05, 2007",899,Billy Sunballs,1290
60608,Heliotrope Ridge,Trail is clear and in good shape. Reports from ...,"Jul 28, 2007",899,KJH,5862
60609,Heliotrope Ridge,"Beautiful day for hiking, first couple of miles ...","Jun 26, 2007",899,Orion Ahrensfeld,8450
60610,Heliotrope Ridge,I did a two-day backpacking trip into ...,"Sep 26, 2006",899,Slugman,10236
60611,Heliotrope Ridge,Started our hike at 7:45 AM. Stream crossings not ...,"Aug 11, 2006",899,Dave V,2690
60612,Heliotrope Ridge,"A shorter hike, but to some of the most insane ...","Aug 06, 2006",899,PT Hiker,8584
60613,Heliotrope Ridge,WTA recommended Heliotrope Ridge earlier ...,"Aug 04, 2006",899,SlowTech,10233

TextBlobSentiment,Rating
0.121367521368,2.0
0.103787878788,2.0
0.154542124542,3.0
0.141666666667,3.0
0.201388888889,4.0
0.0623429232804,1.0
0.44,5.0
-0.0284722222222,1.0
-0.05,1.0
0.159456058846,3.0


In [14]:
from sklearn.metrics import mean_squared_error

In [15]:
print 'MSE: ', mean_squared_error(df_test['Rating'], predicted)
print 'RMSE: ', np.sqrt(mean_squared_error(df_test['Rating'], predicted))

MSE:  2.08433053103
RMSE:  1.44372107106


## Factorization Recommender

https://turi.com/products/create/docs/generated/graphlab.recommender.factorization_recommender.FactorizationRecommender.html

With SGD

In [16]:
model = graphlab.recommender.factorization_recommender.create(df_train, user_id='AuthorId', item_id='HikeId', target='Rating', solver='sgd' )

In [17]:
predicted = model.predict(df_test)

In [18]:
print 'MSE: ', mean_squared_error(df_test['Rating'], predicted)
print 'RMSE: ', np.sqrt(mean_squared_error(df_test['Rating'], predicted))

MSE:  0.410823656208
RMSE:  0.640955268492


In [21]:
model.recommend(users=[2], k=5)

AuthorId,HikeId,score,rank
2,109,3.49901976938,1
2,315,3.38948461964,2
2,7,3.38586916941,3
2,190,3.37125956818,4
2,506,3.36920658837,5


## Visualizing the difference between the two factorization methods
https://turi.com/products/create/docs/generated/graphlab.show_comparison.html#graphlab.show_comparison

## Factorization model incorporating item data


In [96]:
item_data = pd.read_csv('data/itemData.csv')

item_data = item_data.drop(labels=['Unnamed: 0'], axis=1)
item_data.rename(columns={'hike_id':'HikeId'}, inplace=True)
item_data.dropna(inplace=True)


In [173]:
item_data.head()

Unnamed: 0,hike_name,HikeId,numReports,total_dist,elevation gain,time_from_seattle
0,Grand Park via Lake Eleanor,0,0.152186,-0.001062,-0.019968,-0.011472
2,Soda Peaks Lake,2,-0.028583,-0.007853,0.030048,0.02737
3,Shorthorn,3,-0.046853,-0.017361,-0.00925,0.020394
4,West Fork Teanaway River,4,-0.046853,0.074999,0.040766,-0.011746
5,Fishtrap Lake - North,5,-0.047814,-0.028227,-0.054765,0.029537


In [169]:
def norm(df, col):
    df[col] = (df[col] - df[col].mean())/(df[col].max() - df[col].min())

In [170]:
norm(item_data, 'elevation gain')

In [172]:
norm(item_data, 'time_from_seattle')
norm(item_data, 'numReports')
norm(item_data, 'total_dist')

In [183]:
item_data.drop(labels=['hike_name'], axis=1, inplace=True)

In [184]:
itemData = graphlab.SFrame(item_data)

In [185]:
model = graphlab.recommender.factorization_recommender.create(df_train, user_id='AuthorId', item_id='HikeId', target='Rating', solver='sgd', item_data=itemData)

In [159]:
sim = model.get_similar_items(items=[87], k=5)

In [123]:
hike_ids = [f for f in sim['similar']]

In [124]:
def printRecs(recs):
    for h in [f for f in recs['HikeId']]:
        row = item_data[item_data['HikeId']==h]
        print '**', row['hike_name'].values[0], '**'
        print 'Total Distance: ', row['total_dist'].values[0]
        print 'Elevation Gain: ', row['elevation gain'].values[0]
        print 'Driving time from Seattle (minutes): ', row['time_from_seattle'].values[0]
        print '----------------------------------------------------------------'

In [125]:
item_data[item_data['hike_name']=='Mailbox Peak']

Unnamed: 0,hike_name,HikeId,numReports,total_dist,elevation gain,time_from_seattle
87,Mailbox Peak,87,220.0,9.4,4000.0,42.816667


In [193]:
new_instance = pd.DataFrame.from_dict({'HikeId': [87], 'AuthorId': [222222222], 'Rating':[1] } )
sf = graphlab.SFrame(new_instance)
recs = model.recommend(users=[222222222], new_observation_data=sf)
print recs

+-----------+--------+---------------+------+
|  AuthorId | HikeId |     score     | rank |
+-----------+--------+---------------+------+
| 222222222 |  1015  |  101.54740636 |  1   |
| 222222222 |  1543  | 45.8437484228 |  2   |
| 222222222 |  1808  | 40.0291962489 |  3   |
| 222222222 |  926   | 23.0548094973 |  4   |
| 222222222 |  915   | 8.65549886645 |  5   |
| 222222222 |  805   | 8.43030057117 |  6   |
| 222222222 |  1236  | 8.22051243639 |  7   |
| 222222222 |  521   | 8.20443616668 |  8   |
| 222222222 |  994   | 7.88041437223 |  9   |
| 222222222 |  924   | 7.79381720553 |  10  |
+-----------+--------+---------------+------+
[10 rows x 4 columns]



In [187]:
printRecs(recs)

**

KeyError: 'hike_name'