In [5]:
import pandas as pd
import numpy as np
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    IntegerType, StringType, IntegerType, FloatType, 
    StructField, StructType, DoubleType
)
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import matplotlib.pyplot as plt
import pickle

from ALS import ALSRecommender

### Import the main restaurant dataframe

In [51]:
# Load the pickled ratings table; the recorded output below shows 5216 rows x 4 columns.
# NOTE(review): pd.read_pickle can execute arbitrary code -- only load pickles produced by this project.
als_df = pd.read_pickle('als_df.pkl')
als_df.shape

(5216, 4)

# Get user_factor and item_factor databases
### SparkSession


In [44]:
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Spark copy of the ratings table; this is the frame the ALS model is fit on.
spark_als_df = spark.createDataFrame(als_df)
# Round-trip back to pandas; this frame is narrowed to three columns in a later cell.
pandas_als_df = spark_als_df.toPandas()
# Preview the three columns ALS reads (show() prints the first 20 rows).
spark_als_df.select('user_id', 'item_id', 'rating').show()

+-------+-------+------+
|user_id|item_id|rating|
+-------+-------+------+
|   1520|    596|   3.0|
|   1520|    592|   4.0|
|   1369|    480|   4.0|
|   1369|    601|   5.0|
|   1369|    488|   5.0|
|   1470|    544|   4.0|
|   1499|    615|   3.0|
|    293|    530|   4.0|
|   1369|    370|   5.0|
|    293|    479|   2.0|
|   1523|    613|   3.0|
|   1499|    538|   1.0|
|   1048|    202|   5.0|
|   1523|    620|   3.0|
|   1470|    601|   3.0|
|   1048|    584|   3.0|
|   1048|    483|   4.0|
|   1469|    486|   4.0|
|   1501|    544|   3.0|
|   1465|    525|   4.0|
+-------+-------+------+
only showing top 20 rows



In [53]:
# Keep only the three columns the ALS model is configured to use (user, item, rating).
pandas_als_df = pandas_als_df[['user_id', 'item_id', 'rating']]

### Train an ALS Model

In [9]:
# Configure the ALS estimator on the (user_id, item_id, rating) columns.
als_model = ALS(
    userCol='user_id',
    itemCol='item_id',
    ratingCol='rating',
    rank=10,                   # number of latent factors per user / item
    regParam=0.1,              # regularization strength
    nonnegative=True,          # constrain the learned factors to be >= 0
    coldStartStrategy='drop',  # drop rows whose prediction would be NaN
    seed=42,                   # fixed seed so a re-run starts from the same random initialisation
)

In [10]:
# Fit the ALS estimator on the Spark ratings frame; `recommender` is the fitted
# model whose userFactors / itemFactors are read in the following cells.
recommender = als_model.fit(spark_als_df)

In [40]:
# Pull the learned factor tables onto the driver as plain Python lists of rows.
# NOTE(review): neither list is referenced again in the cells visible here -- confirm they are still needed.
user_factors = recommender.userFactors.collect()
item_factors = recommender.itemFactors.collect()

In [92]:
#Create user factor and item factor databases
uf_df = recommender.userFactors.toPandas()
if_df = recommender.itemFactors.toPandas()
if_df.to_pickle('item_factors_df.pkl')

In [54]:
#Need to create the inverse_alias_dictionary to translate rest_ids to rest_names
more_than_one_review = pd.read_pickle('more_than_one_review_df.pkl')
aliases = more_than_one_review['alias'].unique()
alias_ids = zip(aliases, range(len(aliases)))
alias_dict = dict(alias_ids)
inv_alias_dict = {v: k for k, v in alias_dict.items()}

In [93]:
# Write the id -> alias lookup to disk using the highest available pickle protocol.
with open('inv_alias_dict.pickle', 'wb') as handle:
    pickle.dump(inv_alias_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [48]:
#Import survey results
from survey_results import survey

### Run recommender to get recs

In [15]:
# Build the project recommender from the user factors, item factors and id -> alias lookup.
# NOTE(review): ALSRecommender comes from the local ALS module; its internals are not visible here.
a_recommender = ALSRecommender(uf_df, if_df, inv_alias_dict)

In [55]:
# Build the prediction table from the survey results; in the recorded output
# each row is a person and each column a restaurant alias.
recs_database = a_recommender.compile_preds_database(survey)
recs_database

Unnamed: 0,amazon-go-seattle-5,the-pink-door-seattle-4,nirmals-seattle,biscuit-bitch-seattle-5,damn-the-weather-seattle,bad-bishop-seattle,pike-place-chowder-seattle,tsukushinbo-seattle,taylor-shellfish-oyster-bar-seattle-2,elliotts-oyster-house-seattle-2,...,mediterranean-cuisine-seattle,ricenroll-seattle-2,union-deli-la-puget-sound-plaza-building-seattle,measure-seattle,subway-seattle-3,u-bank-cafe-seattle,subway-seattle-31,subway-seattle-66,organic-to-go-seattle-4,chez-dave-seattle-2
gabe,-0.204846,2.369452,-0.278558,1.278298,-0.894885,1.060026,1.726565,2.129003,1.576984,2.789261,...,2.205422,3.259448,0.120804,0.572778,2.354436,-0.095351,1.013983,1.0897,0.746585,-1.937756
nicole,9.093305,6.580937,-4.190717,-5.896802,-5.976097,-0.266371,-6.41553,5.934456,-0.968214,3.428758,...,4.747425,5.53044,-1.404166,-0.036775,-2.406282,0.885326,-1.920965,-8.70126,1.77446,-8.446063
jordan,10.363048,9.807997,6.880987,0.911852,3.639931,7.735248,-1.040732,3.549685,4.529747,4.78754,...,2.928741,3.561614,5.326628,4.137985,3.913624,2.524035,-0.579715,0.99084,2.999946,0.36833
jonny,6.366547,5.833867,7.154695,5.221021,6.295663,4.55765,3.629668,3.1945,5.641347,3.918737,...,2.407974,3.203024,4.831695,5.65318,5.896283,1.96092,3.021735,5.422743,3.269316,5.055296
alex,7.487336,5.196887,1.918431,0.673735,2.07041,0.582364,-1.481194,4.339519,2.132065,2.473254,...,2.609621,3.211571,2.079912,3.531593,1.144817,1.39501,1.902854,-0.513489,3.034323,0.764786
grant,-3.421549,-0.195937,11.30791,16.026771,12.325163,3.462374,11.613417,1.713476,9.546372,0.837611,...,1.345313,0.034766,8.277325,8.699176,9.531417,0.5226,7.94084,16.847457,4.508735,15.828395
lea,8.993033,8.312597,-2.233648,-6.408272,-5.509439,-0.240913,-5.822183,4.414948,0.276907,2.907891,...,2.074596,3.019662,2.768264,0.597823,-2.919805,1.152469,-2.944867,-7.641419,1.538949,-9.027822


In [17]:
def sort_recs_for_two(user1, user2, recs_database):
    """Rank every item by the average of two users' scores.

    Parameters
    ----------
    user1, user2 : row labels of ``recs_database``.
    recs_database : pandas.DataFrame with one row per user and one column per item.

    Returns
    -------
    pandas.DataFrame
        Indexed by item, with one column per user plus a ``mean`` column,
        sorted by ``mean`` from highest to lowest.
    """
    per_user_scores = [recs_database.loc[name] for name in (user1, user2)]
    combined = pd.concat(per_user_scores, axis=1, sort=False)
    ranked = combined.assign(mean=combined.mean(axis=1))
    return ranked.sort_values(by='mean', ascending=False)

In [91]:
# Up to 20 best restaurants for gabe and lea, ranked by their mean score.
sort_recs_for_two('gabe', 'lea', recs_database).head(20)

Unnamed: 0,gabe,lea,mean
daawat-grill-seattle-2,6.120159,15.170543,10.645351
sushi-kudasai-seattle-3,3.03946,13.826014,8.432737
chan-seattle-seattle-2,4.77389,10.906977,7.840434
subway-seattle-34,2.53591,12.815684,7.675797
mangia-me-seattle-2,2.397617,12.936123,7.66687
georges-sausage-and-delicatessen-seattle,5.267859,10.040895,7.654377
café-frieda-seattle-3,5.433702,9.689345,7.561523
café-paloma-seattle-2,3.351841,11.645865,7.498853
main-street-gyros-seattle-2,2.145768,12.35676,7.251264
huong-binh-seattle,3.126656,11.369953,7.248304


In [20]:
# Top 5 joint picks for gabe and grant (head() defaults to five rows).
sort_recs_for_two('gabe', 'grant', recs_database).head()

Unnamed: 0,gabe,grant,mean
intermezzo-carmine-seattle-2,0.327712,25.385533,12.856622
le-pichet-seattle,-0.496189,24.716886,12.110349
din-tai-fung-seattle-17,1.881809,21.739707,11.810758
aloha-plates-seattle,0.387957,22.257418,11.322688
rolls-n-sushi-seattle,1.932768,20.250382,11.091575


In [21]:
# Top 5 joint picks for gabe and alex (head() defaults to five rows).
sort_recs_for_two('gabe', 'alex', recs_database).head()

Unnamed: 0,gabe,alex,mean
daawat-grill-seattle-2,6.120159,4.98605,5.553104
georges-sausage-and-delicatessen-seattle,5.267859,4.643186,4.955522
main-street-gyros-seattle-2,2.145768,7.646054,4.895911
chan-seattle-seattle-2,4.77389,4.888661,4.831276
cocoa-banana-seattle,4.508626,4.931584,4.720105


In [22]:
# Top 5 joint picks for gabe and jonny (head() defaults to five rows).
sort_recs_for_two('gabe', 'jonny', recs_database).head()

Unnamed: 0,gabe,jonny,mean
happy-garden-seattle-3,3.49764,6.477037,4.987338
mae-phim-thai-seattle-4,4.386894,5.273234,4.830064
kidd-valley-seattle-6,5.157261,4.395158,4.77621
poke-lover-seattle-10,4.832974,4.5,4.666487
delicatus-seattle-3,2.650775,6.618532,4.634654


## Just give me a rec

In [24]:
# Keep the 50 best joint picks for nicole and gabe.
# NOTE(review): nicole_and_gabe is not used by any later cell visible here -- confirm it is still needed.
nicole_and_gabe = sort_recs_for_two('nicole', 'gabe', recs_database).head(50)

In [84]:
def get_a_rec(user1, user2, recs_database, n_candidates=50, random_state=None):
    """Pick one restaurant for two users, favouring higher joint scores.

    The top ``n_candidates`` rows returned by ``sort_recs_for_two`` form the
    candidate pool, and one row is drawn with probability proportional to its
    ``mean`` score.

    Parameters
    ----------
    user1, user2 : row labels of ``recs_database``.
    recs_database : pandas.DataFrame passed through to ``sort_recs_for_two``.
    n_candidates : int, default 50
        Size of the candidate pool taken from the top of the joint ranking.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so a draw can be made repeatable.

    Returns
    -------
    pandas.DataFrame
        A one-row frame (both users' scores plus ``mean``) for the chosen
        restaurant, or the empty candidate frame when there is nothing to
        choose from.
    """
    candidates = sort_recs_for_two(user1, user2, recs_database).head(n_candidates)
    if candidates.empty:
        return candidates

    # Weight by the mean scores themselves. Dividing the means by their own
    # normalized values cancels to a constant, which makes the draw uniform.
    # Negative scores are not valid sampling weights, so they are clipped to
    # zero and such rows can never be picked while a positive score exists.
    weights = candidates['mean'].clip(lower=0)
    if weights.sum() <= 0:
        # No candidate has a positive score: fall back to a uniform draw
        # instead of letting sample() raise on an all-zero weight vector.
        return candidates.sample(1, random_state=random_state)

    # sample() rescales the weights to sum to 1; passing the raw values avoids
    # index alignment between the weight Series and the candidate frame.
    return candidates.sample(1, weights=weights.values, random_state=random_state)

In [85]:
# Draw one restaurant at random from nicole and gabe's top-50 joint picks.
get_a_rec('nicole', 'gabe', recs_database)

Unnamed: 0,nicole,gabe,mean
georges-sausage-and-delicatessen-seattle,12.769926,5.267859,9.018893


In [90]:
# Draw one restaurant at random from gabe and lea's top-50 joint picks.
get_a_rec('gabe', 'lea', recs_database)

Unnamed: 0,gabe,lea,mean
mangia-me-seattle-2,2.397617,12.936123,7.66687


# HAVING SOME ISSUES / NEXT STEPS

### Issue 1: Why are some users' ranges so different?  
##### At worst, I have a problem in the code.  At best, I just need to standardize.


### Issue 2: Are these good recs!??  How would I improve them?
##### I have all this other information - the categories of food, the categories of ambience types, could I use these?  

### Issue 3: I need a working website, bad.

### Stretch Goal 1: Create filters 

### Stretch Goal 2: Figure out how to update my database with REAL reviews (after users go to my recommended restaurants)

### Stretch Goal 3: Expand to all of Seattle

In [72]:
# (lowest, highest) score across all restaurants for nicole.
recs_database.loc['nicole'].min(), recs_database.loc['nicole'].max()

(-17.281448243764615, 17.74951800702868)

In [73]:
# (lowest, highest) score across all restaurants for gabe.
recs_database.loc['gabe'].min(), recs_database.loc['gabe'].max()

(-5.0691192390433688, 7.047557579467906)

In [74]:
# (lowest, highest) score across all restaurants for grant.
recs_database.loc['grant'].min(), recs_database.loc['grant'].max()

(-18.742116822604082, 25.385532821259282)

- What is interesting, if anything, about your feature engineering?
- Avoid mentioning machine learning or probability when talking about the solution you're developing.  Just say what data you're collecting and what service you're providing.
