In [1]:
import pandas as pd
import numpy as np
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    IntegerType, StringType, IntegerType, FloatType, 
    StructField, StructType, DoubleType
)
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import matplotlib.pyplot as plt
import pickle

## Import the pandas dataframe containing the info from the review scrape 

In [4]:
als_df = pd.read_pickle('../data/als_df.pkl')

In [5]:
als_df.reset_index(drop=True, inplace=True)

In [6]:
als_df = als_df[['user_id', 'item_id', 'rating', 'date']]

In [7]:
als_df.shape

(5216, 4)

In [8]:
len(als_df['user_id'].unique())

1524

## Start a Spark session to train an ALS model

In [9]:
spark = SparkSession.builder.getOrCreate()


In [10]:
spark_als_df = spark.createDataFrame(als_df) 


In [11]:
spark_als_df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- date: timestamp (nullable = true)



In [12]:
spark_als_df.limit(5).show()

+-------+-------+------+-------------------+
|user_id|item_id|rating|               date|
+-------+-------+------+-------------------+
|   1520|    596|   3.0|2005-08-02 00:00:00|
|   1520|    592|   4.0|2005-09-14 00:00:00|
|   1369|    480|   4.0|2006-05-13 00:00:00|
|   1369|    601|   5.0|2006-05-19 00:00:00|
|   1369|    488|   5.0|2006-05-22 00:00:00|
+-------+-------+------+-------------------+



In [13]:
pandas_als_df = spark_als_df.toPandas()

In [14]:
# Hold out the last 20% of rows as a test set. The following cells select
# columns from `train` and `test`, and the RMSE cells score both, so this
# split has to run; left commented out, a fresh kernel fails with NameError.
split_at = int(len(pandas_als_df) * .8)
train = pandas_als_df[:split_at]
test = pandas_als_df[split_at:]

In [15]:
test = test[['user_id', 'item_id', 'rating']]
test.head()

Unnamed: 0,user_id,item_id,rating
4172,115,478,4.0
4173,346,296,5.0
4174,149,78,4.0
4175,533,308,5.0
4176,59,39,4.0


In [16]:
train = train[['user_id', 'item_id', 'rating']]

In [17]:
train.head()

Unnamed: 0,user_id,item_id,rating
0,1520,596,3.0
1,1520,592,4.0
2,1369,480,4.0
3,1369,601,5.0
4,1369,488,5.0


In [18]:
train.shape

(4172, 3)

In [19]:
# ALS starts from randomly initialised factors; pin the seed so the fitted
# model (and the RMSE values computed from it) can be reproduced after a
# kernel restart.
# coldStartStrategy='drop' removes rows whose prediction is NaN (ids unseen
# during fit) so the evaluator does not return NaN.
als_model = ALS(
    itemCol='item_id',
    userCol='user_id',
    ratingCol='rating',
    nonnegative=True,
    regParam=0.1,
    rank=10,
    coldStartStrategy='drop',
    seed=42,
)

In [20]:
spark_train = spark.createDataFrame(train) 

In [21]:
spark_test = spark.createDataFrame(test)

In [22]:
recommender = als_model.fit(spark_train)

In [None]:
train_preds = recommender.transform(spark_train)

In [None]:
train_preds

In [None]:
test_preds = recommender.transform(spark_test)

In [None]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

In [None]:
rmse_train = evaluator.evaluate(train_preds)

In [None]:
rmse_train

In [None]:
rmse_test = evaluator.evaluate(test_preds)

In [None]:
rmse_test

In [None]:
# Top-10 item recommendations for every user in the fitted model. The next
# cell displays `restaurant_recs`, so this assignment must actually run.
restaurant_recs = recommender.recommendForAllUsers(10)

In [None]:
restaurant_recs.limit(5).toPandas()

## Addressing the Cold Start Problem

In [None]:
user_factors = recommender.userFactors.collect()

In [None]:
item_factors = recommender.itemFactors.collect()

In [None]:
user_factors[0]

In [None]:
item_factors[0]

In [None]:
uf_df = recommender.userFactors.toPandas()

In [None]:
if_df = recommender.itemFactors.toPandas()

In [None]:
if_df

In [None]:
user_factors_array = np.array(uf_df['features'].tolist())

In [None]:
item_factors_array = np.array(if_df['features'].tolist())

In [None]:
user_factors_array.shape

In [None]:
item_factors_array.shape

In [None]:
def predict_rating(user_idx, item_idx):
    """Predict a rating from latent factors, addressed by row position.

    Reads the module-level ``user_factors_array`` and ``item_factors_array``
    and returns the product of the selected user row with the (transposed)
    selected item row.
    """
    return user_factors_array[user_idx, :] @ item_factors_array[item_idx, :].T

In [None]:
def predict_rating_by_id(user_id, item_id):
    """Return the predicted rating of item by user (by id).

    Finds the row position of ``user_id`` in ``uf_df['id']`` and of
    ``item_id`` in ``if_df['id']`` and passes both to ``predict_rating``.
    Row positions are used rather than index labels because
    ``predict_rating`` indexes NumPy arrays, which know nothing about the
    DataFrame index.

    Raises:
        IndexError: if either id does not occur in its factor table.
    """
    user_hits = np.flatnonzero((uf_df['id'] == user_id).to_numpy())
    if user_hits.size == 0:
        raise IndexError("user_id {!r} not found in uf_df['id']".format(user_id))
    item_hits = np.flatnonzero((if_df['id'] == item_id).to_numpy())
    if item_hits.size == 0:
        raise IndexError("item_id {!r} not found in if_df['id']".format(item_id))
    # First match wins, as in the original lookup.
    return predict_rating(int(user_hits[0]), int(item_hits[0]))

In [None]:
predict_rating_by_id(471, 59)

In [None]:
predict_rating(1, 1)

In [None]:
user_idx = uf_df.index[uf_df['id'] == 471][0]

In [None]:
item_idx = if_df.index[if_df['id'] == 59][0]
item_idx

In [None]:
item_factors_array[item_idx]

In [None]:
pred_ratings = user_factors_array[user_idx, :] @ item_factors_array[554, :].T

In [None]:
def get_restaurant_indexes(user_ratings, item_factors_df):
    """Return the index labels in ``item_factors_df`` of each rated item.

    For every value in ``user_ratings['item_id']`` (in order), takes the first
    index label of ``item_factors_df`` whose ``'id'`` equals that value.

    The lookup uses the ``item_factors_df`` argument; it previously ignored
    the argument and read the global ``if_df`` instead.

    Raises:
        IndexError: if an item id does not occur in ``item_factors_df['id']``.
    """
    rest_idxs = []
    for item in user_ratings['item_id']:
        matches = item_factors_df.index[item_factors_df['id'] == item]
        if len(matches) == 0:
            raise IndexError(
                "item_id {!r} not found in item_factors_df['id']".format(item))
        rest_idxs.append(matches[0])
    return np.array(rest_idxs)
        

In [None]:
# For each rated item id, collect the first index label in `if_df` whose 'id'
# matches it (the same loop as the body of get_restaurant_indexes).
# NOTE(review): `user_ratings` is not assigned in any visible cell -- confirm
# where it comes from before relying on Restart & Run All.
rest_idxs = []
for item in user_ratings['item_id']:
    rest_idx = if_df.index[if_df['id']==item]
    rest_idxs.append(rest_idx[0])

In [None]:
#restaurant_indexes = get_restaurant_indexes(user_ratings, if_df)

In [None]:
item_factors_array[rest_idxs]

### `X` is the new user's latent-factor vector; use it to generate a row of predicted ratings for the new user

In [None]:
# Least-squares fit of the new user's factor vector X so that
# item_factors_array[rest_idxs] @ X approximates the user's ratings.
# rcond=None selects NumPy's machine-precision cutoff explicitly, which avoids
# the FutureWarning raised by older NumPy when rcond is omitted.
# NOTE(review): `ratings` is not assigned in any visible cell -- confirm its source.
X, residuals, rank, s = np.linalg.lstsq(
    item_factors_array[rest_idxs], ratings.values, rcond=None)

In [None]:
newuser_factors = X

In [None]:
def newuser_predict(newuser_factors, item_factors_array):
    """Score every item for a new user and return the scores as one row.

    Each entry is the dot product of ``newuser_factors`` with one row of
    ``item_factors_array``. The result is a single-row DataFrame labelled
    ``'newuser'`` whose columns are the item row positions.
    """
    scores = [
        np.dot(newuser_factors, item_factors_array[pos])
        for pos in range(len(item_factors_array))
    ]
    return pd.DataFrame([scores], index=['newuser'])

In [None]:
example_new_user = newuser_predict(newuser_factors, item_factors_array)

In [None]:
example_new_user

### Map the predictions to the actual restaurant names and sort them to get the top predictions

In [None]:
# Invert alias -> id into id -> alias so integer ids can be mapped back to aliases.
# NOTE(review): `alias_dict` is only built further down, in the cell that zips
# `aliases` with a range; on a fresh kernel that cell has to run before this one.
inv_alias_dict = {v: k for k, v in alias_dict.items()}
inv_alias_dict

In [None]:
example_new_user_rest_names = example_new_user.rename(inv_alias_dict, axis=1)

In [None]:
example_new_user_rest_names.sort_values(by='newuser', axis=1, ascending=False)

## Apply the same approach to actual survey results

In [None]:
from survey_results import survey_results, usernames

In [None]:
gabe = survey_results[0]

In [None]:
gabe

In [None]:
#divide ratings by two to put on the same scale
#convert aliases -> ids -> indexes for those ids
#find the item feature vectors for each of those restaurants by indexing into the item_factors_array
#make a "user ratings df" with the user, restaurant ids, and ratings

In [None]:
#divide ratings by two to put on the same scale
gabe_standardized = {k: v / 2 for k, v in gabe.items()}
gabe_standardized

In [None]:
#inv_alias_dict

In [None]:
#convert aliases -> ids -> indexes for those ids
id_to_rating = {k: gabe_standardized[v] for k, v in inv_alias_dict.items() if v in gabe_standardized}

In [None]:
user_ratings_df = pd.DataFrame.from_dict(id_to_rating, orient='index')

In [None]:
user_ratings_df.reset_index(inplace=True)

In [None]:
user_ratings_df.rename(columns={'index':'item_id', 0:'rating'}, inplace=True)


In [None]:
survey_ratings = user_ratings_df['rating'].values.reshape(-1, 1)

In [None]:
survey_ratings_notreshaped = user_ratings_df['rating'].values

In [None]:
rest_idx = get_restaurant_indexes(user_ratings_df, if_df)

In [None]:
latent_item_features = item_factors_array[rest_idx]

In [None]:
# Least-squares fit of the survey user's factor vector X so that
# latent_item_features @ X approximates the survey ratings.
# rcond=None selects NumPy's machine-precision cutoff explicitly, which avoids
# the FutureWarning raised by older NumPy when rcond is omitted.
X, residuals, rank, s = np.linalg.lstsq(
    latent_item_features, survey_ratings_notreshaped, rcond=None)

In [None]:
X

In [None]:
gabes_preds = newuser_predict(X, item_factors_array)

In [None]:
gabes_preds

In [None]:
gabe_sorted_preds = gabes_preds.sort_values(by='newuser', axis=1, ascending=False).rename(inv_alias_dict, axis=1)

In [None]:
list(gabe_sorted_preds.columns)

## Make some functions that do all this, import it, try it out

In [None]:
from ALS import ALSRecommender

In [None]:
nicole = survey_results[1]

In [None]:
a_recommender = ALSRecommender(uf_df, if_df, inv_alias_dict)

In [None]:
nicole_preds = a_recommender.get_preds_from_survey_results(nicole)

In [None]:
nicole_preds

In [None]:
jonny = survey_results[3]

In [None]:
jonny_preds = a_recommender.get_preds_from_survey_results(jonny)
jonny_preds

In [None]:
jonny_preds

In [None]:
alex = survey_results[2]

In [None]:
# NOTE(review): every other call in this notebook passes only the survey dict,
# and `inv_alias_dict` is already given to the ALSRecommender constructor --
# confirm against the method's signature whether this extra argument is accepted.
alex_preds = a_recommender.get_preds_from_survey_results(alex, inv_alias_dict)
alex_preds

In [None]:
all_users = pd.DataFrame(index=range(len(survey_results)), columns=inv_alias_dict.values())
all_users

In [None]:
# Score every survey. The loop used to overwrite `results` on each pass, so
# only the last survey's predictions survived; `all_results` now keeps one
# entry per survey, in the order of `survey_results`. `results` still ends up
# holding the last survey's predictions.
a = ALSRecommender(uf_df, if_df, inv_alias_dict)

all_results = []
for survey in survey_results:
    results = a.get_preds_from_survey_results(survey)
    all_results.append(results)
results

#### Build `alias_dict` to translate restaurant aliases to ids

In [None]:
more_than_one_review = pd.read_pickle('more_than_one_review_df.pkl')

In [None]:
# Number the distinct aliases 0..n-1 in the order `unique()` returns them and
# keep the alias -> id mapping.
aliases = more_than_one_review['alias'].unique()
alias_ids = enumerate(aliases)
alias_dict = {alias: idx for idx, alias in alias_ids}

In [None]:
with open("alias_dict.txt", "wb") as myFile:
    pickle.dump(alias_dict, myFile)

In [None]:
gabe_df = pd.DataFrame(columns=alias_dict.keys(), index=usernames)

In [None]:
gabe_df

In [None]:
def create_user_df(survey, df):
    """Write one survey's answers into the first row of ``df`` and return it.

    Args:
        survey: mapping of column label -> value. Labels that are not columns
            of ``df`` are ignored.
        df: DataFrame to fill; modified in place, only row position 0 is written.

    Returns:
        The same ``df`` object that was passed in.
    """
    for key, value in survey.items():
        if key not in df.columns:
            continue
        # Single (row, column) iloc assignment: the chained form
        # ``df.iloc[0][key] = ...`` may write to a temporary copy of the row
        # and leave ``df`` unchanged.
        df.iloc[0, df.columns.get_loc(key)] = value
    return df

In [None]:
# NOTE(review): create_user_df writes into row position 0 of gabe_df, while the
# result is read back by the label 'gabe' -- these line up only if
# usernames[0] == 'gabe'. This also rebinds `gabe` from the survey mapping to
# a row of gabe_df, so re-running the cell feeds that row back in.
gabe = create_user_df(gabe, gabe_df).loc['gabe']