### Disclaimer

This notebook was built on **Azure Databricks**, so running it requires Azure Databricks or a similar Spark-based notebook environment. See: https://azure.microsoft.com/es-es/products/databricks

In [None]:
# Replace the "REPLACE ME" placeholder below with the storage account secret key.
# To obtain the key, contact me at gaston.orphant@hotmail.com.
# NOTE(review): the `%pip install` cell below reports that the Python interpreter is
# restarted, which presumably clears this variable - re-run this cell afterwards.

SECRET_KEY = "REPLACE ME"


In [None]:
%pip install gradio
%pip install plotly

Python interpreter will be restarted.
Collecting gradio
  Downloading gradio-3.23.0-py3-none-any.whl (15.8 MB)
Collecting pydantic
  Downloading pydantic-1.10.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
Collecting fsspec
  Downloading fsspec-2023.3.0-py3-none-any.whl (145 kB)
Collecting altair>=4.2.0
  Downloading altair-4.2.2-py3-none-any.whl (813 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
Collecting fastapi
  Downloading fastapi-0.95.0-py3-none-any.whl (57 kB)
Collecting websockets>=10.0
  Downloading websockets-10.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)
Collecting orjson
  Downloading orjson-3.8.8-cp39-cp39-manylinux_2_28_x86_64.whl (143 kB)
Collecting huggingface-hub
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
Co

In [None]:
# Turn on the Databricks IO cache setting for this Spark session.
# NOTE(review): presumably this speeds up the repeated reads of the remote dataset below -
# confirm the cluster's node type supports it.
spark.conf.set("spark.databricks.io.cache.enabled", "true")

# Connecting with Blob Storage

### Set the data location and type

There are two ways to access Azure Blob storage: **account keys** and shared access signatures (SAS).

To get started, we need to set the location and type of the file.

In [None]:
storage_account_name = "datawarehousegoogle"

# Fail fast with an actionable message: a missing or placeholder key would otherwise
# surface only later, when Spark first tries to read from the storage account.
if not SECRET_KEY or SECRET_KEY == "REPLACE ME":
  raise ValueError(
    "SECRET_KEY is not set. Replace the placeholder in the first cell "
    "(request the key from gaston.orphant@hotmail.com) and re-run it."
  )
storage_account_access_key = SECRET_KEY

# Register the account key so Spark can authenticate against the Blob Storage endpoint.
spark.conf.set(
  "fs.azure.account.key." + storage_account_name + ".blob.core.windows.net",
  storage_account_access_key,
)

In [None]:
# Root of the "datasets" container, addressed through the wasbs:// scheme.
# The account name is reused from the credentials cell so the two values cannot drift apart.
file_location = "wasbs://datasets@" + storage_account_name + ".blob.core.windows.net/"
file_type = "parquet"

### Read the data

Now that we have specified our file metadata, we can create a DataFrame. Notice that we use an *option* to specify that we want to infer the schema from the file. We can also explicitly set this to a particular schema if we have one already.

First, let's create a DataFrame in Python.

In [None]:
# Load the data found at `file_location` into a Spark DataFrame using the `file_type` reader.
# NOTE(review): `inferSchema` is normally a CSV/JSON reader option; Parquet files carry their
# own schema, so this option is presumably ignored here - confirm before relying on it.
df = spark.read.format(file_type).option("inferSchema", "true").load(file_location)

In [None]:
# Keep only the rows whose main_category is "food services"; every other category is discarded.
is_food_service = df["main_category"] == "food services"
restaurants = df.filter(is_food_service)

## Hashing Strings

ALS only accepts numeric inputs, so we need to convert some string columns to numbers.

We can do this either by hashing the strings or by indexing them.

In [None]:
#Import the functions library as F to do a hash of user_id
from pyspark.sql import functions as F

# Derive integer surrogate keys from the string ids, since ALS needs numeric user/item columns.
# NOTE(review): F.hash returns an int column, so two distinct ids could in principle collide -
# verify that this is acceptable for this dataset.
restaurants = (
    restaurants
    .withColumn("user_id_hash", F.hash("user_id"))
    .withColumn("business_id_hash", F.hash("business_id"))
)

In [None]:
# Columns the recommendation model does not need; every other column is kept in df_ml.
unused_columns = ["latitude", "longitude", "main_category", "date", "resp", "opinion", "platform"]
df_ml = restaurants.drop(*unused_columns)


# Machine Learning Model

## Training the model

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Split into training (70%) and testing (30%) sets.
# A fixed seed makes the split repeatable, so the reported metric can be compared between runs.
(training, test) = df_ml.randomSplit([.7, .3], seed=42)


In [None]:
# Configure the ALS estimator; it is fitted on the training data in the next cell.
# coldStartStrategy='drop' discards rows whose prediction would be NaN, so the evaluation
# metric is not NaN. Use it with care: it can silently remove a large share of the rows.
als = ALS(
    userCol='user_id_hash',
    itemCol='business_id_hash',
    ratingCol='rating',
    rank=50,
    maxIter=10,
    regParam=0.15,
    coldStartStrategy='drop',
)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-2943557071618452>[0m in [0;36m<cell line: 4>[0;34m()[0m
[1;32m      2[0m [0;31m# With cold start strategy set to 'drop' we ensure we don't get NaN evaluation metrics[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m [0;31m# We need to be careful using this strategy because we could be losing a lot of data if there are a lot of NULL values.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 4[0;31m [0mals[0m [0;34m=[0m [0mALS[0m[0;34m([0m[0mmaxIter[0m[0;34m=[0m[0;36m10[0m[0;34m,[0m [0mrank[0m[0;34m=[0m[0;36m50[0m[0;34m,[0m [0mregParam[0m[0;34m=[0m[0;36m0.15[0m[0;34m,[0m [0muserCol[0m[0;34m=[0m[0;34m'user_id_hash'[0m[0;34m,[0m [0mitemCol[0m[0;34m=[0m[0;34m'business_id_hash'[0m[0;34m,[0m [0mratingCol[0m[0;34m=[0m[0;34m'rating'[

In [None]:
# Fit the ALS estimator on the training split; the fitted `model` is what the
# evaluation and saving cells below operate on.
model=als.fit(training)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-2943557071618453>[0m in [0;36m<cell line: 2>[0;34m()[0m
[1;32m      1[0m [0;31m# fit the ALS model to the training set[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0mmodel[0m[0;34m=[0m[0mals[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0mtraining[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;31mNameError[0m: name 'als' is not defined

In [None]:
# Score the test split, then measure the error between predicted and actual ratings (RMSE).
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction',
)
rmse = evaluator.evaluate(predictions)
print(rmse)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-2943557071618454>[0m in [0;36m<cell line: 2>[0;34m()[0m
[1;32m      1[0m [0;31m# Evaluate the model by computing the RMSE on the test data[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0mpredictions[0m [0;34m=[0m [0mmodel[0m[0;34m.[0m[0mtransform[0m[0;34m([0m[0mtest[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      3[0m [0mevaluator[0m [0;34m=[0m [0mRegressionEvaluator[0m[0;34m([0m[0mmetricName[0m[0;34m=[0m[0;34m'rmse'[0m[0;34m,[0m [0mlabelCol[0m[0;34m=[0m[0;34m'rating'[0m[0;34m,[0m [0mpredictionCol[0m[0;34m=[0m[0;34m'prediction'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      4[0m [0mrmse[0m [0;34m=[0m [0mevaluator[0m[0;34m.[0m[0mevaluate[0m[0;34m([0m[0mpredictions[0m[0;34m)[0m[0;34m[0m[0;34m[0m

## Param Optimization

In [None]:
# Grid search used to find the best hyperparameters.
# It is skipped here because it has already been run and the best parameters were applied to the ALS model above.
# To re-run it, remove the leading '#' from every line of code below.

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## Initialize the ALS model

#als_model = ALS(maxIter=10, regParam=0.15, userCol='user_id_hash', itemCol='business_id_hash', ratingCol='rating', coldStartStrategy='drop')

## Create the parameters grid

#params = ParamGridBuilder().addGrid(als_model.regParam, [.01, .05, .1, .15]).addGrid(als_model.rank, [10, 50, 100, 150]).build()

## Instantiating crossvalidator estimator

#cv = CrossValidator(estimator=als_model, estimatorParamMaps=params, evaluator=evaluator, parallelism=4)
#best_model = cv.fit(df_ml)
#model = best_model.bestModel

## Saving the model

In [None]:
# Persist the fitted model at `path`; overwrite() replaces any model already saved there.
path = "/model/modelALS"
model.write().overwrite().save(path)

## Loading the model

In [None]:
from pyspark.ml.recommendation import ALS, ALSModel

# Reload the persisted ALS model from the same path used in the saving step.
# NOTE(review): presumably this lets the cells below run without retraining - confirm.
path = "/model/modelALS"
model = ALSModel.load(path)

# Recommendation System

We are going to recommend some restaurants to a user, based on their ID and their previous reviews.

In [None]:
# Function to retrieve the user_id hashed

def id_hashed(user_id):
  """Return the integer hash stored for `user_id` in `df_ml`.

  Args:
    user_id: Raw user id as it appears in the `user_id` column.

  Returns:
    The `user_id_hash` value of the first row whose `user_id` matches.

  Raises:
    IndexError: If no row in `df_ml` has the given `user_id`.
  """
  # Only the hash column is needed, so do not bring the whole row back to the driver.
  rows = df_ml.where(df_ml.user_id == user_id).select('user_id_hash').take(1)
  if not rows:
    # Same exception type the bare `[0]` lookup used to raise, but with an actionable message.
    raise IndexError("No reviews found for user id {!r}".format(user_id))
  return rows[0]['user_id_hash']

In [None]:
# Function to retrieve the restaurant info

def name_retriever(business_id_hash, restaurants):
    """Look up the name and coordinates of a business by its hashed id.

    Args:
        business_id_hash: Value to match against the `business_id_hash` column.
        restaurants: DataFrame exposing `business_id_hash`, `local_name`,
            `latitude` and `longitude`.

    Returns:
        A `(local_name, latitude, longitude)` tuple taken from the first matching row.

    Raises:
        IndexError: If no row matches `business_id_hash`.
    """
    # Run the lookup once and reuse the row, instead of issuing the same query per field.
    rows = restaurants.where(restaurants.business_id_hash == business_id_hash).take(1)
    if not rows:
        # Same exception type the bare `[0]` lookup used to raise, with a clearer message.
        raise IndexError("No restaurant found for business_id_hash {!r}".format(business_id_hash))
    row = rows[0]
    return (row['local_name'], row['latitude'], row['longitude'])

In [None]:
from pyspark.sql.functions import rand

# Pick one user at random; the raw user_id (not user_id_hash) is returned.
def random_user():
  """Return the `user_id` of one randomly chosen row of `df_ml`."""
  shuffled_ids = df_ml.select('user_id').orderBy(rand())
  sampled_rows = shuffled_ids.limit(1).collect()
  return sampled_rows[0].user_id

In [None]:
def user_recommendation(my_user):
  """Build a text listing of the restaurants recommended for a user.

  Args:
    my_user: Raw user id, as stored in the `user_id` column.

  Returns:
    A string with one "Recommendation N: ..." line per recommended restaurant,
    or an explanatory message when there are no recommendations for the user.

  Raises:
    IndexError: Propagated from `id_hashed` / `name_retriever` when the user or a
      recommended business cannot be found.
  """
  # Local import: only the table-lookup fallback below needs this exception type.
  from pyspark.sql.utils import AnalysisException

  # Reuse the recommendations table saved by an earlier call, if there is one.
  try:
    recommendations = spark.read.table("recommendations")
  except AnalysisException:
    # The table does not exist yet: compute the top recommendations for all users
    # and save them so later calls can skip this step.
    # Catching only AnalysisException lets unrelated failures surface instead of being swallowed.
    num_recs = 5
    recommendations = model.recommendForAllUsers(num_recs)
    recommendations.write.format("parquet").saveAsTable("recommendations")

  # Translate the raw id into the hashed id the model was trained on.
  user_hash = id_hashed(my_user)
  recs_for_user = recommendations.where(recommendations.user_id_hash == user_hash).take(1)
  if not recs_for_user:
    # The user exists in the data but has no row in the recommendations table.
    return "No recommendations are available for this user."

  lines = []
  for ranking, (business_id_hash, rating) in enumerate(recs_for_user[0]['recommendations'], start=1):
    local_name, latitude, longitude = name_retriever(business_id_hash, restaurants)
    lines.append("Recommendation " + str(ranking) + ": " + str(local_name) + ". Coordenates: " + str(latitude) + ", " + str(longitude) + "\n")
  return "".join(lines)

In [None]:
import plotly.graph_objects as go

# Mapping the recommended business
def mapped_coor(my_user):
  """Plot the restaurants recommended for a user on an OpenStreetMap map.

  Args:
    my_user: Raw user id, as stored in the `user_id` column.

  Returns:
    A plotly Figure with one marker per recommended restaurant, centred on the
    first recommendation. An empty Figure is returned when no user id is given
    or when there is nothing to plot for the user.

  Raises:
    IndexError: Propagated from `id_hashed` / `name_retriever` when the user or a
      recommended business cannot be found.
  """
  # Local import: only the table-lookup fallback below needs this exception type.
  from pyspark.sql.utils import AnalysisException

  # An empty id (e.g. the textbox has not been filled in yet) has nothing to plot.
  if not my_user:
    return go.Figure()

  # Reuse the recommendations table saved by an earlier call, if there is one.
  try:
    recommendations = spark.read.table("recommendations")
  except AnalysisException:
    # The table does not exist yet: compute the top recommendations for all users
    # and save them so later calls can skip this step.
    # Catching only AnalysisException lets unrelated failures surface instead of being swallowed.
    num_recs = 5
    recommendations = model.recommendForAllUsers(num_recs)
    recommendations.write.format("parquet").saveAsTable("recommendations")

  # Translate the raw id into the hashed id the model was trained on.
  user_hash = id_hashed(my_user)
  recs_for_user = recommendations.where(recommendations.user_id_hash == user_hash).take(1)
  if not recs_for_user:
    return go.Figure()

  names = []
  lat = []
  lon = []
  for business_id_hash, _rating in recs_for_user[0]['recommendations']:
    local_name, latitude, longitude = name_retriever(business_id_hash, restaurants)
    names.append(local_name)
    lat.append(latitude)
    lon.append(longitude)

  # Centring the map needs at least one point.
  if not lat:
    return go.Figure()

  fig = go.Figure(go.Scattermapbox(
            customdata=names,
            lat=lat,
            lon=lon,
            mode='markers',
            marker=go.scattermapbox.Marker(
                size=8
            ),
            hoverinfo="text",
            hovertemplate='Local Name: %{customdata}'
        ))

  fig.update_layout(
        mapbox_style="open-street-map",
        hovermode='closest',
        mapbox=dict(
            bearing=0,
            # Centre the view on the first recommendation.
            center=go.layout.mapbox.Center(
                lat=lat[0],
                lon=lon[0]
            ),
            pitch=0,
            zoom=9
        ),
    )

  return fig

In [None]:
# Demo Interface using gradio
import gradio as gr

title = "Recommendation System"

with gr.Blocks(title=title) as demo:
    # Page header and usage instructions.
    gr.components.HTML("""
    <center><h1>Welcome to the Demo for the Restaurants Recommendation System!</h1></center>
    """)
    gr.components.HTML("""
    <center><h3>You can use the following button to get a random user id, or you can instead put your own user id </h3>
    <h4>(Note: You must be registered on Google Maps or Yelp and have rated at least one restaurant)</h4></center>
    """)

    # Step 1: choose a user, either at random or by typing an id.
    get_random = gr.Button("Get a Random User ID!")
    userId = gr.Textbox(label="Enter your ID:")
    get_random.click(fn=random_user, inputs=None, outputs=userId)

    # Step 2: list the recommended restaurants as text.
    gr.components.HTML("""
    <center><h3>Use the following button to get five restaurants in your zone.</h3></center>
    """)
    get_recommendation_btn = gr.Button("Get recommendations!")
    output = gr.Textbox(label="You can go to the following restaurants:")
    get_recommendation_btn.click(fn=user_recommendation, inputs=userId, outputs=output)

    # Step 3: show the same restaurants on a map.
    gr.components.HTML("""
    <center><h3>Use the button bellow to get a map with the location of the restaurants previusly mentioned.</h3></center>
    """)
    btn = gr.Button(value="Show on map")
    # Named `map_plot` so the builtin `map` is not shadowed for the rest of the notebook.
    map_plot = gr.Plot()
    # The map is rendered only on demand: at page load the ID box is still empty,
    # so there is no user to look up yet.
    btn.click(mapped_coor, userId, map_plot)

# share=True also publishes the demo on a temporary public URL.
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7866
Running on public URL: https://766610b71c42034097.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


Out[20]: 