Before starting we import all the libraries that we need

In [3]:
from MyFunctions.crawler import Crawler
from MyFunctions.parser import Parser
from MyFunctions.preprocessor import Preprocessor
import requests
from bs4 import BeautifulSoup
import re
import os
import csv
import pandas as pd
import numpy as np
import json
from collections import defaultdict
import math
from nltk import word_tokenize

# <strong> Data collection

## <strong> 1.1 Get the list of Michelin restaraunts

Before scraping all the restaurant URLs, let's first determine the maximum page number. It's simple to find the correct CSS selector for the page list: just inspect the list of pages in your browser and identify the corresponding class or element name.

<p>
    <img title = "list of pages" src="./images/pages_number.png"/>
</p>

In [2]:
response = requests.get('https://guide.michelin.com/en/it/restaurants')
soup = BeautifulSoup(response.content, "html.parser")
page_links = soup.select('ul.pagination li a') #name of the pages list
page_numbers = [int(a.get_text()) for a in page_links if a.get_text().isdigit()]

# Get the maximum page number
total_pages = max(page_numbers) if page_numbers else 0
print(f'There are in total: {total_pages} pages')

There are in total: 100 pages


Now we can very easily get the URL of each page

In [3]:
pages = ['https://guide.michelin.com/en/it/restaurants'] #Initial page

for i in range(2, total_pages+1): #get all other pages from 2 to total_pages included
    pages.append('https://guide.michelin.com/en/it/restaurants/page/'+str(i))

Now in order to get the URLs of all the restaurants, we proceed the same by identifying the name of the corresponding class in the webpage.

<p>
<img title = "Class of a restaraunt" src="images/restaurant_link.png"/>
</p>

We can clearly see that the restaurant URLs follow a consistent pattern, which can be expressed using the regular expression:

```bash
BASE_URL/en/region/city/restaurant/name_of_restaurant
```


In [4]:
total_urls = [] #save all urls
base = 'https://guide.michelin.com' #base url to use

In [5]:
for p in pages: #loop all pages
    response = requests.get(p) #get the page
    soup = BeautifulSoup(response.content, "html.parser") # we use BeautifulSoup to get the content
    links = soup.select('a.link') #select all the class 'a link'
    pattern = re.compile(r'^/en/[^/]+/[^/]+/restaurant/[^/]+$') #pattern of restaurants
    restaurant_links = [base+link.get('href') for link in links if pattern.match(link.get('href', ''))] #get all the restaurants links
    total_urls.append(restaurant_links)

Now we save all the urls inside a txt called 'restaurant_urls.txt'

In [None]:
with open('dataset/restaurant_urls.txt', 'w') as f: 
    page_count = 1  # Initialize the page count
    for urls in total_urls:
        f.write(f'{page_count}\n')  # Add a label for the page number
        for url in urls: # Write each URL from the current page
            f.write(f'{url}\n')  
        
        page_count += 1 # Increment the page count

In [11]:
print(sum([len(u) for u in total_urls])) # how many restaurants we got

1983


## <strong> 1.2. Crawl Michelin restaurant pages

Now we download all the HTML from the urls and save them in a folder and divide each of them in separate folder_pages

In [None]:
crawler = Crawler()
crawler.save_all_as_html('dataset/restaurant_urls.txt') # See actual implementation inside 'crawler.py'

In [3]:
path = 'restaurants_html'
count = crawler.count_files(path)
print('file count:', count)

file count: 1983


The save_all_as_html function utilizes multi-threading to achieve optimal performance, generating approximately 20 threads concurrently. Within each loop for a page, each thread is tasked with downloading around a single URL, making it extremely efficient. Consequently, the function successfully downloaded 2,034 out of 2,037 files in under one minute.

## <strong> 1.3 Parse downloaded pages

The list of the information we desire for each restaurant and their format is as follows:

    Restaurant Name (to save as restaurantName): string;
    Address (to save as address): string;
    City (to save as city): string;
    Postal Code (to save as postalCode): string;
    Country (to save as country): string;
    Price Range (to save as priceRange): string;
    Cuisine Type (to save as cuisineType): string;
    Description (to save as description): string;
    Facilities and Services (to save as facilitiesServices): list of strings;
    Accepted Credit Cards (to save as creditCards): list of strings;
    Phone Number (to save as phoneNumber): string;
    URL to the Restaurant Page (to save as website): string.


To parse those information we can just inspect one html to see how those information are stored as we did before.<br>
Most of the information can be retrieved in the following json script at the end of each HTML file:
```js
<script type="application/ld+json">{"@context":"http://schema.org","address":{"@type":"PostalAddress","streetAddress":"Piazza Salvo d'Acquisto 16","addressLocality":"Lamezia Terme","postalCode":"88046","addressCountry":"ITA","addressRegion":"Calabria"},"name":"Abbruzzino Oltre","image":"https://axwwgrkdco.cloudimg.io/v7/__gmpics3__/f19d37d6b9da437fa06b6f9406645056.jpg?width=1000","@type":"Restaurant","review":{"@type":"Review","datePublished":"2024-09-11T07:32","name":"Abbruzzino Oltre","description":"This restaurant, the new home of young chef Luca Abbruzzino, occupies the first floor of a historic palazzo in the town centre which has recently been converted into a small hotel offering six ...","author":{"@type":"Person","name":"Michelin Inspector"}},"telephone":"+39 0968 188 8038","knowsLanguage":"en-IT","acceptsReservations":"No","servesCuisine":"Contemporary","url":"https://guide.michelin.com/en/calabria/lamezia-terme/restaurant/abbruzzino-oltre","currenciesAccepted":"EUR","paymentAccepted":"American Express credit card, Credit card / Debit card accepted, Mastercard credit card, Visa credit card","award":"Selected: Good cooking","brand":"MICHELIN Guide","hasDriveThroughService":"False","latitude":38.9770969,"longitude":16.3202202,"hasMap":"https://www.google.com/maps/search/?api=1&query=38.9770969%2C16.3202202"}</script>
```

<img src = "images/restaurant_page.png" />

Now we create a parse_restaurant function that given a html, it parses all the information we need and returns it as a dictionary, we also decided to keep region as an extra column

In [41]:
parser = Parser()
info = parser.parse_restaurant('restaurants_html/1/la-trattoria-enrico-bartolini.html') #Test
parser.show_restaurant_info(info)

restaurantName: La Trattoria Enrico Bartolini
address: Località Badiola
city: Castiglione della Pescaia
postalCode: 58043
country: ITA
region: Tuscany
priceRange: €€€€
cuisineType: Mediterranean Cuisine, Grills
description: After a majestic picture-postcard approach via a long avenue lined with cypress trees and maritime pines, passing vineyards and Maremma cattle along the way, you finally arrive at this restaurant which serves trattoria-style cuisine full of intense, familiar and reassuring flavours. The decor here is elegant with the occasional rustic touch, while the service is of the highest level yet pleasantly friendly and informal. Welcome to Bartolini’s Maremma restaurant! Here, resident chef Bruno De Moura Cossio offers a choice of dishes with one common denominator, namely charcoal grilling. All the dishes served here have been grilled in some way, so that they have a distinctive barbecued flavour. However, although the chef’s Brazilian origins are obvious in many different 

Now we can create a tsv file with all the informations of all the restaurants

In [42]:
root = 'restaurants_html'
output= 'dataset/restaurant_info.tsv'
parser.save_all_restaurant_info_to_tsv(root, output) #actual implementation in Parser class

Data saved to dataset/restaurant_info.tsv


In [43]:
df = pd.read_table('dataset/restaurant_info.tsv', index_col=0)

# <strong> Search Engine </strong>

## <strong> 2.0.0. Preprocessing the Text

Before building the search engine, we need to prepare and clean the restaurant descriptions in our dataset. To accomplish this, we created a class named Preprocessor in preprocessor.py. This class leverages the nltk library to process the text in the description column. It removes stopwords and punctuation, converts the text to lowercase, and applies stemming to reduce words to their base forms. This preprocessing step ensures that the descriptions are standardized, making them more suitable for efficient search and retrieval.

In [44]:
df = pd.read_table('dataset/restaurant_info.tsv')
preprocessor = Preprocessor()
df = preprocessor.filter(df)


[nltk_data] Downloading package stopwords to /home/pavka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/pavka/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Let's apply this function in the "description" column of our dataset.

In [45]:
df.head(2)

Unnamed: 0,restaurantName,address,city,postalCode,country,region,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website,description_filtered
0,Aqua,lungolago Conca d'Oro 11,Torbole,38069,ITA,Trentino-South Tyrol,€€,"Italian Contemporary, Italian",The most attractive space in this restaurant i...,"Air conditioning, Great view, Wheelchair access","Amex, Mastercard, Visa",+39 0464 505142,https://aquaristorante.com/,attract space restaur contemporari almost mila...
1,Le Colonne,viale Giulio Douhet 7/9,Caserta,81100,ITA,Campania,€€€,"Campanian, Creative",You’d recognise this cuisine as southern Itali...,"Air conditioning, Car park","Amex, Mastercard, Visa",+39 0823 467494,https://www.lecolonnemarziale.it/,’ recognis cuisin southern italian even blindf...


Now let's drop the column of description as we don't need it anymore and save the result

In [None]:
df.drop(columns='description',inplace=True)
df.to_csv('dataset/restaurant_info_filtered.tsv', sep="\t")

## <strong> 2.1. Conjunctive Query

In [4]:
df = pd.read_table("dataset/restaurant_info_filtered.tsv", index_col=0)

In [29]:
df.head(5)

Unnamed: 0_level_0,address,city,postalCode,country,region,priceRange,cuisineType,facilitiesServices,creditCards,phoneNumber,website,description_filtered
restaurantName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Casa Fantini/Lake Time,piazza Motta,Pella,28010,ITA,Piedmont,€€€,"Country cooking, Contemporary","Air conditioning, Terrace, Wheelchair access","Amex, Mastercard, Visa",+39 0322 969893,https://www.casafantinilaketime.com/it/il-rist...,situat lakefront attract town pella modern wel...
Le Colonne,viale Giulio Douhet 7/9,Caserta,81100,ITA,Campania,€€€,"Campanian, Creative","Air conditioning, Car park","Amex, Mastercard, Visa",+39 0823 467494,https://www.lecolonnemarziale.it/,’ recognis cuisin southern italian even blindf...
Aqua,lungolago Conca d'Oro 11,Torbole,38069,ITA,Trentino-South Tyrol,€€,"Italian Contemporary, Italian","Air conditioning, Great view, Wheelchair access","Amex, Mastercard, Visa",+39 0464 505142,https://aquaristorante.com/,attract space restaur contemporari almost mila...
Osteria Mondo d'Oro,via Mondo d'Oro 4,Verona,37121,ITA,Veneto,€,"Italian, Seasonal Cuisine","Air conditioning, Terrace","Amex, Mastercard, Visa",+39 045 894 9290,http://www.osteriamondodoro.it,describ “ osteria ” inn famou multi-michelin-s...
La Taverna di Bacco,"largo Luigi Trafelli, 5",Nettuno,48,ITA,Lazio,€€,Modern Cuisine,"Air conditioning, Interesting wine list, Terra...","Amex, Mastercard, Visa",+39 366 905 3795,https://www.latavernadibacconettuno.it,small restaur stone ’ throw sea memor chic dec...


## <strong> 2.1.1. Create Your Index!

Let's create a vocabulary that maps each word to a unique integer (term_id) and save it in a csv file.

In [5]:
all_descriptions = df['description_filtered'].str.cat(sep=' ')
all_descriptions = list(set(all_descriptions.split(" ")))
vocabulary = {word:id for id, word in enumerate(all_descriptions)}

In [6]:
# Save this vocabulary to a file with utf-8 encoding in order to be able to handle all the characters
with open('dataset/vocabulary.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['word', 'term_id'])  # Header
    for word, id in vocabulary.items():
        writer.writerow([word, id])

Now let's create an inverted index.

In [7]:
next_id = max(vocabulary.values()) + 1  # Start IDs from the next available ID

inverted_index = {}
for _, row in df.iterrows():
    restaurant_id = row['restaurantName']
    description = row['description_filtered']
    
    for i in description:
        if i not in vocabulary: # Handling missing values
            vocabulary[i] = next_id
            term_id = next_id
            next_id += 1
        else:
            term_id = vocabulary[i]
        
        if term_id not in inverted_index:
            inverted_index[term_id] = []
        if restaurant_id not in inverted_index[term_id]:
            inverted_index[term_id].append(restaurant_id)

Save inverted index into a file 

In [39]:
with open('inverted_index.json', 'w') as jsonfile:
    json.dump(inverted_index, jsonfile)

## <strong> 2.1.2. Execute the Query

In [62]:

def searching(query, df, vocabulary, inverted_index):
    # Process the query terms
    query_text = word_tokenize(query.lower())  # Tokenize and lowercase query
    term_ids = []  # List to store term IDs for query text
    
    # Check each query term in the vocabulary
    for term in query_text:
        if term in vocabulary:  # Check if term is included in vocabulary
            term_ids.append(vocabulary[term])
    
    # Find restaurants containing all query terms
    if not term_ids:
        return []  # No terms matched in vocabulary 
    
    # Get the initial list of restaurant IDs that contain the first term
    ideal_restaurants = set(inverted_index.get(str(term_ids[0]), []))
    
    # Narrow down results for each additional term
    for term_id in term_ids[1:]:
        ideal_restaurants.intersection_update(inverted_index.get(str(term_id), []))
        
    
    # Retrieve restaurant details from the DataFrame
    result = []
    for restaurant_id in ideal_restaurants:
        restaurant = df.loc[df["restaurantName"] == restaurant_id].iloc[0]
        result.append({
            "restaurantName": restaurant["restaurantName"],
            "address": restaurant["address"],
            "description_filtered": restaurant["description_filtered"],
            "website": restaurant["website"]
        })
    
    return result

In [66]:
ideal_restaurants = searching("modern seasonal cuisine", df, vocabulary, inverted_index)
ideal_restaurants

[]

In [65]:
# Test of the function
ideal_restaurants = searching("modern seasonal cuisine", df, vocabulary, inverted_index)

for restaurant in ideal_restaurants:
    print(f"Restaurant Name: {restaurant['restaurantName']}")
    print(f"Address: {restaurant['address']}")
    print(f"Description: {restaurant['description_filtered']}")
    print(f"Website: {restaurant['website']}")
    print("-" * 40)

# <strong> 2.2. Ranked Search Engine with TF-IDF and Cosine Similarity

## <strong> 2.2.1 Inverted Index with TF-IDF Scores

In [48]:
# For each restaurant description, calculate the term frequency for each word
def tf(df, vocabulary):
    tf_dict = defaultdict(lambda: defaultdict(int))  # Nested dictionary to store term counts per document
    for idx, row in df.iterrows():
        description = row["description_filtered"]
        restaurant_id = row["restaurantName"]
        
        
        for term in description:
            if term in vocabulary:  
                term_id = vocabulary[term]
                tf_dict[restaurant_id][term_id] += 1  # Increment count for term_id in this restaurant
    
    
    # Convert raw counts to term frequencies (TF)
    for restaurant_id, term_counts in tf_dict.items():
        total_terms = sum(term_counts.values())
        for term_id in term_counts:
            term_counts[term_id] /= total_terms  # Normalize by dividing by total terms
    
    return tf_dict

In [49]:
# Calculate inverse document frequency (IDF) for each term
def idf(tf_dict, total_documents):
    idf_dict = {}
    for term_id in vocabulary.values():
        doc_count = sum(1 for doc_terms in tf_dict.values() if term_id in doc_terms)
        idf_dict[term_id] = math.log(total_documents / (1 + doc_count))  # Use 1 + doc_count to avoid division by zero
    return idf_dict

In [50]:
# Calculate TF-IDF scores and create an updated inverted index
def tf_idf_inverted_index(tf_dict, idf_dict):
    tf_idf_index = defaultdict(list)
    
    for restaurant_id, term_counts in tf_dict.items():
        for term_id, tf in term_counts.items():
            tf_idf_score = tf * idf_dict[term_id]  # TF-IDF calculation
            tf_idf_index[term_id].append((restaurant_id, tf_idf_score))
    
    return tf_idf_index

Let's combine all the functions above in order to calculate the TF, IDF, and TF-IDF scores, and store the updated inverted index.

In [51]:
# Step 1: Calculate TF
tf_dict = tf(df, vocabulary)

# Step 2: Calculate IDF
total_documents = len(df)
idf_dict = idf(tf_dict, total_documents)

# Step 3: Build the TF-IDF Inverted Index
tf_idf_inverted_index = tf_idf_inverted_index(tf_dict, idf_dict)

## <strong> 2.2.2. Execute the Ranked Query