# Eat Safe, Love

## Notebook Set Up

In [1]:
from pymongo import MongoClient
import pandas as pd
from pprint import pprint

In [2]:
# Create an instance of MongoClient
mongo = MongoClient(port=27017)

In [3]:
# assign the uk_food database to a variable name
db = mongo['uk_food']

In [4]:
# review the collections in our database
print(db.list_collection_names())

['establishments']


In [5]:
# assign the collection to a variable
establishments = db['establishments']

## Part 3: Exploratory Analysis
Unless otherwise stated, for each question: 
* Use `count_documents` to display the number of documents contained in the result.
* Display the first document in the results using `pprint`.
* Convert the result to a Pandas DataFrame, print the number of rows in the DataFrame, and display the first 10 rows.

### 1. Which establishments have a hygiene score equal to 20?

In [6]:
# Find the establishments with a hygiene score of 20
query = {"scores.Hygiene": 20}  #setting a filter
field = {"_id": 0, "BusinessName": 1, "RatingValue": 1,"scores.Hygiene": 1}  # Limiting fields to show in the output to cleaner display
results = establishments.find(query, field)

# Use count_documents to display the number of documents in the result
count = establishments.count_documents(query)
print(f"There {count} establishments with a hygiene score of 20.")

# Display the first document in the results using pprint
first_document = establishments.find_one(query, field)
first_document

There 27 establishments with a hygiene score of 20.


{'BusinessName': 'The Chase Rest Home',
 'RatingValue': 0,
 'scores': {'Hygiene': 20}}

In [7]:
# Convert the result to a Pandas DataFrame to convert it to a list of dictionaries that allows to save as a Pandas DataFrame
df = pd.DataFrame(list(results))

# Display the number of rows in the DataFrame
rows_count = len(df)
print(f"There are {rows_count} rows in Data Frame")

# Display the first 10 rows of the DataFrame
print(df.head(10))

There are 27 rows in Data Frame
                                        BusinessName  RatingValue  \
0                                The Chase Rest Home            0   
1                                         Brenalwood            0   
2                                      Melrose Hotel            0   
3                                      Seaford Pizza            1   
4                                   Ashby's Butchers            0   
5                                      Golden Palace            0   
6                                              F & S            0   
7  Westview Playgroup Based At Downsview Comm Pri...            1   
8                        Whatever The Weather Coffee            0   
9                        Kings Restaurant (Oriental)            0   

            scores  
0  {'Hygiene': 20}  
1  {'Hygiene': 20}  
2  {'Hygiene': 20}  
3  {'Hygiene': 20}  
4  {'Hygiene': 20}  
5  {'Hygiene': 20}  
6  {'Hygiene': 20}  
7  {'Hygiene': 20}  
8  {'Hygiene': 20} 

### 2. Which establishments in London have a `RatingValue` greater than or equal to 4?

In [10]:

# Find the establishments with London as the Local Authority and has a RatingValue greater than or equal to 4.
query = {
    "RatingValue": {"$gte": 4}, # set ratingValue equal or greater than 4
    "LocalAuthorityName": {"$regex": "London", "$options": "i"}  # case-insensitive regex match for "London"
}

fields = {    # setting filters so the dispaly does not look too long, but contains imprtant fields
    "_id": 0,
    "BusinessName": 1,
    "RatingValue": 1,
    "scores.Hygiene": 1,
    "geocode.longitude": 1,
    "geocode.latitude": 1,
    "LocalAuthorityName": 1
}

# Execute the query
rest_data = establishments.find(query, fields)

# Use count_documents to display the number of documents in the result
rest_count = establishments.count_documents(query)
print(f"There are {rest_count} restaurants in our dataset with ratings of 4 or higher and located in London.")
print("")

# Display the first document in the results using pprint
first_rest = rest_data[0]  # defining the first item (zero index) in already filtered data
pprint(first_rest)  # displaying the first restaurant

There are 27 restaurants in our dataset with ratings of 4 or higher and located in London.

{'BusinessName': "Charlie's",
 'LocalAuthorityName': 'City of London Corporation',
 'RatingValue': 4,
 'geocode': {'latitude': 51.369321, 'longitude': 0.508551},
 'scores': {'Hygiene': 5}}


In [11]:
# Convert the result to a Pandas DataFrame
rest_db = pd.DataFrame(list(rest_data))

# Display the number of rows in the DataFrame
rows_number = len(rest_db)
print(f"There are {rows_number} rows in DataFrame.")

# Display the first 10 rows of the DataFrame
print(rest_db.head(10))


There are 27 rows in DataFrame.
                    BusinessName  RatingValue          LocalAuthorityName  \
0                      Charlie's            4  City of London Corporation   
1        Mv City Cruises Erasmus            5  City of London Corporation   
2      Benfleet Motor Yacht Club            4  City of London Corporation   
3                     Mv Valulla            5  City of London Corporation   
4                  Tereza Joanne            5  City of London Corporation   
5  The Nuance Group (UK) Limited            5  City of London Corporation   
6                       WH Smith            5  City of London Corporation   
7         Mv Sunborn Yacht Hotel            5  City of London Corporation   
8              Good Hotel London            5  City of London Corporation   
9      Wake Up Docklands Limited            5  City of London Corporation   

           scores                                            geocode  
0  {'Hygiene': 5}     {'longitude': 0.508551, 'la

### 3. What are the top 5 establishments with a `RatingValue` rating value of 5, sorted by lowest hygiene score, nearest to the new restaurant added, "Penang Flavours"?

In [58]:
# Search within 0.01 degree on either side of the latitude and longitude.
# Rating value must equal 5
# Sort by hygiene score

degree_search = 0.01 # setting limits of search parameters
latitude = 51.49014200 # setting latitute of the restaurant "Penanag Flavours" near which we do our search
longitude = 0.08384000 # setting longitute of the restaurant "Penanag Flavours" near which we do our search 
fields = {"_id": 0, "BusinessName": 1, "RatingValue": 1,"scores.Hygiene": 1, "geocode.longitude": 1, "geocode.latitude": 1, "distance": 1} # setting filters for fields to be displayed

# Define the query based on required rating value and search (within 0.01 degree on either side of the latitude and longitude).
query = {
    "RatingValue": 5,
    "geocode.latitude": {"$gte": latitude - degree_search, "$lte": latitude + degree_search},
    "geocode.longitude": {"$gte": longitude - degree_search, "$lte": longitude + degree_search}
}

# Sort by hygiene score (ascending) and then by distance (geospatial)
sort = [
    ("scores.Hygiene", 1),   # Ascending order of Hygiene score (start with the lowest)
    ("Distance", 1)          # Ascending order of Distance to find the closest establishments
]
limit = 5
results = establishments.find(query, fields).sort(sort).limit(limit)

# Print the results
print (f"Top 5 establishments with a RatingValue of 5, sorted by lowest hygiene score and closest proximity to 'Penang Flavours':")
print("")
for result in results:
    pprint(result)


Top 5 establishments with a RatingValue of 5, sorted by lowest hygiene score and closest proximity to 'Penang Flavours':

{'BusinessName': 'Iceland',
 'RatingValue': 5,
 'geocode': {'latitude': 51.4871482849121, 'longitude': 0.0924199968576431},
 'scores': {'Hygiene': 0}}
{'BusinessName': 'Howe and Co Fish and Chips - Van 17',
 'RatingValue': 5,
 'geocode': {'latitude': 51.4875335693359, 'longitude': 0.0925370007753372},
 'scores': {'Hygiene': 0}}
{'BusinessName': 'Volunteer',
 'RatingValue': 5,
 'geocode': {'latitude': 51.4873437, 'longitude': 0.09208},
 'scores': {'Hygiene': 0}}
{'BusinessName': 'Plumstead Manor Nursery',
 'RatingValue': 5,
 'geocode': {'latitude': 51.481517791748, 'longitude': 0.0859939977526665},
 'scores': {'Hygiene': 0}}
{'BusinessName': 'Atlantic Fish Bar',
 'RatingValue': 5,
 'geocode': {'latitude': 51.4867296, 'longitude': 0.0912164},
 'scores': {'Hygiene': 0}}


In [59]:
# Convert result to Pandas DataFrame
nearest_rest_db = pd.DataFrame(list(results))

### 4. How many establishments in each Local Authority area have a hygiene score of 0?

In [83]:
# Create a pipeline that:
# 1. Matches establishments with a hygiene score of 0
new_query =  {'$match': {'scores.Hygiene': 0}} # removed regex as the system required it to be a string and not integer

# 2. Groups the matches by Local Authority
group_matches = {
    '$group': {
        '_id': '$LocalAuthorityCode',  # Group by Local Authority Code
        'count': {'$sum': 1}  # Count the number of documents in each group
    }
}
# 3. Sorts the matches from highest to lowest count
sort_values = {'$sort': {'count': -1, }} #set descending order

# Put the pipeline together
pipeline = [new_query, group_matches, sort_values]
# Run the pipeline through the aggregate method and then save the results to a variable
results = list(establishments.aggregate(pipeline))

# Print the number of documents in the result
print(f"There are {len(results)} Local Authorities with establishments that have a hygiene score of 0 in the area.")
print("")
# Print the first 10 results
print("The first 10 results, sorted by the highest count are:")
for result in results[:10]:
    pprint(result)


There are 53 Local Authorities with establishments that have a hygiene score of 0 in the area.

The first 10 results, sorted by the highest count are:
{'_id': '190', 'count': 772}
{'_id': '511', 'count': 670}
{'_id': '525', 'count': 566}
{'_id': '896', 'count': 557}
{'_id': '185', 'count': 523}
{'_id': '503', 'count': 508}
{'_id': '189', 'count': 502}
{'_id': '113', 'count': 479}
{'_id': '893', 'count': 435}
{'_id': '120', 'count': 430}


In [87]:
# Convert the result to a Pandas DataFrame
authorities_df = pd.DataFrame(list(results))

# Display the number of rows in the DataFrame
rows_number = len(authorities_df)
print(f"The Data Frame has a total of {rows_number} rows.")

# Display the first 10 rows of the DataFrame
print(authorities_df.head(10))


The Data Frame has a total of 53 rows.
   _id  count
0  190    772
1  511    670
2  525    566
3  896    557
4  185    523
5  503    508
6  189    502
7  113    479
8  893    435
9  120    430
