In [None]:
%pip install -r ../requirements.txt

In [5]:
import requests
from bs4 import BeautifulSoup
from pprint import pprint
from dotenv import load_dotenv
from pymongo import MongoClient
import os
from datetime import datetime
from pymongo.server_api import ServerApi
import zipfile
from io import BytesIO
load_dotenv()

True

In [6]:
BIXI_DATA_URL = "https://bixi.com/en/open-data"
BIXI_CDN = "https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/"
MONGO_URI = os.getenv("PYTRANSIT_MONGO_URI")
BIXI_DB_NAME = os.getenv("BIXI_DB_NAME")
BIXI_HISTORIC_URLS_COLLECTION = os.getenv("BIXI_HISTORIC_URLS_COLLECTION")
DEFAULT_ZIP_PATH = "data/file.zip"
DEFAULT_EXTRACT_PATH = "data/"

## Getting the current urls from the website

In [8]:
def extract_bixi_historic_data_urls(url=BIXI_DATA_URL):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    urls = [a['href'] for a in soup.find_all('a', href=True) if BIXI_CDN in a['href']]
    return urls

In [9]:
urls = extract_bixi_historic_data_urls()
pprint(urls)

['https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2024/03/DonneesOuvertes2024_0102.zip',
 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2024/01/DonneesOuvertes2023_12.zip',
 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/08/DonneesOuverte2022.zip',
 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2021.zip',
 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2020.zip',
 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2019.zip',
 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2018.zip',
 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2017.zip',
 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2016.zip',
 'https://s3.ca-central-1.amazona

## Saving the urls in mongo for later comparison

In [5]:
def save_bixi_urls_in_mongodb(urls):
    client = MongoClient(MONGO_URI, server_api=ServerApi('1'))
    db = client[BIXI_DB_NAME]
    collection = db[BIXI_HISTORIC_URLS_COLLECTION]
    for url in urls:
        document = {"date_added": datetime.now()}
        collection.replace_one({"_id": url}, document, upsert=True)
    print(f"{len(urls)} urls saved.")
    client.close()

In [6]:
save_bixi_urls_in_mongodb(urls)

11 urls saved.


In [9]:
# For good measure
def delete_bixi_urls_from_mongodb(urls):
    client = MongoClient(MONGO_URI, server_api=ServerApi('1'))
    db = client[BIXI_DB_NAME]
    collection = db[BIXI_HISTORIC_URLS_COLLECTION]
    for url in urls:
        collection.delete_one({"_id": url})
    print(f"{len(urls)} urls deleted.")
    client.close()

In [10]:
delete_bixi_urls_from_mongodb(urls[0:2])

2 urls deleted.


## How do we know we have new data?

If we don't have the url saved in our db, we can assume it is new data.

In [11]:
def check_new_data(urls):
    client = MongoClient(MONGO_URI, server_api=ServerApi('1'))
    db = client[BIXI_DB_NAME]
    collection = db[BIXI_HISTORIC_URLS_COLLECTION]
    existing_urls = {document['_id'] for document in collection.find({}, {'_id': 1})}
    new_urls = [url for url in urls if url not in existing_urls]
    client.close()
    return new_urls

In [12]:
new_urls = check_new_data(urls)
pprint(new_urls)

['https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2024/03/DonneesOuvertes2024_0102.zip',
 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2024/01/DonneesOuvertes2023_12.zip']


## Downloading the data

In this notebook we are designing the extraction step, but we should think ahead about the whole process to get an idea of a viable architecture.

The workload is as follows:
- check for new data url
- download a ~500MB zip file from internet
- unzip it to a ~2GB file
- perform various transformations
- load into mongodb
- repeat all the above once a month

The initial assessment is that this workload is too heavy for lambdas. An on-demand EC2 instance is more appropriate. It could be triggered by EventBridge once a month, execute the steps and shutdown. We need to test the process and estimate the duration on a modest machine like a raspberry pi. This would help in estimating the cost as well.

Downloading script:

In [13]:
def download_file(url, output_path=DEFAULT_ZIP_PATH):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with requests.get(url) as r:
        r.raise_for_status()
        with open(output_path, "wb") as file:
            file.write(r.content)
            print(f"successful download: {output_path}")

In [14]:
download_file(urls[0])

successful download: data/file.zip


Resource utilization while running `download_file` on raspberry pi:
```bash
(myenv) ➜  pytransit-bixi-extract du -sh . && /usr/bin/time -v python script.py "https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2024/01/DonneesOuvertes2023_12.zip" && du -sh .
31M     .
successful download: data/file.zip
        Command being timed: "python script.py https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2024/01/DonneesOuvertes2023_12.zip"
        User time (seconds): 10.51
        System time (seconds): 7.22
        Percent of CPU this job got: 51%
        Elapsed (wall clock) time (h:mm:ss or m:ss): 0:34.48
        Average shared text size (kbytes): 0
        Average unshared data size (kbytes): 0
        Average stack size (kbytes): 0
        Average total size (kbytes): 0
        Maximum resident set size (kbytes): 840532
        Average resident set size (kbytes): 0
        Major (requiring I/O) page faults: 0
        Minor (reclaiming a frame) page faults: 207913
        Voluntary context switches: 169791
        Involuntary context switches: 606
        Swaps: 0
        File system inputs: 0
        File system outputs: 726320
        Socket messages sent: 0
        Socket messages received: 0
        Signals delivered: 0
        Page size (bytes): 4096
        Exit status: 0
385M    .
```

It shows that, for downloading `DonneesOuvertes2023_12.zip`, we used 
- `840.5MB` of memory
- `51%` of CPU (Raspberry Pi has 4 cores)
- about `350MB` of disk space
- about `34 seconds` to complete (on ~10MBps internet)



We still have to decompress the downloaded zip file. To save disk space, we could bypass writing the zip on disk and start decompressing directly. But that would take up more memory.

In [10]:
def download_and_extract_zip(url, extract_path=DEFAULT_EXTRACT_PATH):
    os.makedirs(extract_path, exist_ok=True)
    with requests.get(url) as r:
        r.raise_for_status()
        with zipfile.ZipFile(BytesIO(r.content)) as z:
            z.extractall(path=extract_path)
            print(f"File extracted to: {extract_path}")

In [11]:
download_and_extract_zip(urls[0])

File extracted to: data/


Resource utilization:

```bash
(myenv) ➜  pytransit-bixi-extract du -sh . && /usr/bin/time -v python script.py "https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2024/01/DonneesOuvertes2023_12.zip" && du -sh .
31M     .
Files extracted to: data/
        Command being timed: "python script.py https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2024/01/DonneesOuvertes2023_12.zip"
        User time (seconds): 28.01
        System time (seconds): 14.02
        Percent of CPU this job got: 34%
        Elapsed (wall clock) time (h:mm:ss or m:ss): 2:02.76
        Average shared text size (kbytes): 0
        Average unshared data size (kbytes): 0
        Average stack size (kbytes): 0
        Average total size (kbytes): 0
        Maximum resident set size (kbytes): 796724
        Average resident set size (kbytes): 0
        Major (requiring I/O) page faults: 0
        Minor (reclaiming a frame) page faults: 197010
        Voluntary context switches: 167507
        Involuntary context switches: 2749
        Swaps: 0
        File system inputs: 0
        File system outputs: 3823184
        Socket messages sent: 0
        Socket messages received: 0
        Signals delivered: 0
        Page size (bytes): 4096
        Exit status: 0
1.9G    .
```

This shows that, for downloading and decompressing `DonneesOuvertes2023_12.zip`, we used:
- `796.7MB` of memory
- `34%` of CPU
- `~1.9GB` of disk space
- a little over `2 minutes` of running time

After analyzing this, we can see that the extraction workload is not actually so heavy for lambdas if not for the disk space. We can imagine a scenario where, a 1024MB Lambda runs the `download_and_extract_zip` function and saves the decompressed file into a S3 bucket. Then other Lambdas would read from the S3 bucket for the `Transform` and `Load` steps and perform a cleanup afterwards.

Another alternative, as mentioned before is to use an on-demand EC2 instance with a EBS attachment for storage to take care of the whole process.

We can also explore AWS batch. We probably should test the rest of the ETL process to get a better picture of the architectural requirements.

