In [None]:
%cd ../..

In [6]:
import os
from pprint import pprint
from pymongo import MongoClient

from BIXI_Services.BIXI_Historical_Data_Checker.main import handler as checker_handler
from BIXI_Services.BIXI_Historical_Data_Processor.main import Config
from BIXI_Services.BIXI_Historical_Data_Processor.main import async_handler as processor_handler

from dotenv import load_dotenv


In [7]:
load_dotenv()
# os.environ["ATLAS_URI"] = 
os.environ["MONGO_DATABASE_NAME"] = "prod-monitoring-mtl"
os.environ["BIXI_URL_COLLECTION"] = "historic_data_urls"
os.environ["BIXI_LOCATION_COLLECTION"] = "station_locations"
os.environ["BIXI_TRIP_COLLECTION"] = "trips"
os.environ["BIXI_DATA_URL"] = "https://bixi.com/en/open-data"
os.environ["BIXI_CDN"] = "https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/"
os.environ["BIXI_DEFAULT_EXTRACT_PATH"] = "/tmp/data/"
os.environ["BIXI_CHUNK_SIZE"] = "500000"
os.environ["BIXI_CONCURRENCY"] = "8"
os.environ["BIXI_QUEUE_SIZE"] = "1"

# Historical data initial db operations

## Initialize DB objects

‼️ `db.drop_collection(collection_name)` deletes all data.

In [8]:
config = Config(**os.environ)
client = MongoClient(config.ATLAS_URI)
db = client[config.MONGO_DATABASE_NAME]
for collection_name in db.list_collection_names():
    db.drop_collection(collection_name)

In [9]:
os.environ["BIXI_DEFAULT_EXTRACT_PATH"]
os.environ["MONGO_DATABASE_NAME"]

'test-monitoring-mtlggggggggggg'

## Get the urls

In [5]:
checker_results = checker_handler(None, None)
urls = checker_results["urls"]
pprint(urls)

checking for new historic data..
scraped_urls {2014: 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2014.zip', 2015: 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2015.zip', 2016: 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2016.zip', 2017: 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2017.zip', 2018: 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2018.zip', 2019: 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2019.zip', 2020: 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2020.zip', 2021: 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2021.zip', 2022: 'https://s3.ca-central-1.amazonaws.com/cdn.bixi.com

## ETL

The format of the historic data have changed over the years. We've implemented an ETL strategy for the current format which is in effect since 2022. Urls for prior years will not be processed until we've created appropriate ETL strategies and configured them in `BIXI_Services.BIXI_Historical_Data_Processor.etl.transform_load.context` module.

In [6]:
if urls:
    processor_results = await processor_handler({"urls": urls}, None)
    pprint(processor_results)

historic data processing started.
ETL process started for URL: https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2014.zip
start download and extract https://s3.ca-central-1.amazonaws.com/cdn.bixi.com/wp-content/uploads/2023/06/Historique-BIXI-2014.zip
extracted files: ['/Users/kochiallagbe/Documents/GitHub/PyTransit-MicroServices/tmp/data/BixiMontrealRentals2014/OD_2014-04.csv', '/Users/kochiallagbe/Documents/GitHub/PyTransit-MicroServices/tmp/data/BixiMontrealRentals2014/OD_2014-05.csv', '/Users/kochiallagbe/Documents/GitHub/PyTransit-MicroServices/tmp/data/BixiMontrealRentals2014/OD_2014-06.csv', '/Users/kochiallagbe/Documents/GitHub/PyTransit-MicroServices/tmp/data/BixiMontrealRentals2014/OD_2014-07.csv', '/Users/kochiallagbe/Documents/GitHub/PyTransit-MicroServices/tmp/data/BixiMontrealRentals2014/OD_2014-08.csv', '/Users/kochiallagbe/Documents/GitHub/PyTransit-MicroServices/tmp/data/BixiMontrealRentals2014/OD_2014-09.csv', '/Users/kochia

## Creating indexes

For [UC006](https://github.com/Monitoring-Mtl/Frontend/issues/114), aggregations will be frequently performed on the `DURATIONMS` columns. For that reason, we create an index on `DURATIONSMS` for faster lookups. Depending on the use case, more columns can be indexed.

In [7]:
db[config.BIXI_TRIP_COLLECTION].create_index("DURATIONMS")
# db[config.BIXI_TRIP_COLLECTION].create_index("STARTTIMEMS")
# db[config.BIXI_TRIP_COLLECTION].create_index("ENDTIMEMS")

'DURATIONMS_1'

## DB statistics

In [8]:
import datetime


def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.1f}Yi{suffix}"

for collection_name in db.list_collection_names():
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"Collection: {collection_name} as of {current_time}")
    stats = db.command("collStats", collection_name)
    print(f"Collection: {collection_name}")
    print(f"Storage size: {sizeof_fmt(stats['storageSize'])}")
    print(f"Documents: {stats['count']}")
    print(f"Avg. document size: {sizeof_fmt(stats['avgObjSize'])}")
    print(f"Indexes: {stats['nindexes']}")
    print(f"Total index size: {sizeof_fmt(stats['totalIndexSize'])}\n")

Collection: trips as of 2024-04-02 02:21:05
Collection: trips
Storage size: 1.3GiB
Documents: 20883589
Avg. document size: 178.0B
Indexes: 2
Total index size: 400.4MiB

Collection: station_locations as of 2024-04-02 02:21:05
Collection: station_locations
Storage size: 124.0KiB
Documents: 1103
Avg. document size: 119.0B
Indexes: 1
Total index size: 92.0KiB

Collection: historic_data_urls as of 2024-04-02 02:21:05
Collection: historic_data_urls
Storage size: 20.0KiB
Documents: 3
Avg. document size: 128.0B
Indexes: 1
Total index size: 20.0KiB

