<a href="https://colab.research.google.com/github/Leixb/UPC-ADSDB_e2e/blob/master/notebooks/landing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from zipfile import ZipFile
import glob
import os
from datetime import datetime
import json
import hashlib

In [2]:
# Detect the runtime: on Google Colab the `google.colab` package is importable
# and Drive gets (re)mounted; anywhere else the import fails and the notebook
# falls back to local paths.
try:
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)
except ModuleNotFoundError:
  # No Colab package available -> running on a local machine.
  is_local = True
else:
  # Import and mount both succeeded -> running on Colab.
  is_local = False

In [3]:
# Root of the landing zone: a relative folder when running locally, the
# mounted Drive folder when running on Colab.
if is_local:
  folder_landing = "./landing"
else:
  folder_landing = "/content/drive/MyDrive/ADSDB/landing"

# Sub-folders of the landing zone.
folder_temporal = os.path.join(folder_landing, "temporal")      # incoming zip files
folder_persistent = os.path.join(folder_landing, "persistent")  # extracted data + metadata

# Each ingested archive is unpacked into its own directory below this one.
extract_dir = os.path.join(folder_persistent, "extracted")

In [4]:
def sha256_file(filename, chunk_size=1 << 20):
  """Return the hex-encoded SHA-256 digest of a file's contents.

  The file is hashed incrementally so that memory use stays bounded by
  `chunk_size` regardless of how large the file is.

  Args:
    filename: Path of the file to hash.
    chunk_size: Number of bytes read per iteration (default 1 MiB).

  Returns:
    The digest as a 64-character lowercase hexadecimal string.

  Raises:
    OSError: If the file cannot be opened or read.
  """
  digest = hashlib.sha256()
  with open(filename, "rb") as f:
    # iter(callable, sentinel) keeps calling read() until it returns b"" (EOF).
    for chunk in iter(lambda: f.read(chunk_size), b""):
      digest.update(chunk)
  return digest.hexdigest()

In [5]:
def landing_zip(zip_file):
  """Extract one zip archive into the persistent zone and record its metadata.

  The archive is unpacked into a new directory below the module-level
  `extract_dir`, named `<name-without-extension>-<sha256>-<ingestion_timestamp>`.
  A `metadata.json` file describing the ingestion is written into that same
  directory, and the output directory path is printed.

  This function reads the module-level names `extract_dir` and `time_format`
  at call time and hashes the archive with `sha256_file`; all three must be
  defined before the function is called.

  Args:
    zip_file: Path of the zip archive to ingest.

  Returns:
    A dict with the keys `filename`, `dir`, `source` (archive path relative to
    the output directory), `sha256`, `contents` (member names of the archive),
    `modification_timestamp`, `modification_time`, `ingestion_timestamp` and
    `ingestion_time`.

  Errors raised while reading, hashing or extracting the archive are not
  caught here and propagate to the caller.
  """
  filename = os.path.basename(zip_file)
  # Drop the extension by parsing the name rather than assuming it is exactly
  # four characters long (".zip").
  stem, _ = os.path.splitext(filename)

  modification_time = datetime.fromtimestamp(os.path.getmtime(zip_file))
  modification_timestamp = modification_time.timestamp()

  ingestion_time = datetime.now()
  ingestion_timestamp = ingestion_time.timestamp()

  sha256 = sha256_file(zip_file)

  # Hash + timestamp in the directory name keep repeated ingestions of the
  # same archive name apart.
  out_dir = os.path.join(extract_dir, f"{stem}-{sha256}-{ingestion_timestamp}")
  os.makedirs(out_dir, exist_ok = True)

  print(out_dir)

  with ZipFile(zip_file, 'r') as zipObj:
    zipObj.extractall(out_dir)

    file_list = zipObj.namelist()

  metadata = {
      "filename" : filename,
      "dir" : out_dir,
      "source" : os.path.relpath(zip_file, start = out_dir),
      "sha256" : sha256,

      "contents" : file_list,

      "modification_timestamp" : modification_timestamp,
      "modification_time" : modification_time.strftime(time_format),

      "ingestion_timestamp" : ingestion_timestamp,
      "ingestion_time" : ingestion_time.strftime(time_format),
  }

  # Keep a copy of the metadata next to the extracted files.
  with open(os.path.join(out_dir, "metadata.json"), 'w') as outfile:
    json.dump(metadata, outfile, indent=2, sort_keys=True)

  return metadata

In [6]:
# Format of the human-readable timestamps stored in the metadata. landing_zip
# reads this module-level name when it is called, so it must be set before the
# loop below runs.
time_format = "%Y/%m/%d %H:%M:%S"

# Metadata of every archive ingested in this run,
# keyed by "<filename>-<ingestion_timestamp>".
global_metadata = dict()

# glob returns matches in arbitrary order; sort them so archives are always
# processed (and logged) in the same order.
for zip_file in sorted(glob.glob(os.path.join(folder_temporal, "*.zip"))):
  print(zip_file)

  metadata = landing_zip(zip_file)

  filename = metadata["filename"]
  ingestion_timestamp = metadata["ingestion_timestamp"]

  global_metadata[f"{filename}-{ingestion_timestamp}"] = metadata

glob_meta_file = os.path.join(folder_persistent, "global_metadata.json")

# Load the metadata accumulated by previous runs, if any.
try:
  with open(glob_meta_file, 'r') as f:
    old_meta = json.load(f)
except FileNotFoundError:
  old_meta = dict()

# Entries from this run are added to (and on a key clash override) older ones.
joined_meta = {**old_meta, **global_metadata}

# json.dump returns None, so its result is deliberately not assigned to anything.
with open(glob_meta_file, 'w') as f:
  json.dump(joined_meta, f, indent=2, sort_keys=True)

./landing/temporal/mort_availability.zip
./landing/persistent/extracted/mort_availability-72696e4e26befe59c99ef76edf051c3d797350d4b7c7cb084f60f0b61f6280cc-1642954181.980328
./landing/temporal/mort_country_codes.zip
./landing/persistent/extracted/mort_country_codes-8c410820356fc572845b5281b36f638e044a565f808c4e72efc8fb69b07df6b2-1642954182.003573
./landing/temporal/mort_documentation71f9e29d-7e3f-41e6-aafc-c4c1775c7aa3.zip
./landing/persistent/extracted/mort_documentation71f9e29d-7e3f-41e6-aafc-c4c1775c7aa3-eaff73ddddc2ea57057f3fc20038f0a42ea16c0442b8e28e93e8df47de50542b-1642954182.004449
./landing/temporal/mort_notes.zip
./landing/persistent/extracted/mort_notes-a67b4db9d9867e9076791d77c3ce9895eb19a7c6a789f367764676ad23eaec36-1642954182.007731
./landing/temporal/mort_pop.zip
./landing/persistent/extracted/mort_pop-3b8f463ba095690338a8d9692ab4bb0457639eefff12c9df7486c1cdaf7ac833-1642954182.008323
./landing/temporal/morticd07.zip
./landing/persistent/extracted/morticd07-22b463f703974c247