In [5]:
import os

import pandas as pd
from pandas_profiling import ProfileReport

In [6]:
# Show the kernel's working directory: the relative paths used in this notebook
# ("data/in", "profiling", "data/raw") are resolved against it.
# NOTE(review): the saved output of this cell exposes an absolute local path;
# consider clearing it before sharing the notebook.
os.getcwd()

'/Users/dgarhdez/Desktop/IE/ie-pda/2023/september_2023/pda2/sessions_material/session_03_04'

## RAW

In [7]:
# Map each dataset name to its archive file in data/in,
# e.g. {"index": "index.zip"}.
# os.path.splitext drops only the trailing extension, whereas
# str.replace(".zip", "") would also strip any ".zip" occurring earlier in the
# file name (e.g. "a.zipped.zip" -> "aped").
datasources = {
    os.path.splitext(source)[0]: source
    for source in os.listdir("data/in")
    if source.endswith(".zip")
}

datasources

{'epidemiology': 'epidemiology.zip',
 'vaccinations': 'vaccinations.zip',
 'health': 'health.zip',
 'demographics': 'demographics.zip',
 'hospitalizations': 'hospitalizations.zip',
 'index': 'index.zip'}

### Load data sources

In this part we simply load all the data with pandas for further processing. Here we only have a handful of CSV files, but data loading (or extraction, the "E" in "ETL") can be much more complicated when we have to connect to a client's database or when the files we have to process are Excel files with macros 😱

In [9]:
# Read every archive listed in `datasources` into a DataFrame,
# keyed by dataset name.
datasets = {
    name: pd.read_csv(f"data/in/{archive}")
    for name, archive in datasources.items()
}

### Execute a data profiling for each source

To get a brief, direct summary of the data, we can run a profiling report on each data source individually. This gives us a quick overview of the state of the data and helps us decide the next steps in the following data processing phases (how to filter, how to impute missing values, whether column names are correct, etc.)

In [10]:
# Write one minimal-mode HTML profiling report per dataset into profiling/.
os.makedirs("profiling", exist_ok=True)

for name, frame in datasets.items():
    report_path = f"profiling/{name}.html"
    ProfileReport(frame, minimal=True).to_file(report_path)

Summarize dataset: 100%|██████████| 16/16 [00:01<00:00, 13.05it/s, Completed]                            
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.32s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  6.61it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 1349.95it/s]
Summarize dataset: 100%|██████████| 38/38 [00:00<00:00, 51.79it/s, Completed]                                                      
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  3.31it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00, 21.90it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 1552.30it/s]
Summarize dataset: 100%|██████████| 20/20 [00:00<00:00, 631.45it/s, Completed]                                            
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  6.34it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00, 40.50it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 1726.76it/s]
Summarize dataset: 100%|██████

### Save data into `raw` schema

In [11]:
# Create the output folder for the raw schema; exist_ok=True keeps this cell
# safe to re-run when the folder already exists.
os.makedirs("data/raw", exist_ok=True)

In [12]:
# Persist each dataset as a zip-compressed CSV under data/raw/.
# The compression is spelled out instead of being inferred from the ".zip"
# extension, and archive_name gives the file inside the archive an explicit
# "<name>.csv" name.
for key, value in datasets.items():
    value.to_csv(
        f"data/raw/{key}.zip",
        index=False,
        compression={"method": "zip", "archive_name": f"{key}.csv"},
    )