<a href="https://colab.research.google.com/github/MateoJacomeUPC/BDM/blob/master/BDM_Project_OpenDataIncome.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install "dask[complete]"

In [2]:
import os
from datetime import datetime
import pandas as pd
import dask.dataframe as dd
import numpy as np
from pyarrow import fs
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow as pa

Using Dask with Remote Data
- https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html
- https://docs.dask.org/en/latest/generated/dask.dataframe.read_csv.html#dask.dataframe.read_csv

In [3]:
# returns a dask dataframe that is later processed to parquet for persistent storage
# TODO: should this also return the list of loaded files so that they can be
# deleted after processing? For now only the dataframe is returned.
def DaskLoadPartitionedCSV(directory, source, blocksize='64MB'):
  """ 
  Load every csv file of a source folder into a single dask dataframe.

  Input: a string for the data directory path,  
  a string of the source folder name that contains partitioned data in csv format,
  optionally the blocksize forwarded to dd.read_csv (defaults to '64MB')
  Output: dask dataframe with two added columns: 'sourceFile' (filled in by
  dd.read_csv through include_path_column) and 'load_time' (the timestamp
  taken when this function ran). No file list is returned.
  """
  # glob pattern matching all csv files of the source folder
  path = directory + "/" + source + '/*.csv'
  # loading all csv files in path to a single dask dataframe, adding column for source file
  ddf = dd.read_csv(path, include_path_column='sourceFile', blocksize=blocksize)
  # add timestamp to column called 'load_time' (one value shared by all rows)
  ddf['load_time'] = datetime.now()
  return ddf

Setting Datatypes in Dask
- https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.astype.html

In [4]:
def setSchema(source, ddf):
  """ 
  Cast an imported dask dataframe to the storage schema of its source.

  Input: a string label for the source data,  
  a dask dataframe that has been imported from source files
  Output: dask dataframe that complies with schema; the dataframe is handed
  back as received when the label is not "opendatabcn-income"
  """
  # labels without a schema defined here pass through untouched
  if source != "opendatabcn-income":
    return ddf

  index_column = 'Índex RFD Barcelona = 100'
  text_type = "string[pyarrow]"
  # smallest possible datatype for every column
  column_types = {
      'Any': 'uint16',
      'Codi_Districte': 'uint8',
      'Nom_Districte': text_type,
      'Codi_Barri': 'uint8',
      'Nom_Barri': text_type,
      'Població': 'uint32',
      index_column: text_type,
      'sourceFile': text_type,
  }
  typed = ddf.astype(column_types)
  # the index column holds mixed datatypes: it is cast to string above and
  # parsed here with dd.to_numeric(), coercing values that fail to parse
  typed[index_column] = dd.to_numeric(typed[index_column], errors='coerce')
  return typed

In [5]:
def getPyarrowTable(source, ddf):
  """ 
  Materialize a dask dataframe and convert it to a pyarrow table.

  Input: a string label for the source data,  
  a dask dataframe with the correct schema
  Output: pyarrow table that complies with schema, sorted by and indexed on
  ("Nom_Districte", "Nom_Barri", "Any")
  Raises: ValueError when no pyarrow schema is defined for the source label
  """
  # fail early with a clear message; falling through would otherwise end in
  # an UnboundLocalError because no table is ever built
  if source != "opendatabcn-income":
    raise ValueError("no pyarrow schema defined for source '%s'" % source)

  # set pyarrow schema
  pa_schema = pa.schema([
      ("Any", pa.uint16()),
      ("Codi_Districte", pa.uint8()),
      ("Nom_Districte", pa.string()),
      ("Codi_Barri", pa.uint8()),
      ("Nom_Barri", pa.string()),
      ("Població", pa.uint32()),
      ("Índex RFD Barcelona = 100", pa.float64()),
      ('sourceFile', pa.string()),
      ("load_time", pa.timestamp('ns')) # datetime.now()
      ])
  # convert Dask df to Pandas df (this pulls the whole dataset into memory)
  df = ddf.compute()
  # sort and set index using Pandas
  index_columns = ["Nom_Districte", "Nom_Barri", "Any"]
  df = df.sort_values(by=index_columns)
  df = df.set_index(index_columns)
  # Load Pandas df to pyarrow table using schema, keeping the index levels
  return pa.Table.from_pandas(df, schema=pa_schema, preserve_index=True)

In [6]:
directory = "/content/drive/MyDrive/BDM-Project/Data"
source = "opendatabcn-income"
ddf = DaskLoadPartitionedCSV(directory, source) # load data
ddf = setSchema(source, ddf) # set schema
table = getPyarrowTable(source, ddf) # convert to pyarrow table

# the parquet file and its _metadata sidecar are written into the source folder
output_dir = directory + "/" + source
parquet_name = source + ".parquet"

# Write a parquet table and collect metadata information
# NOTE: row_group_size is the maximum number of ROWS per row group, not a byte size
metadata_collector = []
pq.write_table(table, output_dir + "/" + parquet_name,
               metadata_collector=metadata_collector,
               row_group_size=134217728)
# store the file path relative to the _metadata file, so that the row groups
# listed in _metadata point at the data file they belong to
metadata_collector[-1].set_file_path(parquet_name)
#use pq.write_metadata to combine and write in a single step
pq.write_metadata(table.schema, output_dir + "/_metadata",
    metadata_collector=metadata_collector)


Checking the written .parquet file by inspecting its metadata and schema, and then reading it into a new dataframe

In [13]:
# open the written file and display the metadata stored in its footer
parquet_file = pq.ParquetFile(
    '/content/drive/MyDrive/BDM-Project/Data/opendatabcn-income/opendatabcn-income.parquet'
)
metadata = parquet_file.metadata
metadata

<pyarrow._parquet.FileMetaData object at 0x7f121737d1d0>
  created_by: parquet-cpp-arrow version 6.0.1
  num_columns: 9
  num_rows: 811
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 5382

In [14]:
# read only the schema of the written file and display it
schema = pq.read_schema(
    '/content/drive/MyDrive/BDM-Project/Data/opendatabcn-income/opendatabcn-income.parquet'
)
schema

Any: uint16
Codi_Districte: uint8
Nom_Districte: string
Codi_Barri: uint8
Nom_Barri: string
Població: int64
Índex RFD Barcelona = 100: double
sourceFile: string
load_time: timestamp[us]
-- schema metadata --
pandas: '{"index_columns": ["Any", "Nom_Districte", "Nom_Barri"], "column' + 1295

In [15]:
# load the written file back into pandas and preview the first 20 rows
new_df = pd.read_parquet(
    '/content/drive/MyDrive/BDM-Project/Data/opendatabcn-income/opendatabcn-income.parquet'
)
new_df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Codi_Districte,Codi_Barri,Població,Índex RFD Barcelona = 100,sourceFile,load_time
Any,Nom_Districte,Nom_Barri,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2007,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",1,4,22572,80.2,/content/drive/MyDrive/BDM-Project/Data/openda...,2022-04-04 12:49:12.835716
2008,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",1,4,22649,81.8,/content/drive/MyDrive/BDM-Project/Data/openda...,2022-04-04 12:49:12.835716
2009,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",1,4,23136,88.8,/content/drive/MyDrive/BDM-Project/Data/openda...,2022-04-04 12:49:12.835716
2010,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",1,4,23101,91.2,/content/drive/MyDrive/BDM-Project/Data/openda...,2022-04-04 12:49:12.835716
2011,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",1,4,22632,86.1,/content/drive/MyDrive/BDM-Project/Data/openda...,2022-04-04 12:49:12.835716
2012,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",1,4,22873,89.3,/content/drive/MyDrive/BDM-Project/Data/openda...,2022-04-04 12:49:12.835716
2013,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",1,4,22821,91.2,/content/drive/MyDrive/BDM-Project/Data/openda...,2022-04-04 12:49:12.835716
2014,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",1,4,22674,92.5,/content/drive/MyDrive/BDM-Project/Data/openda...,2022-04-04 12:49:12.835716
2015,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",1,4,22305,96.4,/content/drive/MyDrive/BDM-Project/Data/openda...,2022-04-04 12:49:12.835716
2016,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",1,4,22380,97.8,/content/drive/MyDrive/BDM-Project/Data/openda...,2022-04-04 12:49:12.835716
