In [5]:
import pandas as pd
import glob
import gc
import numpy as np
import dask.dataframe as dd

## Functions and Variables

In [2]:
def find_files(path, pattern):
    """Return the paths under ``path`` whose names match the glob ``pattern``.

    Parameters
    ----------
    path : str
        Directory to search (joined to ``pattern`` with a ``/``).
    pattern : str
        Glob pattern, e.g. ``'*.csv'``.

    Returns
    -------
    list of str
        Matching paths in ascending lexicographic order. ``glob.glob`` makes
        no ordering guarantee, so the result is sorted to keep it
        deterministic across runs and platforms.
    """
    return sorted(glob.glob(f'{path}/{pattern}'))

In [32]:
def add_index(df, prefix):
    """Give ``df`` a sequential integer index named ``'index'``, in place.

    The row count is written with ``d = len(str(len(df)))`` digits; the index
    then runs from ``prefix * 10**d + 1`` to ``prefix * 10**d + len(df)``.
    For example, 3 rows with prefix 312 are indexed 3121, 3122, 3123.

    The frame is modified in place (a temporary ``'index'`` column is written
    and then promoted to the index); nothing is returned.
    """
    n_rows = len(df)
    n_digits = len(str(n_rows))
    first_id = prefix * 10**n_digits + 1

    # Write the ids as a column first, then promote that column to the index.
    df['index'] = range(first_id, first_id + n_rows)
    df.set_index('index', inplace=True)

In [34]:
def safe_delete(df, list_columns):
    """Return ``df`` without the columns named in ``list_columns``.

    Names that are not present in ``df`` are skipped silently. The input
    frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
    list_columns : iterable
        Column labels to remove if they exist.

    Returns
    -------
    pandas.DataFrame
    """
    # One drop call instead of one per column: each separate drop builds a
    # new frame. list() keeps the element-wise handling of the argument that
    # the per-item loop had.
    return df.drop(columns=list(list_columns), errors='ignore')

In [19]:
# List the CSV files in the source folder.
# NOTE(review): `files` is reassigned in the Processing section, and this is
# an absolute, machine-specific path — confirm this cell is still needed.
files = find_files(
    path='D:\\_repositories\\Aggregation\\SIH\\3. Concatenado',
    pattern='*.csv',
)

---

## Processing

In [36]:
# glob.glob() returns matches in arbitrary order; sort so the yearly files
# are always listed, and later processed, in ascending name order.
files = sorted(glob.glob('results/*.parquet'))

In [38]:
# Display the matched parquet paths
files

['results\\RD2012.parquet',
 'results\\RD2013.parquet',
 'results\\RD2014.parquet',
 'results\\RD2015.parquet',
 'results\\RD2016.parquet',
 'results\\RD2017.parquet',
 'results\\RD2018.parquet',
 'results\\RD2019.parquet',
 'results\\RD2020.parquet',
 'results\\RD2021.parquet',
 'results\\RD2022.parquet']

In [44]:
# Adds the numeric index to each yearly file, drops leftover index columns,
# and writes the result to 'final shape/' as parquet.

inicial_ano = 12  # two-digit year of the file being processed

# glob.glob() gives no ordering guarantee, while the year counter assumes the
# files arrive as RD2012, RD2013, ... so iterate in sorted order.
for item in sorted(files):
    # Guard against a missing or out-of-sequence year: without it, a gap would
    # silently write one year's rows under another year's prefix and file name.
    if f'RD20{inicial_ano}' not in item:
        raise ValueError(
            f'Expected a file for RD20{inicial_ano}, got {item!r}'
        )

    df = pd.read_parquet(item)

    # Index: the prefix is '3' followed by the two-digit year (312 for year 12)
    add_index(df, int('3' + str(inicial_ano)))

    # Deleting past index columns
    df = safe_delete(df, ['Unnamed: 0', 'Unnamed: 0.1'])

    # Saving to parquet
    df.to_parquet(f'final shape/RD20{inicial_ano}.parquet')
    inicial_ano += 1

    # Memory safety: release the frame before the next file is loaded
    del df
    gc.collect()

---

## Others

In [16]:
# glob.glob() returns matches in arbitrary order; sort so the per-file
# statistics printed from this list line up with ascending file names.
files = sorted(glob.glob('results/*.parquet'))

In [18]:
# Display the matched parquet paths
files

['results\\RD2012.parquet',
 'results\\RD2013.parquet',
 'results\\RD2014.parquet',
 'results\\RD2015.parquet',
 'results\\RD2016.parquet',
 'results\\RD2017.parquet',
 'results\\RD2018.parquet',
 'results\\RD2019.parquet',
 'results\\RD2020.parquet',
 'results\\RD2021.parquet',
 'results\\RD2022.parquet']

In [26]:
# For every parquet file, print "<memory usage in bytes> - <duplicated rows>".
# memory_usage() is called with its defaults here.
for item in files:
    df = pd.read_parquet(item)
    print(df.memory_usage().sum(), df.duplicated().sum(), sep=' - ')
    

2384084610 - 0
2714527792 - 0
3766512573 - 0
3409573112 - 0
3384043318 - 0
3347896289 - 0
3483350150 - 0
3018110962 - 0
3039705103 - 0
3216779181 - 0
3530272864 - 0
