In [1]:
import pandas as pd
import json
import os
import ast

In [2]:
# Origin tag passed to the export step, where it is embedded in every record Id
# and in the output file names.
origin_json = "202409PROD01REQSRESOURCES"

# Location of the input file, given relative to the current working directory.
data_file_path = os.path.join(
    "..",
    "data",
    "taxone_rum_reqs_13092024_1500_1600_resource.json 1-final.jsonl",
)

In [3]:
# Parsed records: each record is the list of values found between a "[" line
# and the following "]" line of the input file.
data = []

# Read the file line by line; every line holds a bracket or a single value.
with open(data_file_path, 'r', encoding='utf-8') as file:
    current_record = []
    for line in file:
        line = line.strip()
        # Drop the trailing separator comma first, so that "]," closes a
        # record exactly like "]" does.
        if line.endswith(','):
            line = line[:-1].rstrip()
        if not line:
            # Blank lines carry no value; parsing them would raise SyntaxError.
            continue
        if line == '[':
            current_record = []
        elif line == ']':
            data.append(current_record)
        else:
            # Decode as JSON first (handles null/true/false and JSON escapes);
            # fall back to the Python-literal parser for lines that are not
            # strict JSON, which keeps previously accepted input working.
            try:
                value = json.loads(line)
            except json.JSONDecodeError:
                value = ast.literal_eval(line)
            current_record.append(value)

In [4]:
# Number of records parsed from the input file.
len(data)

170154

In [5]:
# Column names, applied positionally to the values of each parsed record.
columns = [
    "datetime",
    "UserId",
    "DBSchema",
    "Tenant",
    "Module",
    "MenuPath",
    "Version",
    "StorageID",
    "NavigatorLanguage",
    "ViewName",
    "ResourceUrl",
    "ResourceDuration",
]

# One row per parsed record.
df = pd.DataFrame(data=data, columns=columns)

df.head(n=3)

Unnamed: 0,datetime,UserId,DBSchema,Tenant,Module,MenuPath,Version,StorageID,NavigatorLanguage,ViewName,ResourceUrl,ResourceDuration
0,2024-09-13T18:00:00.005Z,raimunda.dias@grupobimbo.com,bimbo,AC0,Data Warehouse,Manutenção > Documento Fiscal > Doc. Fiscal de...,210816-0000 :: DW/T1 => 297.1.1-RC [09-09-2024...,d9f47d58-36d9-4fbc-ab89-840dfb6ef726,pt-BR,/taxone/Data Warehouse/Manutenção > Documento ...,https://www.onesourcetax.com/amer1/oms-taxone/...,379000000
1,2024-09-13T18:00:00.020Z,sf181329@prd.com,camargocorrea,B58,EFD - Reinf,REINF > Geração Prévia > Movimentos,210816-0000 :: DW/T1 => 297.1.1-RC [09-09-2024...,ed91053c-16bb-47c2-90eb-381142b88fdd,pt-BR,/taxone/EFD - Reinf/REINF > Geração Prévia > M...,https://www.onesourcetax.com/amer1/oms-taxone/...,488500000
2,2024-09-13T18:00:00.061Z,netoarna,legrand,BGH,EFD - Reinf,REINF > Envio de Eventos - Grandes Volumes > F...,210816-0000 :: DW/T1 => 297.1.1-RC [09-09-2024...,665481df-1e28-47d2-84a7-2c5fb9793c77,pt-BR,/taxone/EFD - Reinf/REINF > Envio de Eventos -...,https://www.onesourcetax.com/amer1/oms-taxone/...,562100000


In [6]:
# Print the frame summary (column dtypes, non-null counts, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170154 entries, 0 to 170153
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   datetime           170154 non-null  object
 1   UserId             170154 non-null  object
 2   DBSchema           170154 non-null  object
 3   Tenant             170154 non-null  object
 4   Module             170154 non-null  object
 5   MenuPath           170154 non-null  object
 6   Version            170154 non-null  object
 7   StorageID          170154 non-null  object
 8   NavigatorLanguage  170154 non-null  object
 9   ViewName           170154 non-null  object
 10  ResourceUrl        170154 non-null  object
 11  ResourceDuration   170154 non-null  int64 
dtypes: int64(1), object(11)
memory usage: 15.6+ MB


In [8]:
# Parse the ISO 8601 strings into timezone-aware UTC timestamps. utc=True keeps
# the column a real datetime dtype even if a value has no offset or the offsets
# differ between rows; otherwise the .dt.tz_convert call below would fail.
df["datetime"] = pd.to_datetime(df["datetime"], format='ISO8601', utc=True)
# Convert to the São Paulo time zone.
df['datetime'] = df['datetime'].dt.tz_convert('America/Sao_Paulo')
df.head(3)

Unnamed: 0,datetime,UserId,DBSchema,Tenant,Module,MenuPath,Version,StorageID,NavigatorLanguage,ViewName,ResourceUrl,ResourceDuration
0,2024-09-13 15:00:00.005000-03:00,raimunda.dias@grupobimbo.com,bimbo,AC0,Data Warehouse,Manutenção > Documento Fiscal > Doc. Fiscal de...,210816-0000 :: DW/T1 => 297.1.1-RC [09-09-2024...,d9f47d58-36d9-4fbc-ab89-840dfb6ef726,pt-BR,/taxone/Data Warehouse/Manutenção > Documento ...,https://www.onesourcetax.com/amer1/oms-taxone/...,379000000
1,2024-09-13 15:00:00.020000-03:00,sf181329@prd.com,camargocorrea,B58,EFD - Reinf,REINF > Geração Prévia > Movimentos,210816-0000 :: DW/T1 => 297.1.1-RC [09-09-2024...,ed91053c-16bb-47c2-90eb-381142b88fdd,pt-BR,/taxone/EFD - Reinf/REINF > Geração Prévia > M...,https://www.onesourcetax.com/amer1/oms-taxone/...,488500000
2,2024-09-13 15:00:00.061000-03:00,netoarna,legrand,BGH,EFD - Reinf,REINF > Envio de Eventos - Grandes Volumes > F...,210816-0000 :: DW/T1 => 297.1.1-RC [09-09-2024...,665481df-1e28-47d2-84a7-2c5fb9793c77,pt-BR,/taxone/EFD - Reinf/REINF > Envio de Eventos -...,https://www.onesourcetax.com/amer1/oms-taxone/...,562100000


In [9]:
# Scale the raw integer durations down by a factor of 1,000,000.
# The division turns the column into floats, so the integer-dtype guard makes
# this cell safe to re-run: an already-scaled column is not divided again.
if pd.api.types.is_integer_dtype(df['ResourceDuration']):
    df['ResourceDuration'] = df['ResourceDuration'] / 1_000_000
df.head(3)

Unnamed: 0,datetime,UserId,DBSchema,Tenant,Module,MenuPath,Version,StorageID,NavigatorLanguage,ViewName,ResourceUrl,ResourceDuration
0,2024-09-13 15:00:00.005000-03:00,raimunda.dias@grupobimbo.com,bimbo,AC0,Data Warehouse,Manutenção > Documento Fiscal > Doc. Fiscal de...,210816-0000 :: DW/T1 => 297.1.1-RC [09-09-2024...,d9f47d58-36d9-4fbc-ab89-840dfb6ef726,pt-BR,/taxone/Data Warehouse/Manutenção > Documento ...,https://www.onesourcetax.com/amer1/oms-taxone/...,379.0
1,2024-09-13 15:00:00.020000-03:00,sf181329@prd.com,camargocorrea,B58,EFD - Reinf,REINF > Geração Prévia > Movimentos,210816-0000 :: DW/T1 => 297.1.1-RC [09-09-2024...,ed91053c-16bb-47c2-90eb-381142b88fdd,pt-BR,/taxone/EFD - Reinf/REINF > Geração Prévia > M...,https://www.onesourcetax.com/amer1/oms-taxone/...,488.5
2,2024-09-13 15:00:00.061000-03:00,netoarna,legrand,BGH,EFD - Reinf,REINF > Envio de Eventos - Grandes Volumes > F...,210816-0000 :: DW/T1 => 297.1.1-RC [09-09-2024...,665481df-1e28-47d2-84a7-2c5fb9793c77,pt-BR,/taxone/EFD - Reinf/REINF > Envio de Eventos -...,https://www.onesourcetax.com/amer1/oms-taxone/...,562.1


In [10]:
import time


def create_custom_fields(row):
    """Build the list of custom-field entries for a single record.

    Args:
        row: Object indexable by column name (for example a DataFrame row)
            that provides every column listed in ``ordered_columns``.

    Returns:
        A list with one dict per column, in a fixed order, each holding the
        keys "FieldName", "FieldType" and "FieldValue". A non-null "datetime"
        value is rendered as "dd/mm/YYYY HH:MM:SS"; all other values are
        passed through unchanged.
    """
    # Columns whose declared type differs from the default "string".
    typed_columns = {"datetime": "datetime", "ResourceDuration": "numeric"}
    ordered_columns = (
        "datetime",
        "UserId",
        "DBSchema",
        "Tenant",
        "Module",
        "MenuPath",
        "Version",
        "StorageID",
        "NavigatorLanguage",
        "ViewName",
        "ResourceUrl",
        "ResourceDuration",
    )

    def _entry(name):
        kind = typed_columns.get(name, "string")
        value = row[name]
        # Null timestamps are left untouched; only real ones are formatted.
        if kind == "datetime" and pd.notnull(value):
            value = value.strftime("%d/%m/%Y %H:%M:%S")
        return {"FieldName": name, "FieldType": kind, "FieldValue": value}

    return [_entry(name) for name in ordered_columns]


def generate_json_files_in_chunks(origin, df, chunk_size):
    """Export ``df`` as JSON files holding at most ``chunk_size`` records each.

    Every file contains ``{"Origin": origin, "Data": [...]}`` where each entry
    of "Data" has an "Id" built from ``origin`` and the row's index label, the
    formatted "Date", a fixed "quantity" of 1 and the row's "CustomFields".
    Files are written under ``../output`` and a confirmation line is printed
    for each one.

    Args:
        origin: Origin tag used in the payload, the record Ids and file names.
        df: DataFrame with a "datetime" column of timestamps and the columns
            read by ``create_custom_fields``.
        chunk_size: Maximum number of records per file; must be positive.

    Returns:
        None.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Ceiling division: number of files needed to hold every row.
    num_chunks = (len(df) + chunk_size - 1) // chunk_size
    # File names already produced by this call, used to detect collisions.
    written_filenames = set()

    for i in range(num_chunks):
        chunk_df = df.iloc[i * chunk_size : (i + 1) * chunk_size]

        data = {
            "Origin": origin,
            "Data": [
                {
                    "Id": f"{origin}-{index:010}",
                    "Date": row["datetime"].strftime("%d/%m/%Y %H:%M:%S"),
                    "quantity": 1,
                    "CustomFields": create_custom_fields(row),
                }
                for index, row in chunk_df.iterrows()
            ],
        }

        timestamp_file = time.strftime("%Y%m%d%H%M%S")
        base_name = f"../output/taxone_{origin}_comportamental_{timestamp_file}"
        chunk_filename = f"{base_name}.json"
        # The timestamp has one-second resolution, so two chunks finished in
        # the same second would get the same name and the later one would
        # overwrite the earlier one. Add a counter suffix only in that case.
        duplicate = 1
        while chunk_filename in written_filenames:
            duplicate += 1
            chunk_filename = f"{base_name}_{duplicate}.json"
        written_filenames.add(chunk_filename)

        with open(chunk_filename, "w", encoding="utf-8") as json_file:
            json.dump(data, json_file, separators=(",", ":"), ensure_ascii=False)
        print(f"Arquivo {chunk_filename} gerado com sucesso.")


# Export the whole frame, splitting the JSON into files of at most 100000 records.
generate_json_files_in_chunks(origin_json, df, chunk_size=100000)

Arquivo ../output/taxone_202409PROD01REQSRESOURCES_comportamental_20240918160932.json gerado com sucesso.
Arquivo ../output/taxone_202409PROD01REQSRESOURCES_comportamental_20240918160939.json gerado com sucesso.
