## Importando todas as bibliotecas e dependências.


In [9]:
import os
import h2o
import pandas as pd
from datetime import datetime
from h2o.automl import H2OAutoML

## Método que processa os dados de voo e os transforma em uma linha (.parquet)


In [10]:
agg_methods = {
    "amscHprsovDrivF-1a": "sum",
    "amscHprsovDrivF-1b":"sum",
    "amscHprsovDrivF-2b":"sum",
    "amscPrsovDrivF-1a":"sum",
    "amscPrsovDrivF-1b":"sum",
    "amscPrsovDrivF-2b":"sum",
    "basBleedLowPressF-1a":"sum",
    "basBleedLowPressF-2b":"sum",
    "basBleedLowTempF-1a":"sum",
    "basBleedLowTempF-2b":"sum",
    "basBleedOverPressF-1a":"sum",
    "basBleedOverPressF-2b":"sum",
    "basBleedOverTempF-1a":"sum",
    "basBleedOverTempF-2b":"sum",
    "bleedFavTmCmd-1a":"sum",
    "bleedFavTmCmd-1b": "max",
    "bleedFavTmCmd-2a":"sum",
    "bleedFavTmCmd-2b":"sum",
    "bleedFavTmFbk-1a":"max",
    "bleedFavTmFbk-1b": "max",
    "bleedFavTmFbk-2b":"max",
    "bleedHprsovCmdStatus-1a":"sum",
    "bleedHprsovCmdStatus-1b":"sum",
    "bleedHprsovCmdStatus-2a":"sum",
    "bleedHprsovCmdStatus-2b":"sum",
    "bleedHprsovOpPosStatus-1a":"sum",
    "bleedHprsovOpPosStatus-1b":"sum",
    "bleedHprsovOpPosStatus-2a":"sum",
    "bleedHprsovOpPosStatus-2b":"sum",
    "bleedMonPress-1a":"max",
    "bleedMonPress-1b":"max",
    "bleedMonPress-2a":"max",
    "bleedMonPress-2b":"max",
    "bleedOnStatus-1a":"sum",
    "bleedOnStatus-1b":"sum",
    "bleedOnStatus-2b":"sum",
    "bleedOverpressCas-2a":"sum",
    "bleedOverpressCas-2b":"sum",
    "bleedPrecoolDiffPress-1a":"max",
    "bleedPrecoolDiffPress-1b":"max",
    "bleedPrecoolDiffPress-2a":"max",
    "bleedPrecoolDiffPress-2b":"max",
    "bleedPrsovClPosStatus-1a":"sum",
    "bleedPrsovClPosStatus-2a":"sum",
    "bleedPrsovFbk-1a":"sum",
    'message0422DAA-1':"max",
    'message0418DAA-1':"max",
    # You can use 'sum', 'max', 'min', 'median' as aggregation methods.
}

## Nome após o processo de ETL

In [12]:
filtered_cols = [
    "amscHprsovDrivF-1a",
    "amscHprsovDrivF-1b",
    "amscHprsovDrivF-2b",
    "amscPrsovDrivF-1a",
    "amscPrsovDrivF-1b",
    "amscPrsovDrivF-2b",
    "basBleedLowPressF-1a",
    "basBleedLowPressF-2b",
    "basBleedLowTempF-1a",
    "basBleedLowTempF-2b",
    "basBleedOverPressF-1a",
    "basBleedOverPressF-2b",
    "basBleedOverTempF-1a",
    "basBleedOverTempF-2b",
    "bleedFavTmCmd-1a",
    "bleedFavTmCmd-1b",
    "bleedFavTmCmd-2a",
    "bleedFavTmCmd-2b",
    "bleedFavTmFbk-1a",
    "bleedFavTmFbk-1b",
    "bleedFavTmFbk-2b",
    "bleedHprsovCmdStatus-1a",
    "bleedHprsovCmdStatus-1b",
    "bleedHprsovCmdStatus-2a",
    "bleedHprsovCmdStatus-2b",
    "bleedHprsovOpPosStatus-1a",
    "bleedHprsovOpPosStatus-1b",
    "bleedHprsovOpPosStatus-2a",
    "bleedHprsovOpPosStatus-2b",
    "bleedMonPress-1a",
    "bleedMonPress-1b",
    "bleedMonPress-2a",
    "bleedMonPress-2b",
    "bleedOnStatus-1a",
    "bleedOnStatus-1b",
    "bleedOnStatus-2b",
    "bleedOverpressCas-2a",
    "bleedOverpressCas-2b",
    "bleedPrecoolDiffPress-1a",
    "bleedPrecoolDiffPress-1b",
    "bleedPrecoolDiffPress-2a",
    "bleedPrecoolDiffPress-2b",
    "bleedPrsovClPosStatus-1a",
    "bleedPrsovClPosStatus-2a",
    "bleedPrsovFbk-1a",
    'message0422DAA-1',
    'message0418DAA-1',
]

## Setup para o tratamento e a respectiva saída

In [13]:
directory = './parquets' # It's the directory that contains the parquet files. [ Requirement ]
output_directory = './output' # It's the directory that contains the output files. [ Requirement ]

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
output_file_name = f'output_{timestamp}.csv'
output_file_path = os.path.join(output_directory, output_file_name)
print(output_file_path)

./output/output_20230825105431.csv


## Tratamento e a respectiva saída

In [14]:
try:
    for subdir, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(subdir, file)
            df = pd.read_parquet(file_path)
            df = df[filtered_cols]
            aggregated_data = df.agg(agg_methods)
            aggregated_data.to_frame().T.to_csv(output_file_path, mode='a', header=False, index=False)
except:
    print('New parquet not found')

New parquet not found


## Adicionando o nome das colunas

In [15]:
df = pd.read_csv(output_file_path)
df.columns = filtered_cols
df.to_csv(output_file_path, index=False)

## Aplicação do AutoML

In [21]:
h2o.init()
data = h2o.import_file(output_file_path)
x = data.columns[:-2]  # Todas as colunas, exceto a última (coluna alvo)
y = data.columns[-2]   # A última coluna (coluna alvo)
data[y] = data[y].asfactor()
train, test = data.split_frame(ratios=[0.8], seed=1)
aml = H2OAutoML(seed=1, project_name="bleed_system_prediction")
aml.train(x=x, y=y, training_frame=train)
lb = aml.leaderboard
best_model = aml.leader
predictions = best_model.predict(test)
h2o.shutdown()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
11:06:25.998: _train param, Dropping bad and constant columns: [basBleedOverPressF-1a, basBleedOverPressF-2b, bleedOverpressCas-2b, bleedOverpressCas-2a, amscHprsovDrivF-1a, amscHprsovDrivF-2b, amscPrsovDrivF-1b, amscHprsovDrivF-1b, amscPrsovDrivF-1a, amscPrsovDrivF-2b, basBleedOverTempF-1a, basBleedLowPressF-1a, basBleedLowPressF-2b, basBleedLowTempF-1a, basBleedLowTempF-2b]

█
11:06:29.767: _train param, Dropping bad and constant columns: [basBleedOverPressF-1a, basBleedOverPressF-2b, bleedOverpressCas-2b, bleedOverpressCas-2a, amscHprsovDrivF-1a, amscHprsovDrivF-2b, amscPrsovDrivF-1b, amscHprsovDrivF-1b, amscPrsovDrivF-1a, amscPrsovDrivF-2b, basBleedOverTempF-1a, basBleedLowPressF-1a, basBleedLowPressF-2b, basBleedLowTempF-1a, basBleedLowTempF-2b]

█
11:06:30.680: _train param, Dropping bad and constant columns: [basBleedOverPressF-1a, basBleedOverPressF-2b, bleedOverpre