# baixando bibliotecas e setando pandas

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import dask.dataframe as dd
import numpy as np
from datetime import datetime as dt

In [2]:
pd.set_option('display.max_columns', None)

# Janela deslizante

In [45]:
def create_data_voo(row):    
    year = int(row['dateYear-1'])
    month = int(row['dateMonth-1'])
    day = int(row['dateDay-1'])
    # Verificar se a data é válida
    if 1 <= month <= 12 and 1 <= day <= 31:
        return f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)}"
    else:
        return pd.NaT  # Retorna "Not a Timestamp" para datas inválidas

In [46]:
def calculate_mean(group):
    return group.mean()

In [47]:
file_path = "../../../../Exploração/Datasets/06120018"

sensor_columns = [
    'amscHprsovDrivF-1a', 'amscHprsovDrivF-1b', 'amscHprsovDrivF-2b', 
    'amscPrsovDrivF-1a', 'amscPrsovDrivF-1b', 'amscPrsovDrivF-2b', 
    'basBleedLowPressF-1a', 'basBleedLowPressF-2b', 'basBleedLowTempF-1a', 
    'basBleedLowTempF-2b', 'basBleedOverPressF-1a', 'basBleedOverPressF-2b', 
    'basBleedOverTempF-1a', 'basBleedOverTempF-2b', 'bleedFavTmCmd-1a', 
    'bleedFavTmCmd-1b', 'bleedFavTmCmd-2a', 'bleedFavTmCmd-2b', 
    'bleedFavTmFbk-1a', 'bleedFavTmFbk-1b', 'bleedFavTmFbk-2b', 
    'bleedHprsovCmdStatus-1a', 'bleedHprsovCmdStatus-1b', 'bleedHprsovCmdStatus-2a', 
    'bleedHprsovCmdStatus-2b', 'bleedHprsovOpPosStatus-1a', 'bleedHprsovOpPosStatus-1b', 
    'bleedHprsovOpPosStatus-2a', 'bleedHprsovOpPosStatus-2b', 'bleedMonPress-1a', 
    'bleedMonPress-1b', 'bleedMonPress-2a', 'bleedMonPress-2b', 'bleedOnStatus-1a', 
    'bleedOnStatus-1b', 'bleedOnStatus-2b', 'bleedOverpressCas-2a', 
    'bleedOverpressCas-2b', 'bleedPrecoolDiffPress-1a', 'bleedPrecoolDiffPress-1b', 
    'bleedPrecoolDiffPress-2a', 'bleedPrecoolDiffPress-2b', 'bleedPrsovClPosStatus-1a', 
    'bleedPrsovClPosStatus-2a', 'bleedPrsovFbk-1a'
]

new_sensor_columns = [
    'amscHprsovDrivF-2b','amscPrsovDrivF-1a', 'amscPrsovDrivF-1b', 'bleedFavTmCmd-1a', 
    'bleedFavTmCmd-1b', 'bleedFavTmCmd-2a', 'bleedFavTmCmd-2b', 
    'bleedFavTmFbk-1a', 'bleedFavTmFbk-1b', 'bleedFavTmFbk-2b', 
    'bleedHprsovCmdStatus-1a', 'bleedHprsovCmdStatus-1b', 'bleedHprsovCmdStatus-2a', 
    'bleedHprsovCmdStatus-2b', 'bleedHprsovOpPosStatus-1a', 'bleedHprsovOpPosStatus-1b', 
    'bleedHprsovOpPosStatus-2a', 'bleedHprsovOpPosStatus-2b', 'bleedMonPress-1', 'bleedMonPress-2', 'bleedOnStatus-1a', 
    'bleedOnStatus-1b', 'bleedOnStatus-2b', 'bleedPrecoolDiffPress-1a', 'bleedPrecoolDiffPress-1b', 
    'bleedPrecoolDiffPress-2', 'bleedPrsovClPosStatus-1a', 
    'bleedPrsovClPosStatus-2a', 'bleedPrsovFbk-1a'
]


def tratamento_e_feature_selection(data):   
    # pega o arquivo 
    df = pd.read_parquet(os.path.join(file_path, data))

    # da fill forward e fill back
    df=df.fillna(method='ffill')
    df=df.fillna(method='bfill')
    
    # exclui as colunas nao numéricas e separa elas para adicionar depois
    columns_to_add_later = ['recording_time','aircraftSerNum-1','timeHours-1','timeMinutes-1','timeSeconds-1','dateDay-1','dateMonth-1','dateYear-1','message0418DAA-1','message0422DAA-1']
    df_mean = df.drop(columns=columns_to_add_later)
    df_columns_to_add_later = df[columns_to_add_later] 

    # Pegando uma linha a cada 100, o que equivale a 300x50 = 15000ms = 15s
    df_columns_to_add_later = df_columns_to_add_later.loc[::300]

    # calcula a media de cada valor a cada 100 linhas
    mean_of_100_rows = df_mean.groupby(df_mean.index//300).apply(calculate_mean)

    # setando o index do df_columns_to_add_later igual ao mean_of_100_rows para não haver problemas na hora de fazer o join, pois o join é com base no index de cada df
    df_columns_to_add_later.index = mean_of_100_rows.index

    # join
    df_join = mean_of_100_rows.join(df_columns_to_add_later, lsuffix='_caller', rsuffix='_other')

    # Unificação de colunas gêmeas
    df_join['bleedMonPress-1'] = (df_join['bleedMonPress-1a']+df_join['bleedMonPress-1b'])/2
    df_join['bleedMonPress-2'] = (df_join['bleedMonPress-2a']+df_join['bleedMonPress-2b'])/2
    df_join['bleedPrecoolDiffPress-2'] = (df_join['bleedPrecoolDiffPress-2a']+df_join['bleedPrecoolDiffPress-2b'])/2

    # remove colunas desnecessárias
    feature_selection = set(df.columns)-set(sensor_columns)
    feature_selection = list(feature_selection)
    for feature in new_sensor_columns:
        feature_selection.append(feature)
    df_join = df_join[feature_selection]
    
    return df_join

# definitivo


In [49]:
file_path = "../../../../Exploração/Datasets/06120018"
parquet_files = [file for file in os.listdir(file_path) if file.endswith('.parquet')]

total_rows = 0
list_of_dfs = []

list_of_last_30_parquets = []
list_of_last_30_parquets_without_bleed_error = []

list_of_last_75_to_90_parquets = []
list_of_last_75_to_90_parquets_without_bleed_error = []

list_of_last_120_to_135_parquets = []
list_of_last_120_to_135_parquets_without_bleed_error = []

list_of_last_180_to_195_parquets = []
list_of_last_180_to_195_parquets_without_bleed_error = []

ocorreu_erro = False

# esta é uma lista de bleed que aconteceram após um longo intervalo de tempo, será composta pelo erro de bleed e seu index
list_of_files_with_bleed = []


for file_name in parquet_files:

    parquet_index = parquet_files.index(file_name)

    # transforma parquet em dataframe
    full_path = os.path.join(file_path, file_name)
    df = pd.read_parquet(full_path)
    print(parquet_files.index(file_name))

    # da fill forward e fill back
    df=df.fillna(method='ffill')
    df=df.fillna(method='bfill')

    # verifica se tem falha de bleed
    if (df['message0418DAA-1'].unique().any() != 0  or df['message0422DAA-1'].unique().any() != 0):
        
        print('parquet com erro:', parquet_index)

        # pega entre 15 a 30 parquets antes
        last_15_to_30_files = parquet_files[int(parquet_index-16):int(parquet_index-1)]
        print('ultimos 15 à 30 parquets: ', int(parquet_index-31),':', int(parquet_index-15))

        # pega os 30 primeros parquets        
        last_30_files = parquet_files[int(parquet_index-31):int(parquet_index-1)]
        print('ultimos 30 parquets: ', int(parquet_index-31),':', int(parquet_index-1))

        # pega de 75 as 90 ultimos parquets
        last_75_to_90_files = parquet_files[int(parquet_index-91):int(parquet_index-76)]
        last_30_and_75_to_90_files = last_30_files + last_75_to_90_files
        print('ultimos 75 à 90 parquets: ', int(parquet_index-91),':', int(parquet_index-76))

        # pega de 120 as 135 ultimos parquets
        last_120_to_135_files = parquet_files[int(parquet_index-136):int(parquet_index-121)]
        last_30_75_to_90_and_120_to_135_files = last_30_and_75_to_90_files + last_120_to_135_files
        print('ultimos 120 à 135 parquets: ', int(parquet_index-136),':', int(parquet_index-121))

        # pega de 180 as 195 ultimos parquets
        last_180_to_195_files = parquet_files[int(parquet_index-196):int(parquet_index-181)]
        all_last = last_30_75_to_90_and_120_to_135_files + last_180_to_195_files
        print('ultimos 180 à 195 parquets: ', int(parquet_index-196),':', int(parquet_index-181))

        # pega os últimos 30 parquets para ver se houve erro
        for file in all_last:

            # transforma parquet em dataframe
            full_path_last_files = os.path.join(file_path, file)
            df = pd.read_parquet(full_path_last_files)

            # da fill forward e fill back
            df=df.fillna(method='ffill')
            df=df.fillna(method='bfill')

            print(df['message0418DAA-1'].unique(), df['message0422DAA-1'].unique())

            # se houve erro de bleed nos últimos 15 parquets, ele quebra o for 
            if(df['message0418DAA-1'].unique().any() != 0  or df['message0422DAA-1'].unique().any() != 0):
                last_15_to_30_files = []
                last_75_to_90_files = []
                last_120_to_135_files = []
                last_180_to_195_files = []
                print('ERRO: ', parquet_files.index(file),'\n')
                ocorreu_erro = True
                break

        # passa os parquets sem erro para serem tratados futuramente, também liberando o outro array
        for parquet in last_15_to_30_files:
            list_of_last_30_parquets_without_bleed_error.append(parquet)
        last_15_to_30_files = []
        print('numero de parquets sem erros anteriores da lista de 15 à 30:',len(list_of_last_30_parquets_without_bleed_error),'\n')

        # passa os parquets sem erro para serem tratados futuramente, também liberando o outro array
        for parquet in last_75_to_90_files:
            list_of_last_75_to_90_parquets_without_bleed_error.append(parquet)
        last_75_to_90_files = []
        print('numero de parquets sem erros anteriores da lista de 75 à 90: ', len(list_of_last_75_to_90_parquets_without_bleed_error),'\n')

        for parquet in last_120_to_135_files:
            list_of_last_120_to_135_parquets_without_bleed_error.append(parquet)
        last_120_to_135_files = []
        print('numero de parquets sem erros anteriores da lista de 75 à 90: ', len(list_of_last_120_to_135_parquets_without_bleed_error),'\n')

        for parquet in last_180_to_195_files:
            list_of_last_180_to_195_parquets_without_bleed_error.append(parquet)
        last_180_to_195_files = []
        print('numero de parquets sem erros anteriores da lista de 75 à 90: ', len(list_of_last_180_to_195_parquets_without_bleed_error),'\n')

        if not ocorreu_erro:
            list_of_files_with_bleed.append(file_name)
        print('numero de parquets de bleed de erro após longos intervalos: ', len(list_of_files_with_bleed),'\n')
        ocorreu_erro = False



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
parquet com erro: 134
ultimos 15 à 30 parquets:  103 : 119
ultimos 30 parquets:  103 : 133
ultimos 75 à 90 parquets:  43 : 58
ultimos 120 à 135 parquets:  -2 : 13
ultimos 180 à 195 parquets:  -62 : -47
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.]
[0.] [0.

de 15 à 30 parquets 

In [50]:
# trata os dfs que não tiverem bleed e os junta no final
list_of_dfs = []
for df in list_of_last_30_parquets_without_bleed_error:
    df_join = tratamento_e_feature_selection(df)

    # adicionando a um array de df para futuramente concatenar
    total_rows += len(df_join)
    list_of_dfs.append(df_join)
    print(f"Arquivo {file_name} tem {len(df_join)} linhas.") 
            
# Concatenar todos os Dask DataFrames em um único Dask DataFrame
from_15_to_30 = pd.concat(list_of_dfs)
from_15_to_30['bleed_fail'] = 0

Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 669 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 631 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 284 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 572 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 47 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 896 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 994 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 262 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 290 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 45 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 249 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 328 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 393 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 435 linhas.
Arquivo TCRF_ARCHIVE_06120018_202307

de 75 à 90 parquets

In [51]:
# trata os dfs que não tiverem bleed e os junta no final
list_of_dfs = []
for df in list_of_last_75_to_90_parquets_without_bleed_error:
    df_join = tratamento_e_feature_selection(df)

    # adicionando a um array de df para futuramente concatenar
    total_rows += len(df_join)
    list_of_dfs.append(df_join)

    print(f"Arquivo {file_name} tem {len(df_join)} linhas.") 
            
# Concatenar todos os Dask DataFrames em um único Dask DataFrame
from_75_to_90 = pd.concat(list_of_dfs)
from_75_to_90['bleed_fail'] = 0

Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 309 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 312 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 322 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 380 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 374 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 409 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 342 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 344 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 398 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 327 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 304 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 396 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 404 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 518 linhas.
Arquivo TCRF_ARCHIVE_06120018_2023

In [52]:
# trata os dfs que não tiverem bleed e os junta no final
list_of_dfs = []
for df in list_of_last_120_to_135_parquets_without_bleed_error:
    df_join = tratamento_e_feature_selection(df)

    # adicionando a um array de df para futuramente concatenar
    total_rows += len(df_join)
    list_of_dfs.append(df_join)

    print(f"Arquivo {file_name} tem {len(df_join)} linhas.") 
            
# Concatenar todos os Dask DataFrames em um único Dask DataFrame
from_120_to_135 = pd.concat(list_of_dfs)
from_120_to_135['bleed_fail'] = 0

Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 558 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 397 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 436 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 514 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 513 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 227 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 236 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 365 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 362 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 858 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 177 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 839 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 294 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 288 linhas.
Arquivo TCRF_ARCHIVE_06120018_2023

In [53]:
# trata os dfs que não tiverem bleed e os junta no final
list_of_dfs = []
for df in list_of_last_180_to_195_parquets_without_bleed_error:
    df_join = tratamento_e_feature_selection(df)

    # adicionando a um array de df para futuramente concatenar
    total_rows += len(df_join)
    list_of_dfs.append(df_join)

    print(f"Arquivo {file_name} tem {len(df_join)} linhas.") 
            
# Concatenar todos os Dask DataFrames em um único Dask DataFrame
from_180_to_195 = pd.concat(list_of_dfs)
from_180_to_195['bleed_fail'] = 0

Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 562 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 772 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 785 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 709 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 386 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 474 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 344 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 306 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 315 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 456 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 418 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 264 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 45 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 322 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230

In [54]:
# trata os dfs que não tiverem bleed e os junta no final
list_of_dfs = []
for df in list_of_files_with_bleed:
    df_join = tratamento_e_feature_selection(df)

    # adicionando a um array de df para futuramente concatenar
    total_rows += len(df_join)
    list_of_dfs.append(df_join)

    print(f"Arquivo {file_name} tem {len(df_join)} linhas.") 
            
# Concatenar todos os Dask DataFrames em um único Dask DataFrame
errors = pd.concat(list_of_dfs)
errors['bleed_fail'] = 1

Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 283 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 689 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 312 linhas.
Arquivo TCRF_ARCHIVE_06120018_20230719203539.parquet tem 257 linhas.


In [55]:
errors.shape

(1541, 78)

In [56]:
from_15_to_30.shape

(24352, 78)

In [57]:
from_75_to_90.shape

(22573, 78)

In [58]:
from_120_to_135.shape

(16860, 78)

In [59]:
from_180_to_195.shape

(20550, 78)

concatenação

In [60]:
# concatena todos os fds
df_final = pd.concat([from_15_to_30, from_75_to_90, from_120_to_135, from_180_to_195,errors])

In [61]:
# data do voo
df_final['data_voo'] = df_final.apply(create_data_voo, axis=1)
df_final['data_voo'] = pd.to_datetime(df_final['data_voo'])

# Ordena o DataFrame por data
df_final = df_final.sort_values(by='data_voo')

In [62]:
df_final.index

Index([  0, 266, 265, 264, 263, 262, 261, 260, 267, 259,
       ...
         3,   4,   5,   6,   7,   8,   9,  10,  43, 288],
      dtype='int64', length=85876)

In [63]:
df_final = df_final.reset_index(drop=True)
df_final

Unnamed: 0,correctedCoreSpeed-1a,amscChBasHealthStatus-1a,bleedSingleOperation-1a,bleedPrsovTmCmd-2a,sfyBasFaultWord1Bit13-2b,bleedOutTemp-1a,timeSeconds-1,bleedOutTemp-1b,bleedPrsovOpPosStatus-2a,bleedPrsovTmCmd-1a,bleedOverpressCas-1a,timeMinutes-1,aircraftSerNum-1,phaseOfFlight-1,recording_time,correctedN1Speed-3a,messageInhibitPhases-1,message0422DAA-1,bleedOutTemp-2b,dateMonth-1,message0418DAA-1,bleedOutTempTarget-1a,amscChBasHealthStatus-2b,bleedSwPress-2a,bleedAcsBleedConfigStatus-1b,sfyBasFaultWord1Bit13-1a,bleedPrsovOpPosStatus-1a,bleedSwPress-1a,bleedSingleOperation-2b,amscChBasHealthStatus-1b,correctedCoreSpeed-3a,bleedPrsovOpPosStatus-1b,bleedOutTemp-2a,bleedOutTempTarget-2b,bleedSwPress-2b,bleedSwPress-1b,bleedPrsovFbk-1b,bleedAcsBleedConfigStatus-2b,timeHours-1,bleedPrsovOpPosStatus-2b,bleedPrsovFbk-2b,bleedPrsovTmCmd-1b,phaseOfFlightNavigation-1,bleedPrsovTmCmd-2b,dateDay-1,bleedOutTempTarget-1b,correctedN1Speed-1a,dateYear-1,amscHprsovDrivF-2b,amscPrsovDrivF-1a,amscPrsovDrivF-1b,bleedFavTmCmd-1a,bleedFavTmCmd-1b,bleedFavTmCmd-2a,bleedFavTmCmd-2b,bleedFavTmFbk-1a,bleedFavTmFbk-1b,bleedFavTmFbk-2b,bleedHprsovCmdStatus-1a,bleedHprsovCmdStatus-1b,bleedHprsovCmdStatus-2a,bleedHprsovCmdStatus-2b,bleedHprsovOpPosStatus-1a,bleedHprsovOpPosStatus-1b,bleedHprsovOpPosStatus-2a,bleedHprsovOpPosStatus-2b,bleedMonPress-1,bleedMonPress-2,bleedOnStatus-1a,bleedOnStatus-1b,bleedOnStatus-2b,bleedPrecoolDiffPress-1a,bleedPrecoolDiffPress-1b,bleedPrecoolDiffPress-2,bleedPrsovClPosStatus-1a,bleedPrsovClPosStatus-2a,bleedPrsovFbk-1a,bleed_fail,data_voo
0,0.000000,0.0,0.0,0.0,0.0,148.928333,13.0,149.155417,0.0,0.0,0.0,35.0,20018.0,0.0,0,0.257292,1.0,0.0,105.107083,5.0,0.0,203.875,0.0,0.370000,0.0,0.0,0.0,1.000000,0.0,0.0,0.000000,0.0,105.107083,203.875,0.370000,1.000000,0.241667,0.0,13.0,0.0,0.175417,0.000000,2.000000,0.000000,29.0,203.875,0.178750,2022.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.206667,1.535000,0.161667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47.768958,47.331250,0.0,0.0,0.0,0.025729,0.031979,0.022083,1.0,1.0,0.161250,0,2022-05-29
1,74.010000,0.0,0.0,0.0,0.0,201.106667,11.0,201.088333,1.0,0.0,0.0,36.0,20018.0,2.0,3990000,29.165990,5.0,0.0,203.660000,5.0,0.0,203.875,0.0,12.235000,1.0,0.0,1.0,12.454167,0.0,0.0,74.111458,1.0,203.660000,203.875,12.235000,12.461667,183.343333,1.0,19.0,1.0,0.181667,181.675000,6.263333,199.695000,29.0,203.875,29.176875,2022.0,0.0,0.0,0.0,0.000000,117.787500,0.0,0.0,0.161667,121.016667,0.160833,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,59.965417,60.391667,1.0,1.0,1.0,0.705625,0.680417,0.597083,0.0,0.0,0.145000,0,2022-05-29
2,73.832552,0.0,0.0,0.0,0.0,200.555833,56.0,200.537500,1.0,0.0,0.0,35.0,20018.0,2.0,3975000,29.265521,5.0,0.0,203.534167,5.0,0.0,203.875,0.0,12.368333,1.0,0.0,1.0,12.537500,0.0,0.0,73.879219,1.0,203.534167,203.875,12.368333,12.553333,185.381667,1.0,19.0,1.0,0.167500,183.441667,6.000000,201.132500,29.0,203.875,29.243854,2022.0,0.0,0.0,0.0,0.000000,117.082500,0.0,0.0,0.153333,121.038333,0.160833,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,59.585000,60.192500,1.0,1.0,1.0,0.697292,0.705417,0.615625,0.0,0.0,0.153333,0,2022-05-29
3,74.065781,0.0,0.0,0.0,0.0,201.250000,41.0,201.273333,1.0,0.0,0.0,35.0,20018.0,2.0,3960000,29.501146,5.0,0.0,203.673333,5.0,0.0,203.875,0.0,12.495000,1.0,0.0,1.0,12.655833,0.0,0.0,74.044271,1.0,203.673333,203.875,12.495000,12.663333,185.118333,1.0,19.0,1.0,0.169167,183.282500,6.000000,200.784167,29.0,203.875,29.511406,2022.0,0.0,0.0,0.0,0.000000,117.393333,0.0,0.0,0.149167,120.845833,0.188333,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,60.004167,60.522500,1.0,1.0,1.0,0.692917,0.691042,0.606458,0.0,0.0,0.143333,0,2022-05-29
4,74.439115,0.0,0.0,0.0,0.0,203.934167,26.0,203.965000,1.0,0.0,0.0,35.0,20018.0,2.0,3945000,29.894427,5.0,0.0,204.601667,5.0,0.0,203.875,0.0,12.705833,1.0,0.0,1.0,12.831667,0.0,0.0,74.368177,1.0,204.601667,203.875,12.705833,12.858333,184.802500,1.0,19.0,1.0,0.174167,183.026667,6.000000,199.620833,29.0,203.875,29.916250,2022.0,0.0,0.0,0.0,0.000000,118.845000,0.0,0.0,0.150833,122.620833,0.160000,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,60.851667,61.286667,1.0,1.0,1.0,0.710208,0.708542,0.611042,0.0,0.0,0.161667,0,2022-05-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85871,65.983177,0.0,0.0,0.0,0.0,206.847083,36.0,207.074167,0.0,0.0,0.0,8.0,20018.0,0.0,120000,0.086719,2.0,0.0,92.642500,7.0,0.0,203.875,0.0,0.256250,0.0,0.0,0.0,10.170417,0.0,0.0,0.334948,0.0,92.642500,203.875,0.256250,10.176667,0.058750,0.0,10.0,0.0,0.496667,0.000000,2.000000,0.000000,12.0,203.875,23.054427,2023.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,1.001667,0.355000,0.929583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.711042,45.707917,0.0,0.0,0.0,0.048229,0.038021,0.069583,1.0,1.0,0.762500,0,2023-07-12
85872,62.503073,0.0,0.0,0.0,0.0,198.496250,51.0,198.686250,0.0,0.0,0.0,8.0,20018.0,0.0,135000,2.553437,2.0,0.0,154.607500,7.0,0.0,203.875,0.0,0.454583,0.0,0.0,0.0,9.207083,0.0,0.0,9.704688,0.0,154.607500,203.875,0.454583,9.218750,0.042917,0.0,10.0,0.0,0.508333,0.000000,2.000000,0.000000,12.0,203.875,20.598958,2023.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,1.094583,0.352917,0.933750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.653542,45.720000,0.0,0.0,0.0,0.021562,0.019167,0.077604,1.0,1.0,0.815417,0,2023-07-12
85873,62.473854,0.0,0.0,0.0,0.0,192.255833,6.0,192.452500,0.0,0.0,0.0,9.0,20018.0,0.0,150000,3.015104,2.0,0.0,183.195417,7.0,0.0,203.875,0.0,0.500000,0.0,0.0,0.0,9.211250,0.0,0.0,11.435677,0.0,183.195417,203.875,0.500000,9.216667,0.065000,0.0,10.0,0.0,0.500000,0.000000,2.000000,0.000000,12.0,203.875,20.722917,2023.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,1.090000,0.357083,0.921667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.186250,47.882917,0.0,0.0,0.0,0.018125,0.022500,0.078125,1.0,1.0,0.811250,0,2023-07-12
85874,17.335208,0.0,0.0,0.0,0.0,153.612917,35.0,153.786250,0.0,0.0,0.0,35.0,20018.0,0.0,645000,0.812500,1.0,0.0,127.147500,7.0,0.0,203.875,0.0,0.330000,0.0,0.0,0.0,1.435000,0.0,0.0,0.000000,0.0,127.147500,203.875,0.330000,1.456667,0.185000,0.0,8.0,0.0,0.650000,0.000000,5.000000,0.000000,12.0,203.875,4.446146,2023.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,1.066667,0.498333,1.035417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.213125,43.677917,0.0,0.0,0.0,0.019583,0.029583,0.072083,1.0,1.0,0.832917,0,2023-07-12


In [64]:
# remove colunas irrelevantes para o modelo
df_final = df_final.drop(columns={'dateDay-1', 'timeHours-1','message0418DAA-1', 'phaseOfFlight-1', 'timeSeconds-1','dateYear-1', 'message0422DAA-1','aircraftSerNum-1','phaseOfFlightNavigation-1','dateMonth-1','recording_time', 'timeMinutes-1' })

adiciona quantos dias faltam para a próxima falha

In [65]:
# Suponhamos que você tenha um DataFrame chamado df_final
df_final['dias_faltando_coluna'] = 0

# Encontre os índices onde o evento é igual a 0
indices_evento_0 = df_final.index[df_final['bleed_fail'] == 0]

# Itere pelos índices do evento 0
for index_evento_0 in indices_evento_0:
    # Encontre o próximo índice onde o evento é igual a 1
    proximo_evento_1 = df_final.loc[(df_final.index > index_evento_0) & (df_final['bleed_fail'] == 1)]
    
    if not proximo_evento_1.empty:
        # Encontre a data do evento 0 e a data do próximo evento 1
        data_evento_0 = df_final.at[index_evento_0, 'data_voo']
        data_proximo_evento_1 = proximo_evento_1.iloc[0]['data_voo']
        
        # Calcule a diferença de dias entre os eventos
        dias_faltando = (data_proximo_evento_1 - data_evento_0).days
        
        # Atribua o valor de dias_faltando à coluna 'dias_faltando_coluna'
        df_final.at[index_evento_0, 'dias_faltando_coluna'] = dias_faltando
        
        print(f'Index do 0: {index_evento_0}, Index do 1: {proximo_evento_1.index[0]}, Dias faltando entre os dois: {dias_faltando}')
    else:
        print(f'Index do 0: {index_evento_0}, Não há eventos 1 subsequentes')


Index do 0: 0, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 1, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 2, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 3, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 4, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 5, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 6, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 7, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 8, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 9, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 10, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 11, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 12, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 13, Index do 1: 11400, Dias faltando entre os dois: 15
Index do 0: 14, Index do 1: 11400, Dias faltando entre os dois: 15
Index

In [66]:
df_final.columns

Index(['correctedCoreSpeed-1a', 'amscChBasHealthStatus-1a',
       'bleedSingleOperation-1a', 'bleedPrsovTmCmd-2a',
       'sfyBasFaultWord1Bit13-2b', 'bleedOutTemp-1a', 'bleedOutTemp-1b',
       'bleedPrsovOpPosStatus-2a', 'bleedPrsovTmCmd-1a',
       'bleedOverpressCas-1a', 'correctedN1Speed-3a', 'messageInhibitPhases-1',
       'bleedOutTemp-2b', 'bleedOutTempTarget-1a', 'amscChBasHealthStatus-2b',
       'bleedSwPress-2a', 'bleedAcsBleedConfigStatus-1b',
       'sfyBasFaultWord1Bit13-1a', 'bleedPrsovOpPosStatus-1a',
       'bleedSwPress-1a', 'bleedSingleOperation-2b',
       'amscChBasHealthStatus-1b', 'correctedCoreSpeed-3a',
       'bleedPrsovOpPosStatus-1b', 'bleedOutTemp-2a', 'bleedOutTempTarget-2b',
       'bleedSwPress-2b', 'bleedSwPress-1b', 'bleedPrsovFbk-1b',
       'bleedAcsBleedConfigStatus-2b', 'bleedPrsovOpPosStatus-2b',
       'bleedPrsovFbk-2b', 'bleedPrsovTmCmd-1b', 'bleedPrsovTmCmd-2b',
       'bleedOutTempTarget-1b', 'correctedN1Speed-1a', 'amscHprsovDrivF-2b',
  

In [67]:
df_to_csv = df_final.drop(columns={'bleed_fail', 'data_voo'})

In [68]:
df_to_csv.to_csv('dataset_processado_06120018_more_intervals.csv')