# Agregador

In [10]:
import pandas as pd
import os
import dask.dataframe as dd

In [11]:
def filter_columns(df):
    """Return a frame holding only the columns kept for aggregation.

    The result contains exactly the columns named below, in the order
    they are listed; every other column of ``df`` is dropped. pandas
    raises ``KeyError`` if any listed column is absent from ``df``.
    """
    wanted = (
        'amscHprsovDrivF-1a',
        'amscHprsovDrivF-1b',
        'amscHprsovDrivF-2b',
        'amscPrsovDrivF-1a',
        'amscPrsovDrivF-1b',
        'amscPrsovDrivF-2b',
        'basBleedLowPressF-1a',
        'basBleedLowPressF-2b',
        'basBleedLowTempF-1a',
        'basBleedLowTempF-2b',
        'basBleedOverPressF-1a',
        'basBleedOverPressF-2b',
        'basBleedOverTempF-1a',
        'basBleedOverTempF-2b',
        'bleedFavTmCmd-1a',
        'bleedFavTmCmd-1b',
        'bleedFavTmCmd-2a',
        'bleedFavTmCmd-2b',
        'bleedFavTmFbk-1a',
        'bleedFavTmFbk-1b',
        'bleedFavTmFbk-2b',
        'bleedHprsovCmdStatus-1a',
        'bleedHprsovCmdStatus-1b',
        'bleedHprsovCmdStatus-2a',
        'bleedHprsovCmdStatus-2b',
        'bleedHprsovOpPosStatus-1a',
        'bleedHprsovOpPosStatus-1b',
        'bleedHprsovOpPosStatus-2a',
        'bleedHprsovOpPosStatus-2b',
        'bleedMonPress-1a',
        'bleedMonPress-1b',
        'bleedMonPress-2a',
        'bleedMonPress-2b',
        'bleedOnStatus-1a',
        'bleedOnStatus-1b',
        'bleedOnStatus-2b',
        'bleedOverpressCas-2a',
        'bleedOverpressCas-2b',
        'bleedPrecoolDiffPress-1a',
        'bleedPrecoolDiffPress-1b',
        'bleedPrecoolDiffPress-2a',
        'bleedPrecoolDiffPress-2b',
        'bleedPrsovClPosStatus-1a',
        'bleedPrsovClPosStatus-2a',
        'bleedPrsovFbk-1a',
        'recording_time',
        'message0418DAA-1',
        'message0422DAA-1',
    )
    # Indexing with a list of labels selects those columns in list order.
    return df[list(wanted)]

In [12]:
def average_every_n_rows(df, n=72000):
    """Collapse every ``n`` consecutive rows of ``df`` into one summary row.

    For each block of ``n`` rows (the last block may be shorter):

    * every column except ``recording_time`` and the two message columns
      is replaced by its mean over the block (NaN values are skipped);
    * ``message0418DAA-1`` and ``message0422DAA-1`` become 1 if the block
      contains any value that is neither null nor 0, otherwise 0;
    * ``recording_time`` is taken from the first row of the block.

    Args:
        df: Input frame; must contain ``recording_time``,
            ``message0418DAA-1`` and ``message0422DAA-1``.
        n: Number of rows per block.

    Returns:
        A new DataFrame with one row per block. Columns are the averaged
        columns, then the two message flags, then ``recording_time``.
        An empty input produces an empty DataFrame.
    """
    flag_columns = ['message0418DAA-1', 'message0422DAA-1']
    time_column = 'recording_time'
    excluded = {time_column, *flag_columns}
    columns_to_average = [col for col in df.columns if col not in excluded]

    new_rows = []
    for start in range(0, len(df), n):
        chunk = df.iloc[start:start + n]
        if chunk.empty:
            continue

        average_row = chunk[columns_to_average].mean(skipna=True)

        for col in flag_columns:
            # The flag must describe this chunk only. Testing the whole
            # frame here would give every output row the same value.
            values = chunk[col]
            # NaN != 0 evaluates to True, so nulls are masked out explicitly.
            is_set = (values.notnull() & (values != 0)).any()
            average_row[col] = 1 if is_set else 0

        average_row[time_column] = chunk[time_column].iloc[0]
        new_rows.append(average_row)

    return pd.DataFrame(new_rows)

In [13]:
def preprocess(file_path):
    """Load one parquet file and return its aggregated rows.

    The file is read with pandas, reduced to the selected columns by
    ``filter_columns``, and then condensed by ``average_every_n_rows``
    using that function's default block size.
    """
    raw = pd.read_parquet(file_path)
    return average_every_n_rows(filter_columns(raw))

### Time series

In [14]:
# Build the time-series dataset: preprocess a slice of the parquet files in
# ``folder_path`` and combine the results into one dask dataframe.
folder_path = '06120033'

# os.listdir returns entries in arbitrary order, so the listing is sorted
# before slicing; otherwise the slice could select different files from one
# run (or machine) to the next.
selected_files = [
    filename
    for filename in sorted(os.listdir(folder_path))[30:1000]
    if filename.endswith(".parquet")
]

# Collect every per-file result first and concatenate once, instead of
# wrapping the running result in a new concat for each file.
partitions = [
    dd.from_pandas(preprocess(os.path.join(folder_path, filename)), npartitions=1)
    for filename in selected_files
]

# Stays None when the slice contains no parquet file.
final_dask_df = dd.concat(partitions) if partitions else None

In [15]:
# Write the aggregated dataframe to one CSV file (single_file=True) in the
# current working directory, without the index column.
final_dask_df.to_csv('aggregated_time_series.csv', single_file=True, index=False)

['/home/elisa/Documents/machinelearning/aggregated_time_series.csv']

### Classifier

In [19]:
# Build the classifier dataset: preprocess a slice of the parquet files in
# ``folder_path`` plus a fixed list of extra files, and combine everything
# into one dask dataframe.
folder_path = '06120033'

# os.listdir returns entries in arbitrary order, so the listing is sorted
# once and reused; otherwise the slice below could select different files
# from one run (or machine) to the next.
available_files = sorted(os.listdir(folder_path))

# Files that are always included, wherever they fall in the listing.
names = ['TCRF_ARCHIVE_06120033_20220530132603.parquet',
         'TCRF_ARCHIVE_06120033_20220601151236.parquet',
         'TCRF_ARCHIVE_06120033_20220603051739.parquet',
         'TCRF_ARCHIVE_06120033_20220603142239.parquet',
         'TCRF_ARCHIVE_06120033_20221106150243.parquet',
         'TCRF_ARCHIVE_06120033_20230103035245.parquet',
         'TCRF_ARCHIVE_06120033_20230714050245.parquet']

# Slice files first, then the explicitly named files, as in the original flow.
# NOTE(review): a file listed in ``names`` that also falls inside the slice
# is processed twice - confirm whether that duplication is intended.
selected_files = [
    filename
    for filename in available_files[1220:1950]
    if filename.endswith(".parquet")
]
selected_files += [filename for filename in available_files if filename in names]

# Collect every per-file result first and concatenate once. This also avoids
# passing None to dd.concat when the slice yields no parquet file.
partitions = [
    dd.from_pandas(preprocess(os.path.join(folder_path, filename)), npartitions=1)
    for filename in selected_files
]

# Stays None when no file was selected at all.
final_dask_df = dd.concat(partitions) if partitions else None

In [20]:
# Write the aggregated dataframe to one CSV file (single_file=True) in the
# current working directory, without the index column.
final_dask_df.to_csv('aggregated_classifier.csv', single_file=True, index=False)

['/home/elisa/Documents/machinelearning/aggregated_classifier.csv']