This notebook reads and merges .csv files for each folder in 'original_dataset_path'

Each merged .csv is saved in a folder in 'processed_data_path'

In [4]:
import csv
import glob
import os

import pandas as pd

In [5]:
def _infer_separator(csv_path):
    """
    Return the field delimiter sniffed from the first non-blank line of 'csv_path'.

    Raises pd.errors.EmptyDataError when the file holds no data line and
    csv.Error when the sniffer cannot settle on a delimiter.
    """
    # The file is opened in a 'with' block so the handle is always released.
    with open(csv_path, newline='', encoding='utf-8') as csv_file:
        for line in csv_file:
            if line.strip():
                return csv.Sniffer().sniff(line).delimiter

    raise pd.errors.EmptyDataError("No data to infer a separator from: %s" % csv_path)


def csvs_merge(path_in, path_out, types_infos):
    """
    Merge all .csv files of each file type found in 'path_in'.

    For every key 'file_type' of 'types_infos', the files matching
    "'path_in'/'file_type'*.csv" are read in sorted file-name order, stacked
    row-wise and saved as "'path_out'/'file_type'.csv".

    Parameters
    ----------
    path_in : str
        Folder holding the original .csv files.
    path_out : str
        Folder receiving the merged files. It is created when missing.
    types_infos : dict
        Maps a file-name prefix to a dict with the keys 'usecols' (positions
        of the columns to read) and 'names' (labels given to those columns).

    Notes
    -----
    Input files are read as header-less. The separator is detected on the
    first file of each type and reused for the other files of that type.
    File types without any matching file are skipped.
    """

    for file_type, type_infos in types_infos.items():

        # Getting all .csv files of type 'file_type'
        csv_files = sorted(glob.glob(os.path.join(path_in, '%s*.csv' % file_type)))

        # Some folders don't have all types
        if not csv_files:
            continue

        # Determining separator type with the first .csv file
        # (public csv.Sniffer instead of pandas' private reader attributes)
        inferred_sep = _infer_separator(csv_files[0])

        # Merging .csv files of type 'file_type'
        merged_csv = pd.concat(
            [pd.read_csv(csv_file,
                         sep=inferred_sep,
                         usecols=type_infos['usecols'],
                         names=type_infos['names'],
                         header=None, engine='c') for csv_file in csv_files],
            ignore_index=True)

        # to_csv does not create missing folders, so make sure the target exists
        os.makedirs(path_out, exist_ok=True)

        merged_csv.to_csv(os.path.join(path_out, '%s.csv' % file_type),
                          index=False, encoding='utf-8-sig')

---

This notebook expects the following directory structure, but you can change it with the
'original_dataset_path' and 'processed_data_path' variables below

#### Create the folders* indicated below
#### After downloading FEMTOBearingDataSet.zip from [NASA repository](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/) you can just extract 'Training_set.zip' and 'Validation_set.zip' to get all data

Defining paths, file type info and merging .csvs

In [6]:
# Input folder with the original files and output folder for the merged ones
original_dataset_path = 'data/original_data/femto_dataset'
processed_data_path = 'data/processed_data/femto_dataset'

# Read settings per file type:
#   key       -> file type identifier
#   'usecols' -> positions of the columns to read
#   'names'   -> names given to the columns that are read
# Column 3 (the μs column) is left out on purpose.
types_infos = {
    'acc': {
        'usecols': [0, 1, 2, 4, 5],
        'names': ['hour', 'min', 'seg', 'h', 'v'],
    },
    'temp': {
        'usecols': [0, 1, 2, 4],
        'names': ['hour', 'min', 'seg', 'temp'],
    },
}