# Importação de Bibliotecas

In [78]:
from copy import deepcopy
import os
import pickle
import sys

from tqdm import tqdm
import numpy as np
import scipy.io as io
import pandas as pd
import h5py
import pyarrow

# Diretório

In [67]:
# Diretório do Pc do Jordan
path = 'H:\\.shortcut-targets-by-id\\1oKn7IN7zznQuhwjDCDdjq8r9wHJYBEhj\\DR_Train\\LRV4306'

print(f'Começando o processamento do vagão referente aos arquivos em: {path}...')
acc_path = f'{path}\\accelerometer_data\\'
gps_path = f'{path}\\gps_data\\'

acc_names = os.listdir(acc_path)
gps_names = os.listdir(gps_path)

Começando o processamento do vagão referente aos arquivos em: H:\.shortcut-targets-by-id\1oKn7IN7zznQuhwjDCDdjq8r9wHJYBEhj\DR_Train\LRV4306...


# Dados de acelerômetros

Carregando dados dos acelerômetros

In [73]:
# Carregamento dos dados dos acelerômetros

print('Carregando os dados dos acelerômetros...')
data = {}
for file in tqdm(acc_names):
    if file[-4:] == '.ini':
        pass
    else:
        splited_name = file.split('_')
        date = splited_name[1]
        daily_passing = splited_name[3]
        region_number = splited_name[4]
        running_direction = splited_name[5]
        sensor = splited_name[-1][0]

        if region_number == '5':
            try:    
                with h5py.File(f'{acc_path}/{file}', 'r') as f:
                    sensor_data = f.get('save_var')
                    sensor_data = np.array(sensor_data)  
            except:
                sensor_data = io.loadmat(f'{acc_path}/{file}')
                sensor_data = sensor_data['save_var'] 

                sensor_data = sensor_data.reshape((-1,))
                sensor_data = sensor_data.astype('float32')

            if not data.get(f'{date}_{daily_passing}_{running_direction}'):
                    data[f'{date}_{daily_passing}_{running_direction}'] = {}
                    data[f'{date}_{daily_passing}_{running_direction}'][f'sensor_{sensor}'] = sensor_data


Carregando os dados dos acelerômetros...


  3%|▎         | 910/30837 [00:07<04:03, 122.93it/s]


NotImplementedError: Please use HDF reader for matlab v7.3 files

Concatenação dos dados dos acelerômetros

In [63]:
data

{'20150108_4_1': {'sensor_5': array([[ 0.006091, -0.016158, -0.030445, ..., -0.010565, -0.0115  ,
           0.001217]])},
 '20150108_4_2': {'sensor_2': array([[ 0.001274,  0.004135,  0.003105, ...,  0.002773, -0.00501 ,
          -0.006026]])},
 '20150122_1_2': {'sensor_1': array([[ 0.030163,  0.028689,  0.031952, ..., -0.010695, -0.016363,
          -0.010606]])},
 '20150122_2_1': {'sensor_1': array([[-0.012331, -0.007563, -0.006641, ..., -0.025214, -0.020505,
          -0.025063]])},
 '20150122_2_2': {'sensor_1': array([[0.017919, 0.014726, 0.009779, ..., 0.006092, 0.011879, 0.016927]])},
 '20150122_3_2': {'sensor_1': array([[0.003898, 0.003687, 0.007826, ..., 0.012047, 0.016206, 0.019628]])},
 '20150730_2_2': {'sensor_1': array([[-0.036307, -0.038331, -0.040665, ..., -0.032973, -0.026514,
          -0.019243]])},
 '20150819_1_1': {'sensor_1': array([[ 0.002055,  0.009343,  0.007895, ..., -0.045241, -0.031   ,
          -0.027606]])},
 '20150819_1_2': {'sensor_1': array([[ 0.000658,

In [46]:
print('Concatenação dos dados...')
full_df = pd.DataFrame()
for key, values in tqdm(data.items()):
    info = key.split('_')
    partial_df = pd.DataFrame()
    for k in values.keys():
        partial_df[k] = values[k]

    partial_df['date'] = int(info[0])
    partial_df['daily_passing'] = int(info[1])
    partial_df['running_direction'] = int(info[2])
    
    if full_df.empty:
        full_df = partial_df
    else:
        full_df = pd.concat([full_df, partial_df])

full_df.reset_index(inplace=True, drop=True)

Concatenação dos dados...


  0%|          | 0/311 [00:00<?, ?it/s]


ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [None]:
# Logs

non_nulls = full_df.isnull().sum()
nulls = non_nulls.sum()

print(f'Número de dados nulos: {nulls}')


# Registro dos dados de acelerômetro em .parquet

print('Salvando os dados em .parquet...')

os.makedirs('accel_data/', exist_ok=True)

wagon = path.split('/')[-1]
full_df.to_parquet(f'accel_data/{wagon}_acc.parquet', index=False)

# Dados de GPS

Carregamento dos dados dos GPS

In [84]:
data_gps = {}
print('Carregando os dados dos GPS...')
for file in tqdm(gps_names):
    if file[-4:] == '.ini':
        pass
    else:
        splited_name = file.split('_')
        date = splited_name[1]
        daily_passing = splited_name[3]
        region_number = splited_name[4]
        running_direction = splited_name[-1][0]

        if region_number == '5':
            gps_data = io.loadmat(f'{gps_path}/{file}')
            gps_data = gps_data['save_var_gps']

            if not data_gps.get(f'{date}_{daily_passing}_{running_direction}'):
                data_gps[f'{date}_{daily_passing}_{running_direction}'] = {}

            for i in range(gps_data.shape[1]):
                data_gps[f'{date}_{daily_passing}_{running_direction}'][f'gps_{i}'] = gps_data[:, i]

Carregando os dados dos GPS...


100%|██████████| 6020/6020 [00:09<00:00, 635.31it/s]


Concatenação dos dados de gps

In [74]:
print('Concatenação dos dados...')
full_df = pd.DataFrame()
for key, values in tqdm(data_gps.items()):
    info = key.split('_')
    partial_df = pd.DataFrame()
    for k in values.keys():
        partial_df[k] = values[k]

    partial_df['date'] = int(info[0])
    partial_df['daily_passing'] = int(info[1])
    partial_df['running_direction'] = int(info[2])
    
    if full_df.empty:
        full_df = partial_df
    else:
        full_df = pd.concat([full_df, partial_df])

full_df.reset_index(inplace=True, drop=True)

Concatenação dos dados...


100%|██████████| 295/295 [00:02<00:00, 120.23it/s]


In [81]:
path.split('\\')[-1]

'LRV4306'

In [83]:
# Logs

non_nulls = full_df.isnull().sum()
nulls = non_nulls.sum()

print(f'Número de dados nulos: {nulls}')

# Registro dos dados de acelerômetro em .parquet

print('Salvando os dados em .parquet...')

os.makedirs('gps_data/', exist_ok=True)

wagon = path.split('\\')[-1]
full_df.to_parquet(f'gps_data/{wagon}_gps.parquet', index=False)

Número de dados nulos: 0
Salvando os dados em .parquet...
