# About this Notebook

The purpose of this notebook is to import the data from each flight into a single h5 file containing all the flights, referenced by key. It also shows how to convert an h5 file to a pandas dataframe.

# Import packages

In [1]:
import pandas as pd
import h5py
from tqdm.notebook import tqdm_notebook
from pyproj import Transformer
import numpy as np

# 0 - Read Flight Data

Here we create a function that allows us to read the h5 data in a pandas dataframe from a single flight file.

In [2]:
def read_filght_data(flight_number, verbose=False):
    """
    Read h5 flight data and convert it to a pandas dataframe

    Every non-scalar dataset of `../data/raw/Flt100{flight_number}-train.h5`
    becomes one column of the dataframe. The columns are then renamed with the
    mapping read from `../data/raw/datafields.csv`, and the rows are sorted and
    indexed by `TIME`.
    
    Arguments:
    - `flight_number` : number of the flight we want to convert to a dataframe
    - `verbose` : print keys with NaNs and infos
    
    Returns:
    - `df` : pandas dataframe containing data from the flight, sorted by `TIME`
             and indexed by it (index name: 'Time [s]')

    Raises:
    - `KeyError` : if a field listed in `datafields.csv` is not a column of the
                   flight data (the rename is done with `errors="raise"`)
    """
    
    # Import h5 flight data
    file_path = f'../data/raw/Flt100{flight_number}-train.h5'
    
    # Put flight data in a dataframe.
    # The context manager closes the HDF5 file as soon as everything is copied.
    df = pd.DataFrame()
    with h5py.File(file_path, 'r') as flight_data:
        for key in flight_data.keys():
            data = flight_data[key]
            if data.shape != ():
                # Array dataset -> one column
                df[key] = data[:]
                # Test `verbose` first so the NaN scan is skipped when not needed
                if verbose and df[key].isnull().any():
                    print(f'{key} contains NaN(s)')
            elif verbose:
                # Scalar dataset (shape `()`): only reported, not stored in the dataframe
                print(f'{key} = {data[()]}')
    
    # Rename the column names for flights 6 and 7 to match the column names of the other flights.
    # (`flight_number == 7 or 6` was always true because `6` is truthy.)
    if flight_number in (6, 7):
        df = df.rename(columns={'line':'tie_line','utm_x':'utmX','utm_y':'utmY',
                                'utm_z':'utmZ','msl':'alt','ins_yaw':'ins_azim',
                                'pitch_rate':'pitch_rt','roll_rate':'roll_rt',
                                'yaw_rate':'yaw_rt','lgtl_acc':'lon_acc',
                                'ltrl_acc':'lat_acc','nrml_acc':'alt_acc',
                                'tas':'true_as','vol_srvo':'vol_servo'})
        
    # Rename the columns according to the challenge data fields
    # (first csv column = current name, second csv column = new name)
    datafields = pd.read_csv('../data/raw/datafields.csv',
                             header=None,
                             index_col=0).to_dict()[1]
    df = df.rename(columns=datafields,
                   errors="raise")

    # index by TIME (sort); the TIME column itself is kept
    df = df.sort_values(by=['TIME'])
    df.index = df['TIME']
    df.index.name = 'Time [s]'
    
    return df

In [3]:
# Smoke test on flight number 3: verbose mode prints the scalar fields and the keys holding NaNs
flight_number = 3
df = read_filght_data(flight_number=flight_number, verbose=True)

N = 160030
drape contains NaN(s)
dt = 0.09611163227016886
ogs_alt contains NaN(s)
ogs_mag contains NaN(s)


"NOTE: The dt field in each HDF5 file is incorrect. The correct value is 0.1."

In [4]:
# Sanity check of the import: summary statistics of the three FLUXB_* columns
df.loc[:, ['FLUXB_X', 'FLUXB_Y', 'FLUXB_Z']].describe()

Unnamed: 0,FLUXB_X,FLUXB_Y,FLUXB_Z
count,160030.0,160030.0,160030.0
mean,34805.294581,32631.568086,-8968.910635
std,10137.198973,8970.374849,13916.57843
min,-15.877,-4469.929,-30561.526
25%,25884.5125,25396.999,-21958.47975
50%,35410.303,34002.262,-10081.293
75%,44255.286,40194.784,5911.287
max,54512.841,51630.098,15810.443


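Checking our understanding of the geographic conventions: we convert (LAT, LONG) to UTM coordinates with pyproj and compare them with the UTM coordinates stored in the dataset: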

In [5]:
# Source CRS: EPSG:4326, World Geodetic System 1984, https://epsg.io/4326
# Target CRS: EPSG:32618, WGS 84/UTM zone 18N, https://epsg.io/32618
# NOTE(review): the name says "UTC" but the target CRS is a UTM zone; name kept as-is.
WGS_to_UTC = Transformer.from_crs(crs_from=4326, crs_to=32618)

# Transform (LAT, LONG) -> (X_UTM, Y_UTM)
UTM_X_pyproj, UTM_Y_pyproj = WGS_to_UTC.transform(df['LAT'].values, df['LONG'].values)

# Check if the converted coordinates and the dataset coordinates are equal (+/- 1.4cm):
# the planar distance between both positions must stay below 0.014 for every sample.
bool((np.sqrt((df.UTM_X - UTM_X_pyproj)**2 + (df.UTM_Y - UTM_Y_pyproj)**2) < 0.014).all())

True

# 1 - Export data

The data of the 6 flights are stored in HDF5 files :
* Flt1002-train.h5
* Flt1003-train.h5
* Flt1004-train.h5
* Flt1005-train.h5
* Flt1006-train.h5
* Flt1007-train.h5

To make data access and readability easier, we group all flights in a single h5 file. In the next cell, we export all flights in a single h5 file named 'Flt_data.h5', we also export flights to csv.

In [6]:
# Flights 2 to 7: load each one, then store it under its own key in the shared
# HDF5 file and in its own csv file.
for flight_number in tqdm_notebook(range(2, 8)):
    df = read_filght_data(flight_number)

    # export to HDF5 (one key per flight, all in the same file)
    df.to_hdf('../data/interim/Flt_data.h5', key=f'Flt100{flight_number}')

    # export to csv (one file per flight)
    df.to_csv(f'../data/interim/Flt_data_csv/Flt100{flight_number}.csv')

  0%|          | 0/6 [00:00<?, ?it/s]

Let's check if the import works properly :

In [14]:
# Read the last exported flight back from the combined HDF5 file
df2 = pd.read_hdf('../data/interim/Flt_data.h5',
                  key=f'Flt100{flight_number}')

# Compare it with the dataframe still in memory.
# `all(df2 == df)` iterated over the column labels (non-empty strings, always
# truthy), so it returned True whatever the data. `DataFrame.equals` compares
# the elements themselves and treats NaNs at the same location as equal.
df2.equals(df)

True