------------

<h2>Notebook to transform each flight into one line and aggregate all of them to create one parquet file for each aircraft</h2>

In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np
import gc
import os
import copy

In [2]:
# Reduces the size of the dataframe by converting columns to smaller data types
def convert_columns(df_filtered):
    """Shrink a DataFrame's memory footprint by narrowing 64-bit numeric columns.

    ``float64`` columns are cast to ``float32``. ``int64`` columns are cast to
    ``int32`` only when every value fits in the ``int32`` range; otherwise the
    column is left as ``int64``, because a blind cast would silently corrupt
    the out-of-range values. Columns of any other dtype are left untouched.

    Args:
        df_filtered: DataFrame to convert. Its columns are reassigned in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    int32_limits = np.iinfo(np.int32)

    for column in df_filtered.columns:
        values = df_filtered[column]

        if values.dtype == 'float64':
            df_filtered[column] = values.astype('float32')
        elif values.dtype == 'int64':
            # An empty column has no usable min/max, and trivially fits.
            fits_int32 = values.empty or (
                values.min() >= int32_limits.min
                and values.max() <= int32_limits.max
            )
            if fits_int32:
                df_filtered[column] = values.astype('int32')

    return df_filtered

In [3]:
def process_files_in_directory(dirpath, output_directory, short_flight_max_rows=72000):
    """Scan the parquet files of one directory and report faulty and short flights.

    Every ``*.parquet`` file directly inside ``dirpath`` is loaded and passed
    through ``convert_columns``. Two reports are then built:

    * flights with errors: files where ``message0418DAA-1`` or
      ``message0422DAA-1`` holds at least one value that is neither null nor 0
      (only checked when both columns are present in the file);
    * short flights: files with fewer than ``short_flight_max_rows`` rows.

    Each non-empty report is written to
    ``<output_directory>/Voos_com_erro/voos_erro_<dir name>.parquet`` or
    ``<output_directory>/Voos_curtos/voos_curtos_<dir name>.parquet``. The
    report sub-directories are created when they do not exist yet.

    Args:
        dirpath: Directory holding the flight parquet files (not recursed into).
        output_directory: Root directory under which the reports are written.
        short_flight_max_rows: Row count below which a flight is reported as
            short. Defaults to 72000.

    Returns:
        None. A file that cannot be processed is reported on stdout and skipped.
    """
    # For storing error-related data
    error_data_list = []
    # For storing data related to short flights
    short_flights_data_list = []

    # Sorted so that the rows of the reports come out in a reproducible order.
    for filename in sorted(os.listdir(dirpath)):
        if not filename.endswith('.parquet'):
            continue

        file_path = os.path.join(dirpath, filename)
        df = None
        try:
            df = convert_columns(pd.read_parquet(file_path))

            # The error report needs both message columns to be present
            if "message0418DAA-1" in df.columns and "message0422DAA-1" in df.columns:
                error1 = df['message0418DAA-1']
                error2 = df['message0422DAA-1']

                # A row counts as an error when the message is neither null nor 0
                error_count_erro1 = int((error1.notna() & (error1 != 0)).sum())
                error_count_erro2 = int((error2.notna() & (error2 != 0)).sum())

                if error_count_erro1 > 0 or error_count_erro2 > 0:
                    error_data_list.append({
                        "nome": filename,
                        "quantidade_message0418DAA-1": error_count_erro1,
                        "quantidade_message0422DAA-1": error_count_erro2,
                        "valores_error1": error1.unique().tolist(),
                        "valores_error2": error2.unique().tolist()
                    })

            # Flights with fewer rows than the threshold are reported as short
            row_count = df.shape[0]
            if row_count < short_flight_max_rows:
                short_flights_data_list.append({
                    "nome": filename,
                    "tempo": row_count
                })

        except Exception as e:
            print(f"Error reading the file {file_path}: {e}")
        finally:
            # Release the frame before loading the next file, even after a failure
            del df
            gc.collect()

    # normpath drops a trailing separator, which would otherwise make basename ''
    directory_name = os.path.basename(os.path.normpath(dirpath))

    # Save the consolidated error data if it exists
    if error_data_list:
        error_path = os.path.join(output_directory, "Voos_com_erro")
        os.makedirs(error_path, exist_ok=True)
        error_file = os.path.join(error_path, f"voos_erro_{directory_name}.parquet")
        pd.DataFrame(error_data_list).to_parquet(error_file)

    # Save the consolidated short flights data if it exists
    if short_flights_data_list:
        short_path = os.path.join(output_directory, "Voos_curtos")
        os.makedirs(short_path, exist_ok=True)
        short_flights_file = os.path.join(short_path, f"voos_curtos_{directory_name}.parquet")
        pd.DataFrame(short_flights_data_list).to_parquet(short_flights_file)


In [2]:
def detect_cutted_flights(dirpath, output_directory):
    """Report the flight files of one directory that lack required phase values.

    For every ``*.parquet`` file directly inside ``dirpath`` only the columns
    ``phaseOfFlight-1`` and ``phaseOfFlightNavigation-1`` are loaded. A file is
    reported when ``phaseOfFlightNavigation-1`` does not contain all of the
    values 1, 2, 3, 5, 6, 7, or ``phaseOfFlight-1`` does not contain all of the
    values 0, 1, 2 (presumably meaning the recording was cut; the meaning of
    the individual codes is not defined in this notebook).

    When at least one file is reported, a single consolidated report is written
    once, after all files were scanned, to
    ``<output_directory>/Voos_cortados/voos_cortados_<dir name>.parquet``. The
    ``Voos_cortados`` directory is created when it does not exist yet.

    Args:
        dirpath: Directory holding the flight parquet files (not recursed into).
        output_directory: Root directory under which the report is written.

    Returns:
        None. A file that cannot be read is reported on stdout and skipped.
    """
    required_navegacao_values = {1, 2, 3, 5, 6, 7}
    required_fase_values = {0, 1, 2}
    # For storing data related to flights that miss required phase values
    missing_data_list = []

    # Sorted so that the rows of the report come out in a reproducible order.
    for filename in sorted(os.listdir(dirpath)):
        if not filename.endswith('.parquet'):
            continue

        file_path = os.path.join(dirpath, filename)
        try:
            # Only the two phase columns are needed, so load nothing else
            df = pd.read_parquet(file_path, columns=["phaseOfFlight-1", "phaseOfFlightNavigation-1"])

            unique_navegacao = df['phaseOfFlightNavigation-1'].unique().tolist()
            unique_fase = df['phaseOfFlight-1'].unique().tolist()

            # Report the file unless every required value is present in both columns
            has_all_navegacao = required_navegacao_values.issubset(unique_navegacao)
            has_all_fase = required_fase_values.issubset(unique_fase)
            if not (has_all_navegacao and has_all_fase):
                missing_data_list.append({
                    "nome": filename,
                    "phaseOfFlightNavigation-1": unique_navegacao,
                    "phaseOfFlight-1": unique_fase
                })

        except Exception as e:
            print(f"Error reading the file {file_path}: {e}")

    # Write the consolidated report once, after the whole directory was scanned
    if missing_data_list:
        cutted_path = os.path.join(output_directory, "Voos_cortados")
        os.makedirs(cutted_path, exist_ok=True)
        # normpath drops a trailing separator, which would otherwise make basename ''
        directory_name = os.path.basename(os.path.normpath(dirpath))
        cutted_flights_file = os.path.join(cutted_path, f"voos_cortados_{directory_name}.parquet")
        pd.DataFrame(missing_data_list).to_parquet(cutted_flights_file)


In [3]:
# Paths to the data files. The defaults below are machine-specific; set the
# FLIGHT_DATASETS_DIR / FLIGHT_OUTPUT_DIR environment variables to run the
# notebook against another directory layout without editing this cell.
directory_path = os.environ.get(
    "FLIGHT_DATASETS_DIR",
    "/Users/henriquematias/Documents/GitHub/Inteli-Modulo-7/Projeto_Grupo1_Inteli_Azul/notebooks/Datasets",
)
output_directory = os.environ.get(
    "FLIGHT_OUTPUT_DIR",
    "/Users/henriquematias/Documents/GitHub/Inteli-Modulo-7/Projeto_Grupo1_Inteli_Azul/notebooks/Output",
)

# os.walk yields nothing for a missing directory, which would make this cell
# finish "successfully" without checking a single flight.
if not os.path.isdir(directory_path):
    raise FileNotFoundError(f"Dataset directory not found: {directory_path}")

# Making sure that the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Visit the dataset root and every directory below it; each one is scanned
# for parquet files by detect_cutted_flights
for dirpath, dirnames, filenames in os.walk(directory_path):
    detect_cutted_flights(dirpath, output_directory)

------

<h4>CÓDIGOS PARA USO POSTERIOR</h4>

In [None]:
# Map every column name of `df` to the array of distinct values in that column
unique_values = {column: df[column].unique() for column in df.columns}
unique_values

In [None]:
# Keep only the columns selected by `columns_needed`, discarding all others.
# NOTE(review): neither `df` nor `columns_needed` is defined in the visible
# cells of this notebook; both must exist before this cell is run.
df = df.loc[:, columns_needed]