In [None]:
import concurrent
import concurrent.futures
import datetime
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from tqdm.contrib.concurrent import process_map

sys.path.append('./rtaUtils')

from rtaUtils.paths import *
from rtaUtils import sort_vectors, data_cleaning, common, data_loading

# Sort state vectors

In [None]:
if __name__ == '__main__':
    # Re-sort the state vectors of every day in the period and store one
    # parquet file per day under sorted_data_path.
    dates = common.get_dates_between(date_start = '2022-01-01',
                                     date_end   = '2022-09-30')

    for date in dates:
        day_label = date.strftime("%Y-%m-%d")
        try:
            flights = data_loading.load_raw_data_sort(date)
        except IndexError:
            # Loading raised IndexError for this day (presumably no raw data is
            # available for it -- TODO confirm). Report the day instead of
            # dropping it silently, so gaps in the output are traceable.
            print(f'Skipped:   {day_label} (IndexError while loading raw data)')
            continue
        indices = data_loading.calculate_indexes(flights)

        # One independent DataFrame per trajectory; label-based .loc slicing
        # includes the row at end_index.
        us_data = [flights.loc[start_index:end_index].copy()
                   for _, (fpId, start_index, end_index) in indices.iterrows()]

        time_start = datetime.datetime.now().strftime("%H:%M:%S")
        # end='\r' lets the final "Processed" line overwrite this progress line.
        print(f'{time_start} Processing: {day_label} ({len(us_data)} tray)', end='\r')

        # Fix each trajectory in a separate worker process.
        with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
            result = list(executor.map(sort_vectors.fix_trajectory, us_data))

        # pd.concat fails on an empty list, so only write days that produced data.
        if result:
            output = pd.concat(result).sort_values(['ordenFinal'])
            output.to_parquet(sorted_data_path / f'{date.strftime("%Y%m%d")}.parquet')

        print(f'{time_start} Processed: {day_label} ({len(us_data)} tray)   ')

# Merge and fix data

In [None]:
# Months to process, formatted as YYYYMM (202201 .. 202209).
months = [f'2022{x:02d}' for x in range(1, 10)]

for month in months:
    output_data = data_loading.load_sorted_data(month)

    # Recompute RTA from the touchdown time and the corrected timestamps.
    output_data['RTA'] = output_data['touchdown'] - output_data['timestamp']
    print(f'{month}: {output_data.shape[0]:,} (RTA negativo: {(output_data["RTA"] <= 0).sum()})')

    output_data = (
        output_data
        # Drop vectors with RTA<=0 and vectors flagged as on the ground
        # (irrelevant for estimation).
        .loc[lambda df: df['RTA'] > 0]
        .loc[lambda df: ~df['ground']]
        # Cleaning steps from data_cleaning, applied in this fixed order.
        .pipe(data_cleaning.fill_missing_data)
        .pipe(data_cleaning.remove_incorrect)
        .pipe(data_cleaning.modify_data_types)
        .pipe(data_cleaning.include_additional_columns)
        .pipe(data_cleaning.drop_columns)
        .pipe(data_cleaning.drop_duplicates_sort)
    )

    # NOTE(review): the monthly file goes into sorted_data_path, the same
    # directory that holds the daily files; confirm load_sorted_data(month)
    # does not pick this file up if the cell is run again.
    output_data.to_parquet(sorted_data_path / f'{month}.parquet')

# Release the last month's frame; kernel memory persists across cells.
del output_data

# Remove outliers 

In [None]:
if __name__ == '__main__':
    # Detect outliers flight by flight for every month and store the rows
    # that are not flagged under clean_data_path.
    months = [f'2022{str(x).rjust(2,"0")}' for x in range(1,10)]

    for month in months:
        data = pd.read_parquet(sorted_data_path / f'{month}.parquet')
        indices = data_loading.calculate_indexes(data)

        # One independent DataFrame per flight; the positional slice includes row `end`.
        flight_frames = [data.iloc[start:end+1].copy()
                         for idx, (fpId, start, end) in indices.iterrows()]

        # process_map creates and manages its own process pool, so no separate
        # executor is needed; max_workers is passed here to cap it at 8 workers.
        result = process_map(data_cleaning.detect_outliers, flight_frames,
                             max_workers=8, chunksize=1, desc=month)

        # pd.concat fails on an empty list; skip months without any flight.
        if not result:
            print(f'{month}: no flights to process')
            continue

        flights = pd.concat(result)
        print(f'{month}: {flights.is_outlier.sum():>6}/{flights.shape[0]:>6}')

        # Keep only the vectors that were not flagged as outliers.
        flights = flights[~flights.is_outlier]

        flights.to_parquet(clean_data_path / f'{month}.parquet')

In [None]:
# Load the cleaned data of February 2022 for inspection.
# NOTE(review): the outlier-removal cell drops flagged rows before writing to
# clean_data_path, so is_outlier is presumably False for every row here -- confirm.
flights = pd.read_parquet(clean_data_path / '202202.parquet')

In [None]:
# Top 20 (fpId, flightDate) groups by number of vectors flagged as outliers.
# The aggregation is named by string: passing the builtin `sum` is deprecated in pandas.
(flights.groupby(['fpId', 'flightDate'])
        .agg({'is_outlier': 'sum'})
        .sort_values('is_outlier', ascending=False)
        .head(20))

In [None]:
# Map of every vector of one flight, coloured by its outlier flag.
px.scatter_mapbox(
    flights.loc[flights['fpId'] == 'AT06057635'],
    lat='latitude',
    lon='longitude',
    color='is_outlier',
    hover_data=['ordenInicial', 'ordenFinal', 'track', 'altitude', 'timestamp'],
    mapbox_style='open-street-map',
    zoom=4,
    height=725,
)

In [None]:
from importlib import reload

# Re-import sort_vectors so edits to the module are picked up without
# restarting the kernel.
sort_vectors = reload(sort_vectors)

# Sorted state vectors of a single day (file 20220207.parquet).
flights2 = pd.read_parquet(sorted_data_path / '20220207.parquet')

In [None]:
px.scatter_mapbox(
#         flights2[flights2.fpId == 'AT06300603'],
        pepe,
        lat='latitude', lon='longitude',
        height=725, zoom=4, 
        mapbox_style='open-street-map',
        hover_data=['ordenInicial','ordenFinal','track','altitude','timestamp', 'flightDate'], 
        color='reordenado'
     )

In [None]:

pepe = flights2[flights2.fpId == 'AT06072256'].copy()

acc = [set(), set()]
pepe['reordenado'] = pepe[['ordenFinal','ordenInicial']].astype(int).apply(sort_vectors.is_resorted, args=[acc, True], axis=1)