In [1]:
import pandas as pd
from datetime import timedelta
import numpy as np

In [2]:
flights_df = pd.read_csv('Datasets/Merged_Flights.csv')
flights_df.shape

  flights_df = pd.read_csv('Datasets/Merged_Flights.csv')


(26490448, 18)

In [3]:
flights_df.drop_duplicates(subset=['icao24', 'last_position'], inplace = True)
flights_df = flights_df[['timestamp', 'icao24', 'latitude', 'longitude', 'ground_speed', 'track', 'vertical_rate', 'baro_altitude']]
flights_df.shape

(19486764, 8)

In [4]:

flights_df = flights_df[(flights_df['longitude'] <= 20) & (flights_df['longitude'] >= -20)]
flights_df = flights_df[(flights_df['baro_altitude'] >= 10000) & (flights_df['baro_altitude'] <= 50000)]
flights_df = flights_df[(flights_df['vertical_rate'] >= -4500) & (flights_df['vertical_rate'] <= 4500)]
flights_df.shape

(15274354, 8)

In [5]:
# Convert 'timestamp' and 'last_position' to datetime
flights_df['timestamp'] = pd.to_datetime(flights_df['timestamp'])
flights_df.shape

(15274354, 8)

In [6]:
flights_df.isnull().sum(), flights_df.shape

(timestamp        0
 icao24           0
 latitude         0
 longitude        0
 ground_speed     0
 track            0
 vertical_rate    0
 baro_altitude    0
 dtype: int64,
 (15274354, 8))

In [7]:
flights_df.dropna(inplace = True)
flights_df.isnull().sum(), flights_df.shape

(timestamp        0
 icao24           0
 latitude         0
 longitude        0
 ground_speed     0
 track            0
 vertical_rate    0
 baro_altitude    0
 dtype: int64,
 (15274354, 8))

In [None]:
flights_df_sorted = flights_df.sort_values(by=['icao24', 'timestamp'])

In [None]:
# Downsampling the dataset by keeping one in every 120 data points
downsampled_df = flights_df_sorted.iloc[::120, :]
downsampled_df.shape

In [None]:
# Function to find the row with the closest timestamp to the target time
def find_first_future_row(df, target_time, icao24):
    # finds the first row that matches the original row time + 10 minutes
    future_rows = df[(df['icao24'] == icao24) & (df['timestamp'] >= target_time) & (df['timestamp'] <= target_time + timedelta(minutes=2))]
    # returns the row if it exists, otherwise returns None (to filter nan values if no future point exists)
    return future_rows.iloc[0] if not future_rows.empty else None

# Adding new columns for future position and altitude
downsampled_df['latitude_in_10min'] = np.nan
downsampled_df['longitude_in_10min'] = np.nan
downsampled_df['baro_altitude_in_10min'] = np.nan

rows = downsampled_df.shape[0]
n=0
# Iterating over each row in the subset
for index, row in downsampled_df.iterrows():
    future_time = row['timestamp'] + timedelta(minutes=10)
    future_row = find_first_future_row(downsampled_df, future_time, row['icao24'])
    if future_row is not None:
        downsampled_df.at[index, 'latitude_in_10min'] = future_row['latitude']
        downsampled_df.at[index, 'longitude_in_10min'] = future_row['longitude']
        downsampled_df.at[index, 'baro_altitude_in_10min'] = future_row['baro_altitude']
    n += 1
    if n % 1000 == 0:
        print(round((n/rows)*100,1), '%')



# Displaying the modified subset to check the results
downsampled_df.head()
downsampled_df.isna().sum()

In [None]:
downsampled_df.dropna(inplace = True)

In [None]:
downsampled_df['Climbing'] = downsampled_df['vertical_rate'] > 0
downsampled_df['Descending'] = downsampled_df['vertical_rate'] < 0
downsampled_df['Cruise'] = downsampled_df['vertical_rate'] == 0

In [None]:
downsampled_df

In [None]:
downsampled_df.to_csv('Datasets/Cleaned_prepared_data.csv')