<a href="https://colab.research.google.com/github/MGentieu/Data_analytics_aircraft_messages/blob/main/TP3/GENTIEU_Martin_TP2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GENTIEU Martin et GOUESSE Sixtine : TP3 de data analytics


## Préparation des données utiles pour le TP3 :

###On se place dans un premier temps dans le répertoire parent

In [1]:
import os
import sys
from pathlib import Path

# Detect working directory
cwd = Path.cwd().resolve()

# If in Colab (/content), search for repo folder automatically
if cwd.name == "content":
    # Look for the cloned project automatically
    candidates = [d for d in cwd.iterdir() if d.is_dir() and "aircraft" in d.name.lower()]

    if not candidates:
        raise FileNotFoundError(
            f"Aucun dossier contenant 'aircraft' trouvé dans /content.\n"
            f"Dossiers présents : {os.listdir(cwd)}"
        )

    PROJECT_ROOT = candidates[0]/"TP3"
    DATA_ROOT = candidates[0]/"TP2"
else:
    PROJECT_ROOT = cwd
    DATA_ROOT = cwd

print(f"Detected project root: {PROJECT_ROOT}")
print(f"Detected data root: {DATA_ROOT}")

# Validate directory
if not PROJECT_ROOT.exists():
    raise FileNotFoundError(f"Missing directory at {PROJECT_ROOT}")

# Add to Python path
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("Setup complete.")


Detected project root: /content/Data_analytics_aircraft_messages/TP3
Detected data root: /content/Data_analytics_aircraft_messages/TP2
Setup complete.


On importe ensuite les bibliothèques nécessaires à l'analyse :

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import folium
import shapely
import socket
import glob
import csv
from datetime import datetime

In [3]:
HOST = "sbs.glidernet.org"
PORT = 30003

cols = [
    "MessageType", "TransmissionType", "SessionID", "AircraftID", "HexIdent", "FlightID",
    "DateGenerated", "TimeGenerated", "DateLogged", "TimeLogged", "Callsign", "Altitude",
    "GroundSpeed", "Track", "Latitude", "Longitude", "VerticalRate", "Squawk", "Alert",
    "Emergency", "SPI", "IsOnGround"
]

# Cherche un fichier existant data_TP2_*.csv
existing_files = glob.glob(f"{DATA_ROOT}/adsb_data_*.csv")
output_file=""
if existing_files:
    # Si plusieurs → on prend le plus récent
    output_file = max(existing_files, key=os.path.getmtime)
    print(f"Fichier existant trouvé : {output_file}")

Fichier existant trouvé : /content/Data_analytics_aircraft_messages/TP2/adsb_data_20251120_091555.csv


###Récupération des données utiles pour un avion (en utilisant l'identifiant ICAO)

In [4]:
df = pd.read_csv(output_file)
split_df = df["Message"].str.split(",", expand=True)

# Nommer les colonnes SBS-1
colnames = [
    "MessageType", "TransmissionType", "SessionID", "AircraftID", "HexIdent", "FlightID",
    "DateGenerated", "TimeGenerated", "DateLogged", "TimeLogged", "Callsign", "Altitude",
    "GroundSpeed", "Track", "Latitude", "Longitude", "VerticalRate", "Squawk", "Alert",
    "Emergency", "SPI", "IsOnGround"
]

split_df.columns = colnames[:split_df.shape[1]]

df=split_df.copy()

Analyse initiale du dataset :

In [5]:
df.head()

Unnamed: 0,MessageType,TransmissionType,SessionID,AircraftID,HexIdent,FlightID,DateGenerated,TimeGenerated,DateLogged,TimeLogged,...,GroundSpeed,Track,Latitude,Longitude,VerticalRate,Squawk,Alert,Emergency,SPI,IsOnGround
0,MSG,3,,,A2A3B6,,2025/11/20,08:15:55.243,2025/11/20,08:15:55.243,...,202.0,250.0,50.11797,8.83735,,,0,0,0,0
1,MSG,3,,,043EA0,,2025/11/20,08:15:55.245,2025/11/20,08:15:55.245,...,0.0,0.0,50.0,20.0,,,0,0,0,0
2,MSG,3,,,45AC4F,,2025/11/20,08:15:55.246,2025/11/20,08:15:55.246,...,330.0,212.0,51.56957,4.85748,,,0,0,0,0
3,MSG,3,,,440C9B,,2025/11/20,08:15:55.248,2025/11/20,08:15:55.248,...,334.0,109.0,49.13595,10.37468,,,0,0,0,0
4,MSG,3,,,4ACA73,,2025/11/20,08:15:55.249,2025/11/20,08:15:55.249,...,142.0,16.0,60.23327,11.10172,,,0,0,0,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549068 entries, 0 to 549067
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   MessageType       549068 non-null  object
 1   TransmissionType  549063 non-null  object
 2   SessionID         549045 non-null  object
 3   AircraftID        549031 non-null  object
 4   HexIdent          549011 non-null  object
 5   FlightID          548904 non-null  object
 6   DateGenerated     548889 non-null  object
 7   TimeGenerated     548739 non-null  object
 8   DateLogged        546124 non-null  object
 9   TimeLogged        530150 non-null  object
 10  Callsign          518004 non-null  object
 11  Altitude          516259 non-null  object
 12  GroundSpeed       510525 non-null  object
 13  Track             497907 non-null  object
 14  Latitude          487700 non-null  object
 15  Longitude         482444 non-null  object
 16  VerticalRate      481943 non-null  obj

In [7]:
df.describe()

Unnamed: 0,MessageType,TransmissionType,SessionID,AircraftID,HexIdent,FlightID,DateGenerated,TimeGenerated,DateLogged,TimeLogged,...,GroundSpeed,Track,Latitude,Longitude,VerticalRate,Squawk,Alert,Emergency,SPI,IsOnGround
count,549068,549063,549045.0,549031.0,549011,548904.0,548889,548739,546124,530150.0,...,510525.0,497907.0,487700.0,482444.0,481943.0,481927.0,481916,481897,481875,481862
unique,1,2,1.0,1.0,1878,1.0,11,546995,11,519066.0,...,1504.0,1058.0,310228.0,344745.0,1.0,1.0,2,2,2,2
top,MSG,3,,,043EA0,,2025/11/20,08:,2025/11/20,,...,0.0,0.0,50.0,20.0,,,0,0,0,0
freq,549068,549055,549045.0,549031.0,5205,548904.0,548750,35,531844,1629.0,...,18858.0,14883.0,4623.0,4594.0,481943.0,481927.0,481908,481886,481866,481857


###1.2. Extraction des colonnes utiles

In [8]:
df_parsed = pd.DataFrame({
    'icao': df["HexIdent"],
    'registration': df["Callsign"],
    'altitude': pd.to_numeric(df["Altitude"], errors='coerce'),
    'velocity': pd.to_numeric(df["GroundSpeed"], errors='coerce'),
    'heading': pd.to_numeric(df["Track"], errors='coerce'),
    'lat': pd.to_numeric(df["Latitude"], errors='coerce'),
    'lon': pd.to_numeric(df["Longitude"], errors='coerce'),
    'timestamp': df["DateGenerated"] + " " + df["TimeGenerated"]
})

# Convertir le timestamp
df_parsed["timestamp"] = pd.to_datetime(df_parsed["timestamp"], errors='coerce')


Enfin, avant de construire les différents descripteurs, on va filtrer les datasets pour ne garder que les avions pour lesquels nous avons suffisamment de données (au moins 50 messages) :

In [9]:
# Nettoyage : supprimer les lignes avec données essentielles manquantes
df_parsed = df_parsed.dropna(subset=["icao", "lat", "lon", "timestamp"])

# ---- Filtrage des icao revenant 50 fois ou plus ----
icao_counts = df_parsed["icao"].value_counts()
icao_valides = icao_counts[icao_counts >= 50].index

df_parsed = df_parsed[df_parsed["icao"].isin(icao_valides)].reset_index(drop=True)
df_parsed.describe()

Unnamed: 0,altitude,velocity,heading,lat,lon,timestamp
count,472619.0,472619.0,472619.0,472619.0,472619.0,472619
mean,12387.432697,264.844435,171.710295,47.301405,6.572311,2025-11-20 08:29:16.520082176
min,0.0,0.0,0.0,-44.67492,-118.89745,2025-11-20 08:15:55.243000
25%,4817.0,189.0,91.0,45.32623,2.12547,2025-11-20 08:22:50.683500032
50%,10444.0,279.0,167.0,50.0134,7.80007,2025-11-20 08:29:21.230000128
75%,15959.5,347.0,258.0,51.660975,10.789635,2025-11-20 08:35:42.926500096
max,215033.0,729.0,360.0,70.33962,175.73698,2025-11-20 08:42:17.264000
std,10026.442881,119.926912,102.749224,12.29369,11.2638,


## 1. Construction des descripteurs :

Calcul de la distance totale parcourue avec la méthode Haversine.

Notre dataset contient beaucoup d'erreurs liées aux sauts de valeurs. Nous calculerons donc la distance parcourue à partir des données suivantes :
- La médiane des latitudes et longitudes des 15 premières lignes
- La médiane des latitudes et longitudes des 15 dernières lignes.

On applique ensuite la méthode Haversine sur les données trouvées.

In [10]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371_000  # rayon terrestre en mètres
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    return 2 * R * np.arcsin(np.sqrt(a))

distances = []

for icao, group in df_parsed.groupby("icao"):
    g = group.sort_values("timestamp")

    # 1) 15 premières lignes — médianes
    first15 = g.head(15)
    lat_start = first15["lat"].median()
    lon_start = first15["lon"].median()

    # 2) 15 dernières lignes — médianes
    last15 = g.tail(15)
    lat_end = last15["lat"].median()
    lon_end = last15["lon"].median()

    # 3) Distance Haversine entre les deux points médians
    total_dist = haversine(lat_start, lon_start, lat_end, lon_end)

    distances.append({
        "icao": icao,
        "median_haversine_distance": total_dist
    })

# ---- On transforme en DataFrame puis on merge ----
df_dist = pd.DataFrame(distances)

df_parsed = df_parsed.merge(df_dist, on="icao", how="left")

Calcul de la vitesse (groundspeed) moyenne et maximale :

Par souci des erreurs comprises dans le dataset, au lieu de prendre le maximum, on prend le 95e percentile.

In [11]:
stats_velocity = df_parsed.groupby("icao")["velocity"].agg(
    mean_velocity="mean",
    max_velocity=lambda x: np.nanpercentile(x, 95)
)
df_parsed = df_parsed.merge(stats_velocity, on="icao", how="left")

Calcul du taux de montée médian :

Malgré le fait que l'énoncé nous demande de calculer le taux de montée moyen, les erreurs et pics compris dans le dataset affectent également l'altitude et cela peut amener à une moyenne brute non représentative de ce qui se passe réellement.

Afin de pallier ce problème, on se référera plutôt au taux de montée médian, moins sensible aux irrégularités du dataset.

In [12]:
df_parsed["alt_shift"] = df_parsed.groupby("icao")["altitude"].shift()
df_parsed["dt"] = df_parsed.groupby("icao")["timestamp"].diff().dt.total_seconds()

# Calcul du taux de montée, en évitant les divisions par zéro
df_parsed["climb_rate"] = (df_parsed["altitude"] - df_parsed["alt_shift"]) / df_parsed["dt"].replace(0, np.nan)

stats_climb = df_parsed.groupby("icao")["climb_rate"].median().rename("median_climb_rate")
df_parsed = df_parsed.merge(stats_climb, on="icao", how="left")

Calcul de la variance du cap (variance de l'attribut "heading")

Du fait que ces données concernent des angles en degré, il convient d'utiliser une variance circulaire, se basant sur les radians, afin d'éviter d'avoir une variance fausse qui considère qu'il y a un grand saut entre 1° et 365° (de 2° réellement).



In [13]:
# Fonction de variance circulaire (calcul en radians)
def circular_variance_deg(deg_angles):
    # Conversion en radians
    angles = np.deg2rad(deg_angles.dropna())
    if len(angles) == 0:
        return np.nan

    # composantes moyennes
    sin_mean = np.mean(np.sin(angles))
    cos_mean = np.mean(np.cos(angles))

    # resultant vector R
    R = np.sqrt(sin_mean**2 + cos_mean**2)

    # variance circulaire en radians²
    var_rad = 1 - R

    # Conversion en degrés² pour la lisibilité
    var_deg = np.degrees(np.sqrt(var_rad))**2
    return var_deg

# Calcul par avion
stats_heading_deg = (
    df_parsed.groupby("icao")["heading"]
    .apply(circular_variance_deg)
    .rename("circular_heading_variance_deg")
)

# Ajouter la métrique au dataset
df_parsed = df_parsed.merge(stats_heading_deg, on="icao", how="left")


Dispersion géographique : calcul de l'écart-type des latitudes et longitudes

In [14]:
stats_geo = (
    df_parsed.groupby("icao")[["lat", "lon"]]
    .std()
    .rename(columns={"lat": "std_lat", "lon": "std_lon"})
)
df_parsed = df_parsed.merge(stats_geo, on="icao", how="left")


Calcul de l'indice de linéarité de la trajectoire

In [15]:
# Calcul de l'indice de linéarité en utilisant la distance totale déjà calculée
linearities = []

for icao, group in df_parsed.groupby("icao"):
    g = group.sort_values("timestamp")

    # Points de départ et d'arrivée robustes : médianes des 15 premières et dernières positions
    start_lat = g.head(15)["lat"].median()
    start_lon = g.head(15)["lon"].median()
    end_lat = g.tail(15)["lat"].median()
    end_lon = g.tail(15)["lon"].median()

    # Distance droite
    straight_distance = haversine(start_lat, start_lon, end_lat, end_lon)

    # Distance totale déjà calculée
    total_distance = g["median_haversine_distance"].iloc[0]  # identique pour toutes les lignes de l'avion

    # Indice de linéarité
    linearity_index = straight_distance / total_distance if total_distance > 0 else np.nan

    linearities.append({"icao": icao, "linearity_index": linearity_index})

# Ajouter au dataset
df_linearities = pd.DataFrame(linearities)
df_parsed = df_parsed.merge(df_linearities, on="icao", how="left")



Calcul du nombre de virages dont l'angle dépasse 25°.

Cette mesure est très complexe, car on doit prendre en compte les différentes erreurs présentes dans le dataset ainsi que les écarts de temps qui sont assez petits entre plusieurs mesures.

L'angle d'un virage sera donc calculé comme tel :
- On prend la médiane des caps des 10 lignes précédentes
- On fait la même chose pour les 10 lignes suivantes.

**On fait ensuite le calcul de l'angle en se basant sur ces valeurs médianes.**

Cela limite les erreurs liées aux sauts de valeurs qu'on trouve de manière récurrente dans le dataset.

In [16]:
# Fonction pour calculer l'angle entre médianes en radians
def robust_turn_angle_radians(group):
    # Conversion des headings en radians
    headings = np.deg2rad(group["heading"].values)
    n = len(headings)
    turn_angles = np.full(n, np.nan)

    for i in range(n):
        # Médiane des 10 valeurs précédentes
        start = max(0, i-10)
        end = i
        median_prev = np.median(headings[start:end]) if end > start else headings[i]

        # Médiane des 10 valeurs suivantes
        start_next = i+1
        end_next = min(n, i+11)
        median_next = np.median(headings[start_next:end_next]) if end_next > start_next else headings[i]

        # Calcul de l'angle circulaire
        delta = np.angle(np.exp(1j*median_next) / np.exp(1j*median_prev))
        turn_angles[i] = np.abs(delta)  # reste en radians

    return pd.Series(turn_angles, index=group.index)

# Calcul des angles robustes en radians
df_parsed["turn_angle_rad"] = df_parsed.groupby("icao")["heading"].apply(
    lambda x: robust_turn_angle_radians(pd.DataFrame({"heading": x}))
).reset_index(level=0, drop=True)

# Compter les virages > 25° (conversion 25° → radians)
angle_threshold_rad = np.deg2rad(25)
sharp_turns = (
    df_parsed[df_parsed["turn_angle_rad"] > angle_threshold_rad]
    .groupby("icao")["turn_angle_rad"]
    .count()
    .rename("sharp_turns")
)


# Ajouter la métrique au dataset
df_parsed = df_parsed.merge(sharp_turns, on="icao", how="left")
df_parsed["sharp_turns"] = df_parsed["sharp_turns"].fillna(0).astype(int)


On regarde ensuite comment est construit notre dataset pour vérifier que l'on n'a des résultats cohérents

In [17]:
df_parsed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 472619 entries, 0 to 472618
Data columns (total 21 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   icao                           472619 non-null  object        
 1   registration                   472619 non-null  object        
 2   altitude                       472619 non-null  float64       
 3   velocity                       472619 non-null  float64       
 4   heading                        472619 non-null  float64       
 5   lat                            472619 non-null  float64       
 6   lon                            472619 non-null  float64       
 7   timestamp                      472619 non-null  datetime64[ns]
 8   median_haversine_distance      472619 non-null  float64       
 9   mean_velocity                  472619 non-null  float64       
 10  max_velocity                   472619 non-null  float64       
 11  

In [18]:
df_parsed.describe()

Unnamed: 0,altitude,velocity,heading,lat,lon,timestamp,median_haversine_distance,mean_velocity,max_velocity,alt_shift,dt,climb_rate,median_climb_rate,circular_heading_variance_deg,std_lat,std_lon,linearity_index,turn_angle_rad,sharp_turns
count,472619.0,472619.0,472619.0,472619.0,472619.0,472619,472619.0,472619.0,472619.0,471233.0,471233.0,471213.0,472619.0,472619.0,472619.0,472619.0,466227.0,472619.0,472619.0
mean,12387.432697,264.844435,171.710295,47.301405,6.572311,2025-11-20 08:29:16.520082176,118433.732571,264.844435,318.458133,12389.948138,2.509357,61.94403,1.958362,567.019505,0.202913,0.357843,1.0,0.113204,31.189516
min,0.0,0.0,0.0,-44.67492,-118.89745,2025-11-20 08:15:55.243000,0.0,0.0,0.0,0.0,0.0,-1499667.0,-55.271998,0.0,0.0,0.0,1.0,0.0,0.0
25%,4817.0,189.0,91.0,45.32623,2.12547,2025-11-20 08:22:50.683500032,40913.760169,228.978763,272.65,4818.0,0.441,-19.3662,-13.563223,36.822325,0.049434,0.111035,1.0,0.0,0.0
50%,10444.0,279.0,167.0,50.0134,7.80007,2025-11-20 08:29:21.230000128,84209.47124,281.111953,349.0,10446.0,1.265,0.0,0.0,207.974454,0.12877,0.24253,1.0,0.0,5.0
75%,15959.5,347.0,258.0,51.660975,10.789635,2025-11-20 08:35:42.926500096,188481.772478,323.434524,399.0,15954.0,3.1,20.63558,16.808523,780.272614,0.282894,0.493615,1.0,0.034907,24.0
max,215033.0,729.0,360.0,70.33962,175.73698,2025-11-20 08:42:17.264000,381071.591365,516.854369,536.0,215033.0,1427.761,1134800.0,73.468937,3274.905575,1.201755,12.859335,1.0,3.141593,737.0
std,10026.442881,119.926912,102.749224,12.29369,11.2638,,100612.404316,109.204037,123.197299,10023.093724,11.714419,7336.775,18.712085,775.522787,0.209746,0.40027,0.0,0.365431,81.92438


In [19]:
df_parsed.head()

Unnamed: 0,icao,registration,altitude,velocity,heading,lat,lon,timestamp,median_haversine_distance,mean_velocity,...,alt_shift,dt,climb_rate,median_climb_rate,circular_heading_variance_deg,std_lat,std_lon,linearity_index,turn_angle_rad,sharp_turns
0,A2A3B6,ICAA2A3B6,4819.0,202.0,250.0,50.11797,8.83735,2025-11-20 08:15:55.243,19741.101504,178.578571,...,,,,-16.853933,0.053341,0.019609,0.08247,1.0,0.0,0
1,043EA0,FLR043EA0,820.0,0.0,0.0,50.0,20.0,2025-11-20 08:15:55.245,0.0,0.0,...,,,,0.0,0.0,0.0,0.79446,,0.0,0
2,45AC4F,OY-KBO,15651.0,330.0,212.0,51.56957,4.85748,2025-11-20 08:15:55.246,76743.920583,268.618932,...,,,,-17.817372,386.338559,0.20699,0.05838,1.0,0.0,0
3,440C9B,OE-LFQ,14471.0,334.0,109.0,49.13595,10.37468,2025-11-20 08:15:55.248,105731.803685,262.785276,...,,,,-10.472858,628.089674,0.095454,0.489788,1.0,0.0,31
4,4ACA73,ICA4ACA73,2677.0,142.0,16.0,60.23327,11.10172,2025-11-20 08:15:55.249,47889.762084,273.517007,...,,,,39.112975,18.478528,0.128406,0.010469,1.0,0.0,4


In [20]:
# Liste des colonnes finales à garder
columns_to_keep = [
    'icao', 'registration', 'altitude', 'velocity', 'heading', 'lat', 'lon', 'timestamp',
    'median_haversine_distance', 'mean_velocity', 'max_velocity',
    'median_climb_rate', 'circular_heading_variance_deg',
    'std_lat', 'std_lon', 'linearity_index', 'sharp_turns'
]

# Nettoyage du DataFrame
df_prepared = df_parsed[columns_to_keep]
df_prepared.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 472619 entries, 0 to 472618
Data columns (total 17 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   icao                           472619 non-null  object        
 1   registration                   472619 non-null  object        
 2   altitude                       472619 non-null  float64       
 3   velocity                       472619 non-null  float64       
 4   heading                        472619 non-null  float64       
 5   lat                            472619 non-null  float64       
 6   lon                            472619 non-null  float64       
 7   timestamp                      472619 non-null  datetime64[ns]
 8   median_haversine_distance      472619 non-null  float64       
 9   mean_velocity                  472619 non-null  float64       
 10  max_velocity                   472619 non-null  float64       
 11  

## On sélectionne maintenant un avion en particulier

On trie ensuite par timestamp pour obtenir les messages triés en fonction du temps (du plus vieux au plus récent)

In [21]:
"""
# On teste pour plusieurs avions au cas où on observe des erreurs.
icao_sample = df_parsed['icao'].value_counts().index[0]
icao_sample2 = df_parsed['icao'].value_counts().index[1]
icao_sample3 = df_parsed['icao'].value_counts().index[2]
icao_sample4 = df_parsed['icao'].value_counts().index[3]
icao_sample5 = df_parsed['icao'].value_counts().index[4]
icao_sample6 = df_parsed['icao'].value_counts().index[5]
icao_sample7 = df_parsed['icao'].value_counts().index[6]
icao_sample8 = df_parsed['icao'].value_counts().index[7]
icao_sample9 = df_parsed['icao'].value_counts().index[8]
icao_sample10 = df_parsed['icao'].value_counts().index[9]
print(icao_sample)
print(icao_sample2)
print(icao_sample3)
print(icao_sample4)
print(icao_sample5)
flight = df_parsed[df_parsed['icao'] == icao_sample2].sort_values('timestamp')
flight.info()
"""

"\n# On teste pour plusieurs avions au cas où on observe des erreurs.\nicao_sample = df_parsed['icao'].value_counts().index[0]\nicao_sample2 = df_parsed['icao'].value_counts().index[1]\nicao_sample3 = df_parsed['icao'].value_counts().index[2]\nicao_sample4 = df_parsed['icao'].value_counts().index[3]\nicao_sample5 = df_parsed['icao'].value_counts().index[4]\nicao_sample6 = df_parsed['icao'].value_counts().index[5]\nicao_sample7 = df_parsed['icao'].value_counts().index[6]\nicao_sample8 = df_parsed['icao'].value_counts().index[7]\nicao_sample9 = df_parsed['icao'].value_counts().index[8]\nicao_sample10 = df_parsed['icao'].value_counts().index[9]\nprint(icao_sample)\nprint(icao_sample2)\nprint(icao_sample3)\nprint(icao_sample4)\nprint(icao_sample5)\nflight = df_parsed[df_parsed['icao'] == icao_sample2].sort_values('timestamp')\nflight.info()\n"

On tâche de nettoyer un peu le dataset sur les valeurs manquantes

In [22]:
#flight = flight.drop(columns=[col for col in flight.columns if col.startswith("prev_")])
#flight.info()