<a href="https://colab.research.google.com/github/MGentieu/Data_analytics_aircraft_messages/blob/main/TP3/GENTIEU_Martin_TP2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GENTIEU Martin et GOUESSE Sixtine : TP3 de data analytics


## Préparation des données utiles pour le TP3 :

### On se place dans un premier temps dans le répertoire parent

In [1]:
import os
import sys
from pathlib import Path

# Resolve the directory the notebook is running from.
cwd = Path.cwd().resolve()

# When the working directory is named "content" (the Colab default), look for
# the cloned repository: any sub-directory whose name contains "aircraft".
if cwd.name == "content":
    # Sorted so the chosen folder does not depend on filesystem listing order.
    candidates = sorted(
        d for d in cwd.iterdir() if d.is_dir() and "aircraft" in d.name.lower()
    )

    if not candidates:
        raise FileNotFoundError(
            f"Aucun dossier contenant 'aircraft' trouvé dans /content.\n"
            f"Dossiers présents : {os.listdir(cwd)}"
        )

    repo_root = candidates[0]
    PROJECT_ROOT = repo_root / "TP3"
    DATA_ROOT = repo_root / "TP2"
else:
    # Anywhere else, the current directory serves as both project and data root.
    PROJECT_ROOT = cwd
    DATA_ROOT = cwd

print(f"Detected project root: {PROJECT_ROOT}")
print(f"Detected data root: {DATA_ROOT}")

# Fail early if either directory is missing: the data root is globbed for the
# capture file right after this cell.
for required_dir in (PROJECT_ROOT, DATA_ROOT):
    if not required_dir.exists():
        raise FileNotFoundError(f"Missing directory at {required_dir}")

# Make project-local modules importable.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("Setup complete.")


Detected project root: /content/Data_analytics_aircraft_messages/TP3
Detected data root: /content/Data_analytics_aircraft_messages/TP2
Setup complete.


On importe ensuite les bibliothèques nécessaires à l'analyse :

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import folium
import shapely
import socket
import glob
import csv
from datetime import datetime

In [5]:
# Host and port of a feed server; neither is used by the code in this cell.
# NOTE(review): presumably the live SBS-1 source the capture came from — confirm.
HOST = "sbs.glidernet.org"
PORT = 30003

# Field names of one comma-separated message, in order.
cols = [
    "MessageType", "TransmissionType", "SessionID", "AircraftID", "HexIdent", "FlightID",
    "DateGenerated", "TimeGenerated", "DateLogged", "TimeLogged", "Callsign", "Altitude",
    "GroundSpeed", "Track", "Latitude", "Longitude", "VerticalRate", "Squawk", "Alert",
    "Emergency", "SPI", "IsOnGround"
]

# Look for an existing capture named adsb_data_*.csv under DATA_ROOT.
existing_files = glob.glob(f"{DATA_ROOT}/adsb_data_*.csv")
if not existing_files:
    # Stop here with a clear message instead of handing an empty path to
    # pd.read_csv in the next cell.
    raise FileNotFoundError(f"No adsb_data_*.csv file found in {DATA_ROOT}")

# If several captures exist, keep the most recently modified one.
output_file = max(existing_files, key=os.path.getmtime)
print(f"Fichier existant trouvé : {output_file}")

Fichier existant trouvé : /content/Data_analytics_aircraft_messages/TP2/adsb_data_20251120_091555.csv


### Récupération des données utiles pour un avion (en utilisant l'identifiant ICAO)

In [6]:
# SBS-1 field names, in the order they appear inside a message.
colnames = [
    "MessageType", "TransmissionType", "SessionID", "AircraftID", "HexIdent", "FlightID",
    "DateGenerated", "TimeGenerated", "DateLogged", "TimeLogged", "Callsign", "Altitude",
    "GroundSpeed", "Track", "Latitude", "Longitude", "VerticalRate", "Squawk", "Alert",
    "Emergency", "SPI", "IsOnGround"
]

# The CSV keeps each raw comma-separated message in its "Message" column;
# split it into one column per field.
split_df = pd.read_csv(output_file)["Message"].str.split(",", expand=True)

# Label only as many columns as the split actually produced.
n_fields = split_df.shape[1]
split_df.columns = colnames[:n_fields]

# Work on an independent copy from here on.
df = split_df.copy()

Analyse initiale du dataset :

In [7]:
# Preview the first rows of the split message fields.
df.head()

Unnamed: 0,MessageType,TransmissionType,SessionID,AircraftID,HexIdent,FlightID,DateGenerated,TimeGenerated,DateLogged,TimeLogged,...,GroundSpeed,Track,Latitude,Longitude,VerticalRate,Squawk,Alert,Emergency,SPI,IsOnGround
0,MSG,3,,,A2A3B6,,2025/11/20,08:15:55.243,2025/11/20,08:15:55.243,...,202.0,250.0,50.11797,8.83735,,,0,0,0,0
1,MSG,3,,,043EA0,,2025/11/20,08:15:55.245,2025/11/20,08:15:55.245,...,0.0,0.0,50.0,20.0,,,0,0,0,0
2,MSG,3,,,45AC4F,,2025/11/20,08:15:55.246,2025/11/20,08:15:55.246,...,330.0,212.0,51.56957,4.85748,,,0,0,0,0
3,MSG,3,,,440C9B,,2025/11/20,08:15:55.248,2025/11/20,08:15:55.248,...,334.0,109.0,49.13595,10.37468,,,0,0,0,0
4,MSG,3,,,4ACA73,,2025/11/20,08:15:55.249,2025/11/20,08:15:55.249,...,142.0,16.0,60.23327,11.10172,,,0,0,0,0


In [8]:
# Show each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549068 entries, 0 to 549067
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   MessageType       549068 non-null  object
 1   TransmissionType  549063 non-null  object
 2   SessionID         549045 non-null  object
 3   AircraftID        549031 non-null  object
 4   HexIdent          549011 non-null  object
 5   FlightID          548904 non-null  object
 6   DateGenerated     548889 non-null  object
 7   TimeGenerated     548739 non-null  object
 8   DateLogged        546124 non-null  object
 9   TimeLogged        530150 non-null  object
 10  Callsign          518004 non-null  object
 11  Altitude          516259 non-null  object
 12  GroundSpeed       510525 non-null  object
 13  Track             497907 non-null  object
 14  Latitude          487700 non-null  object
 15  Longitude         482444 non-null  object
 16  VerticalRate      481943 non-null  obj

In [9]:
# The columns are still object-typed strings here, so describe() reports
# count / unique / top / freq rather than numeric statistics.
df.describe()

Unnamed: 0,MessageType,TransmissionType,SessionID,AircraftID,HexIdent,FlightID,DateGenerated,TimeGenerated,DateLogged,TimeLogged,...,GroundSpeed,Track,Latitude,Longitude,VerticalRate,Squawk,Alert,Emergency,SPI,IsOnGround
count,549068,549063,549045.0,549031.0,549011,548904.0,548889,548739,546124,530150.0,...,510525.0,497907.0,487700.0,482444.0,481943.0,481927.0,481916,481897,481875,481862
unique,1,2,1.0,1.0,1878,1.0,11,546995,11,519066.0,...,1504.0,1058.0,310228.0,344745.0,1.0,1.0,2,2,2,2
top,MSG,3,,,043EA0,,2025/11/20,08:,2025/11/20,,...,0.0,0.0,50.0,20.0,,,0,0,0,0
freq,549068,549055,549045.0,549031.0,5205,548904.0,548750,35,531844,1629.0,...,18858.0,14883.0,4623.0,4594.0,481943.0,481927.0,481908,481886,481866,481857


### 1.2. Extraction des colonnes utiles

In [11]:
# Identification fields are carried over unchanged.
df_parsed = pd.DataFrame({
    "icao": df["HexIdent"],
    "registration": df["Callsign"],
})

# Numeric fields: target column -> source column. Unparseable values become NaN.
numeric_sources = {
    "altitude": "Altitude",
    "velocity": "GroundSpeed",
    "heading": "Track",
    "lat": "Latitude",
    "lon": "Longitude",
}
for target, source in numeric_sources.items():
    df_parsed[target] = pd.to_numeric(df[source], errors="coerce")

# Join generation date and time into one string, then parse it;
# values that cannot be parsed become NaT.
df_parsed["timestamp"] = pd.to_datetime(
    df["DateGenerated"] + " " + df["TimeGenerated"], errors="coerce"
)


Unnamed: 0,altitude,velocity,heading,lat,lon,timestamp
count,472210.0,472210.0,472210.0,472210.0,472210.0,472210
mean,12389.935391,264.890803,171.734209,47.301993,6.573087,2025-11-20 08:29:16.480741120
min,0.0,0.0,0.0,-44.67492,-118.89745,2025-11-20 08:15:55.243000
25%,4818.0,189.0,91.0,45.324397,2.12579,2025-11-20 08:22:50.615250176
50%,10446.0,279.0,167.0,50.01366,7.800475,2025-11-20 08:29:21.172000
75%,15961.0,347.0,259.0,51.661195,10.790565,2025-11-20 08:35:42.934500096
max,215033.0,729.0,360.0,70.33962,175.73698,2025-11-20 08:42:17.264000
std,10026.512097,119.888254,102.739173,12.290986,11.261153,


Enfin, avant de construire les différents descripteurs, on va filtrer les datasets pour ne garder que les avions pour lesquels nous avons suffisamment de données (au moins 50 messages) :

In [None]:
# An aircraft must have at least this many messages to be kept.
MIN_MESSAGES_PER_ICAO = 50
required_fields = ["icao", "lat", "lon", "timestamp"]

# Cleaning: drop rows where any essential field is missing.
df_parsed = df_parsed.dropna(subset=required_fields)

# Count messages per aircraft and keep the identifiers that reach the threshold.
icao_counts = df_parsed["icao"].value_counts()
icao_valides = icao_counts.loc[lambda counts: counts >= MIN_MESSAGES_PER_ICAO].index

keep_row = df_parsed["icao"].isin(icao_valides)
df_parsed = df_parsed.loc[keep_row].reset_index(drop=True)
df_parsed.describe()

## 1. Construction des descripteurs :

Calcul de la distance totale parcourue avec la méthode Haversine.

Notre dataset contient beaucoup d'erreurs liées aux sauts de valeurs. Nous calculerons donc la distance parcourue à partir des données suivantes :
- La médiane des latitudes et longitudes des 15 premières lignes
- La médiane des latitudes et longitudes des 15 dernières lignes.

On applique ensuite la méthode Haversine sur les données trouvées.

In [12]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points, using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float or array-like
        Distance in metres on a sphere of radius 6 371 000 m. Array-like
        inputs are handled element-wise; NaN inputs give NaN outputs.
    """
    R = 6371_000  # Earth radius in metres
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make sqrt/arcsin return NaN. Clamp it back.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    return 2 * R * np.arcsin(np.sqrt(a))

# Number of rows at each end of a track whose median position is used.
N_ENDPOINT_ROWS = 15

distances = []

for icao, group in df_parsed.groupby("icao"):
    g = group.sort_values("timestamp")

    # Median position over the first and the last N_ENDPOINT_ROWS fixes.
    # Using medians dampens the value jumps present in the data.
    start = g.head(N_ENDPOINT_ROWS)[["lat", "lon"]].median()
    end = g.tail(N_ENDPOINT_ROWS)[["lat", "lon"]].median()

    # Haversine distance between the two median points.
    total_dist = haversine(start["lat"], start["lon"], end["lat"], end["lon"])

    distances.append({
        "icao": icao,
        "median_haversine_distance": total_dist
    })

# One row per aircraft.
df_dist = pd.DataFrame(distances)

# Drop a column left by an earlier run of this cell before merging, so that
# re-running does not produce "_x"/"_y" suffixed duplicates.
df_parsed = (
    df_parsed
    .drop(columns="median_haversine_distance", errors="ignore")
    .merge(df_dist, on="icao", how="left")
)


Calcul de la vitesse (groundspeed) moyenne et maximale

In [10]:
# Per-aircraft mean and maximum of the velocity column.
stats_velocity = (
    df_parsed.groupby("icao")["velocity"]
    .agg(["mean", "max"])
    .rename(columns={"mean": "mean_velocity", "max": "max_velocity"})
)

Calcul du taux de montée moyen

In [11]:
# Compute previous-sample values on a time-ordered view, so that "previous"
# means the previous message in time for the same aircraft. Results are written
# back by index alignment, which relies on df_parsed having a unique index.
ordered = df_parsed.sort_values(["icao", "timestamp"], kind="stable")
per_aircraft = ordered.groupby("icao", sort=False)

df_parsed["alt_shift"] = per_aircraft["altitude"].shift()
df_parsed["dt"] = per_aircraft["timestamp"].diff().dt.total_seconds()

# Altitude change per second (= climb rate). Messages sharing a timestamp have
# dt == 0 and would give +/-inf or NaN rates that corrupt the per-aircraft mean,
# so only strictly positive intervals are used.
positive_dt = df_parsed["dt"].where(df_parsed["dt"] > 0)
df_parsed["climb_rate"] = (df_parsed["altitude"] - df_parsed["alt_shift"]) / positive_dt

stats_climb = df_parsed.groupby("icao")["climb_rate"].mean().rename("mean_climb_rate")


Calcul de la variance du cap (variance de l'attribut "heading")

In [12]:
# Per-aircraft sample variance of the heading column.
# NOTE(review): heading looks like an angle in degrees (0-360 in the describe()
# output), so a plain variance is inflated for tracks crossing north —
# confirm this is acceptable.
heading_by_icao = df_parsed.groupby("icao")["heading"]
stats_heading = heading_by_icao.var().rename("var_heading")

Dispersion géographique : calcul de l'écart-type des latitudes et longitudes

In [13]:
# Per-aircraft standard deviation of latitude and longitude,
# exposed as the columns std_lat and std_lon.
position_by_icao = df_parsed.groupby("icao")[["lat", "lon"]]
stats_geo = position_by_icao.std().add_prefix("std_")


Calcul de l'indice de linéarité de la trajectoire

In [14]:
# Time-ordered copy, so that "first"/"last" and consecutive segments follow
# each aircraft's track chronologically.
track = df_parsed.sort_values(["icao", "timestamp"], kind="stable")

# Distance between each fix and the previous fix of the same aircraft
# (NaN for the first fix of every aircraft). The aggregation below needs this
# column, which no earlier cell creates.
previous_fix = track.groupby("icao", sort=False)[["lat", "lon"]].shift()
track["segment_distance"] = haversine(
    previous_fix["lat"], previous_fix["lon"], track["lat"], track["lon"]
)
# Expose the column on df_parsed as well (aligned on the index).
df_parsed["segment_distance"] = track["segment_distance"]

endpoints = track.groupby("icao").agg(
    lat_first=("lat", "first"),
    lon_first=("lon", "first"),
    lat_last=("lat", "last"),
    lon_last=("lon", "last"),
    total_distance=("segment_distance", "sum")
)

# Straight-line distance between the first and last fix.
endpoints["straight_distance"] = haversine(
    endpoints["lat_first"], endpoints["lon_first"],
    endpoints["lat_last"], endpoints["lon_last"]
)

# Ratio of straight-line distance to path length; left as NaN when the
# aircraft did not move at all (path length of 0).
path_length = endpoints["total_distance"]
endpoints["linearity_index"] = (
    endpoints["straight_distance"] / path_length.where(path_length > 0)
)


Calcul du nombre de virages dont l'angle dépasse 25°.

Cette mesure est très complexe, car on doit prendre en compte les différentes erreurs présentes dans le dataset ainsi que les écarts de temps qui sont assez petits entre plusieurs mesures.

L'angle d'un virage sera donc calculé comme tel :
- On prend la médiane des latitudes et longitudes des 10 lignes précédentes
- On fait la même chose pour les 10 lignes suivantes.

**On fait ensuite le calcul de l'angle en se basant sur ces valeurs médianes.**

Cela limite les erreurs liées aux sauts de valeurs qu'on trouve de manière récurrente dans le dataset.

In [15]:
# Heading change (in degrees) above which a step counts as a sharp turn.
SHARP_TURN_DEG = 25

# Previous heading of the same aircraft, taken in time order and written back
# by index alignment (relies on df_parsed having a unique index).
ordered = df_parsed.sort_values(["icao", "timestamp"], kind="stable")
df_parsed["heading_shift"] = ordered.groupby("icao", sort=False)["heading"].shift()

# Smallest angular difference between consecutive headings: wrap the raw
# difference into [-180, 180) so that e.g. 359 -> 1 counts as 2, not 358.
raw_delta = df_parsed["heading"] - df_parsed["heading_shift"]
df_parsed["turn_angle"] = ((raw_delta + 180) % 360 - 180).abs()

# Count sharp turns per aircraft. Summing the boolean mask keeps aircraft
# with no sharp turn in the result, with a count of 0.
turns = (
    df_parsed["turn_angle"].gt(SHARP_TURN_DEG)
    .groupby(df_parsed["icao"])
    .sum()
    .rename("sharp_turns")
)


## On sélectionne maintenant un avion en particulier

On trie ensuite par timestamp pour obtenir les messages triés en fonction du temps (du plus vieux au plus récent)

In [None]:
# On teste pour plusieurs avions au cas où on observe des erreurs.
icao_sample = df_parsed['icao'].value_counts().index[0]
icao_sample2 = df_parsed['icao'].value_counts().index[1]
icao_sample3 = df_parsed['icao'].value_counts().index[2]
icao_sample4 = df_parsed['icao'].value_counts().index[3]
icao_sample5 = df_parsed['icao'].value_counts().index[4]
icao_sample6 = df_parsed['icao'].value_counts().index[5]
icao_sample7 = df_parsed['icao'].value_counts().index[6]
icao_sample8 = df_parsed['icao'].value_counts().index[7]
icao_sample9 = df_parsed['icao'].value_counts().index[8]
icao_sample10 = df_parsed['icao'].value_counts().index[9]
print(icao_sample)
print(icao_sample2)
print(icao_sample3)
print(icao_sample4)
print(icao_sample5)
flight = df_parsed[df_parsed['icao'] == icao_sample2].sort_values('timestamp')
flight.info()

043EA0
4D2537
E80421
4CA295
4D251B
<class 'pandas.core.frame.DataFrame'>
Index: 4266 entries, 821 to 395733
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   icao          4266 non-null   object        
 1   registration  4047 non-null   object        
 2   altitude      4025 non-null   float64       
 3   velocity      3958 non-null   float64       
 4   heading       3855 non-null   float64       
 5   lat           3784 non-null   float64       
 6   lon           3767 non-null   float64       
 7   timestamp     4258 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(5), object(2)
memory usage: 300.0+ KB


On tâche de nettoyer un peu le dataset sur les valeurs manquantes

In [None]:
# Remove every column whose name begins with "prev_", then summarise what is left.
prev_mask = flight.columns.str.startswith("prev_")
flight = flight.drop(columns=flight.columns[prev_mask])
flight.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3717 entries, 12412 to 549037
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   icao            3717 non-null   object        
 1   registration    3717 non-null   object        
 2   altitude        3717 non-null   float64       
 3   velocity        3717 non-null   float64       
 4   heading         3717 non-null   float64       
 5   lat             3717 non-null   float64       
 6   lon             3717 non-null   float64       
 7   timestamp       3717 non-null   datetime64[ns]
 8   acceleration    3717 non-null   float64       
 9   heading_var     3717 non-null   float64       
 10  vertical_speed  3717 non-null   float64       
 11  distance        3717 non-null   float64       
dtypes: datetime64[ns](1), float64(9), object(2)
memory usage: 377.5+ KB
