In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

data = pd.read_csv(r"After_Sort.csv", parse_dates=["BaseDateTime"])


# Colonnes utiles
data_tri = data[['MMSI', 'BaseDateTime', 'LAT', 'LON', 'SOG', 'COG', 'Heading']]
# Conversion de la colonne BaseDateTime en format datetime
data_tri['BaseDateTime'] = pd.to_datetime(data_tri['BaseDateTime'], format='mixed', errors='coerce')

# Tri par navire et date
data_tri = data_tri.sort_values(by=['MMSI', 'BaseDateTime']).reset_index(drop=True)

data_tri.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_tri['BaseDateTime'] = pd.to_datetime(data_tri['BaseDateTime'], format='mixed', errors='coerce')


Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading
0,205776000,2023-05-25 00:07:27,25.95847,-97.37876,0.0,77.2,249.0
1,205776000,2023-05-25 00:13:27,25.95846,-97.3788,0.0,77.2,249.0
2,205776000,2023-05-25 00:16:27,25.95847,-97.3788,0.0,77.2,249.0
3,205776000,2023-05-25 00:31:27,25.95848,-97.37879,0.0,77.2,249.0
4,205776000,2023-05-25 00:34:27,25.95846,-97.37882,0.0,77.2,249.0


In [5]:
def create_lag(bateau, lag=2):
    # Pour chaque bateau (groupe de données), on crée des variables "lag" (décalées dans le temps)
    for i in range(1, lag + 1):
        # Décalage des valeurs LAT, LON, SOG, COG, Heading de i pas en arrière (t-i)
        bateau[f'lat_t-{i}'] = bateau['LAT'].shift(i)
        bateau[f'lon_t-{i}'] = bateau['LON'].shift(i)
        bateau[f'sog_t-{i}'] = bateau['SOG'].shift(i)
        bateau[f'cog_t-{i}'] = bateau['COG'].shift(i)
        bateau[f'heading_t-{i}'] = bateau['Heading'].shift(i)

    # Création des cibles à prédire : positions LAT et LON dans le futur
    # Décalage vers l’avant de 4 pas (5 minutes plus tard, si échantillonnage 1min)
    bateau['ciblelat_5'] = bateau['LAT'].shift(-4)
    bateau['ciblelon_5'] = bateau['LON'].shift(-4)

    # Décalage vers l’avant de 9 pas (10 minutes plus tard)
    bateau['ciblelat_10'] = bateau['LAT'].shift(-9)
    bateau['ciblelon_10'] = bateau['LON'].shift(-9)

    # Décalage vers l’avant de 14 pas (15 minutes plus tard)
    bateau['ciblelat_15'] = bateau['LAT'].shift(-14)
    bateau['ciblelon_15'] = bateau['LON'].shift(-14)

    return bateau  # Retourne le DataFrame enrichi des variables retardées et des cibles futures


In [6]:
# Application de la fonction create_lag sur chaque groupe identifié par MMSI (chaque bateau)
# .groupby('MMSI') traite bateau par bateau
# .apply(create_lag) applique la fonction à chaque groupe
# .dropna() supprime les lignes avec des valeurs manquantes (créées par shift)
# .reset_index(drop=True) remet un index propre et continu
data = data.groupby('MMSI').apply(create_lag).dropna().reset_index(drop=True)

  data = data.groupby('MMSI').apply(create_lag).dropna().reset_index(drop=True)


In [7]:
bateau_lags = data  # On stocke le résultat dans bateau_lags

# On s’assure encore une fois qu’il n’y ait pas de valeurs manquantes, puis on réinitialise l’index
bateau_lags = bateau_lags.dropna().reset_index(drop=True)

bateau_lags.head(5)  # Affiche les 5 premières lignes pour vérifier le résultat

Unnamed: 0,id,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,...,lon_t-2,sog_t-2,cog_t-2,heading_t-2,ciblelat_5,ciblelon_5,ciblelat_10,ciblelon_10,ciblelat_15,ciblelon_15
0,22886,205776000,2023-05-29 00:03:30,29.07501,-89.29289,12.2,42.6,43.0,LA LUNA,IMO9846079,...,-89.30204,12.1,50.5,47.0,29.08416,-89.28433,29.10208,-89.27389,29.13688,-89.25611
1,23704,205776000,2023-05-29 00:04:41,29.07796,-89.28988,12.2,42.6,42.0,LA LUNA,IMO9846079,...,-89.29583,12.2,42.2,42.0,29.08766,-89.28203,29.11243,-89.26822,29.1408,-89.25527
2,32388,205776000,2023-05-29 00:05:51,29.08094,-89.28694,12.1,36.1,35.0,LA LUNA,IMO9846079,...,-89.29289,12.2,42.6,43.0,29.09125,-89.27998,29.11607,-89.26623,29.13335,-89.2576
3,37801,205776000,2023-05-29 00:01:11,29.06912,-89.29884,11.8,42.6,42.0,LA LUNA,IMO9846079,...,-89.28988,12.2,42.6,42.0,29.09489,-89.27796,29.12721,-89.26069,29.11982,-89.26421
4,42158,205776000,2023-05-29 00:07:00,29.08416,-89.28433,12.4,34.4,31.0,LA LUNA,IMO9846079,...,-89.28694,12.1,36.1,35.0,29.09843,-89.27595,29.12353,-89.26242,29.14466,-89.25519


In [8]:
colonnes_utiles = [
    'LAT', 'LON', 'SOG', 'COG', 'Heading',
    'lat_t-1', 'lon_t-1', 'sog_t-1', 'cog_t-1', 'heading_t-1',
    'lat_t-2', 'lon_t-2', 'sog_t-2', 'cog_t-2', 'heading_t-2',
    'ciblelat_5', 'ciblelon_5',
    'ciblelat_10', 'ciblelon_10',
    'ciblelat_15', 'ciblelon_15'
]

# On élimine toutes les lignes du DataFrame `bateau_lags` qui ont des valeurs manquantes
# dans les colonnes listées dans `colonnes_utiles`, afin d'obtenir un dataset propre
# pour l'entraînement ou l'évaluation.
bateau_clean = bateau_lags.dropna(subset=colonnes_utiles)

# Affiche les 5 premières lignes du DataFrame nettoyé pour vérifier le résultat.
bateau_clean.head(5)

Unnamed: 0,id,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,...,lon_t-2,sog_t-2,cog_t-2,heading_t-2,ciblelat_5,ciblelon_5,ciblelat_10,ciblelon_10,ciblelat_15,ciblelon_15
0,22886,205776000,2023-05-29 00:03:30,29.07501,-89.29289,12.2,42.6,43.0,LA LUNA,IMO9846079,...,-89.30204,12.1,50.5,47.0,29.08416,-89.28433,29.10208,-89.27389,29.13688,-89.25611
1,23704,205776000,2023-05-29 00:04:41,29.07796,-89.28988,12.2,42.6,42.0,LA LUNA,IMO9846079,...,-89.29583,12.2,42.2,42.0,29.08766,-89.28203,29.11243,-89.26822,29.1408,-89.25527
2,32388,205776000,2023-05-29 00:05:51,29.08094,-89.28694,12.1,36.1,35.0,LA LUNA,IMO9846079,...,-89.29289,12.2,42.6,43.0,29.09125,-89.27998,29.11607,-89.26623,29.13335,-89.2576
3,37801,205776000,2023-05-29 00:01:11,29.06912,-89.29884,11.8,42.6,42.0,LA LUNA,IMO9846079,...,-89.28988,12.2,42.6,42.0,29.09489,-89.27796,29.12721,-89.26069,29.11982,-89.26421
4,42158,205776000,2023-05-29 00:07:00,29.08416,-89.28433,12.4,34.4,31.0,LA LUNA,IMO9846079,...,-89.28694,12.1,36.1,35.0,29.09843,-89.27595,29.12353,-89.26242,29.14466,-89.25519


In [9]:
# Liste des colonnes utilisées comme variables explicatives (features)
features = [
    'LAT', 'LON', 'SOG', 'COG', 'Heading',
    'lat_t-1', 'lon_t-1', 'sog_t-1', 'cog_t-1', 'heading_t-1',
    'lat_t-2', 'lon_t-2', 'sog_t-2', 'cog_t-2', 'heading_t-2'
]

# Cibles à prédire à 5, 10 et 15 minutes
target5 = ['ciblelat_5', 'ciblelon_5']
target10 = ['ciblelat_10', 'ciblelon_10']
target15 = ['ciblelat_15', 'ciblelon_15']

# Séparation des données en variables explicatives (x) et cibles (y) pour chaque horizon
x = bateau_clean[features]
y5 = bateau_clean[target5]
y10 = bateau_clean[target10]
y15 = bateau_clean[target15]

# Découpage en jeu d'entraînement et de test pour chaque horizon
# Ici, la même division est répétée pour chaque cible, ce qui peut poser problème
# car X_train, X_test sont écrasés à chaque fois.
X_train, X_test, y_train_5, y_test_5 = train_test_split(x, y5, test_size=0.2, random_state=42)
X_train, X_test, y_train_10, y_test_10 = train_test_split(x, y10, test_size=0.2, random_state=42)
X_train, X_test, y_train_15, y_test_15 = train_test_split(x, y15, test_size=0.2, random_state=42)


In [10]:
# Création et entraînement de trois modèles RandomForest pour les horizons 5, 10 et 15 minutes
model5 = RandomForestRegressor(n_estimators=10, random_state=42)
model5.fit(X_train, y_train_5)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
model10 = RandomForestRegressor(n_estimators=10, random_state=42)
model10.fit(X_train, y_train_10)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
model15 = RandomForestRegressor(n_estimators=10, random_state=42)
model15.fit(X_train, y_train_15)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
y_pred_5 = model5.predict(X_test)

In [15]:
# Prédictions avec chaque modèle
y_pred_10 = model10.predict(X_test)  
y_pred_15 = model15.predict(X_test)

In [16]:
# Calcul des MSE et RMSE pour chaque horizon
mse5 = mean_squared_error(y_test_5, y_pred_5)
rmse5 = np.sqrt(mse5)
joblib.dump(model5, 'model5.pkl')  # Sauvegarde du modèle pour h=5

mse10 = mean_squared_error(y_test_10, y_pred_10)
rmse10 = np.sqrt(mse10)
joblib.dump(model10, 'model10.pkl')  # Sauvegarde du modèle pour h=10

mse15 = mean_squared_error(y_test_15, y_pred_15)
rmse15 = np.sqrt(mse15)
joblib.dump(model15, 'model15.pkl')  # Sauvegarde du modèle pour h=15

# Sauvegarde des données de test pour évaluation future
X_test.to_csv('X_test.csv', index=False)
y_test_5.to_csv('y_test_5.csv', index=False)
y_test_10.to_csv('y_test_10.csv', index=False)
y_test_15.to_csv('y_test_15.csv', index=False)