In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# 1. Charger les données
df = pd.read_csv("../vessel-total-clean-final.csv", na_values=["", "NA", "\\N", "na", "n"])

df = df.sort_values(by='mmsi')
# 2. Sélection des colonnes utiles
features = ['sog', 'cog', 'heading', 'length', 'width', 'draft']
df = df[features + ['vessel_type']].dropna()

# 3. Séparer X et y
X = df[features]
y = df['vessel_type']

# 4. Diviser en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Prétraitement
num_cols = ['sog', 'cog', 'heading', 'length', 'width', 'draft']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols)
])

# 6. Pipeline avec RandomForest
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# 7. GridSearchCV pour tuning
param_grid = {
    'clf__n_estimators': [100],
    'clf__max_depth': [10, 20, None]
}

grid = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

# 8. Évaluation sur l'ensemble de test
print("Best params:", grid.best_params_)
print("Classification report sur l'ensemble de test :")
print(classification_report(y_test, grid.predict(X_test)))

# 9. Sauvegarder le modèle
joblib.dump(grid.best_estimator_, "model_vessel_type.pkl")
 

              id       mmsi       base_date_time       lat       lon   sog  \
16221    1066897  205776000  2023-05-29 03:11:21  29.46829 -89.67173  12.5   
197785  16952869  205776000  2023-05-25 13:34:27  25.95846 -97.37879   0.0   
240380  20893558  205776000  2023-05-27 10:44:50  26.31137 -94.72570  12.5   
197824  16958321  205776000  2023-05-25 20:46:27  25.95845 -97.37879   0.0   
73757    5638428  205776000  2023-05-29 17:11:39  29.83829 -89.99256   0.2   
...          ...        ...                  ...       ...       ...   ...   
57645    4137266  671830000  2023-05-29 12:59:35  25.79636 -80.24541   0.0   
57643    4137239  671830000  2023-05-29 10:56:35  25.79634 -80.24539   0.0   
172185  14519096  671830000  2023-05-25 17:39:30  25.79637 -80.24541   0.0   
58856    4242120  671830000  2023-05-29 13:10:36  25.79637 -80.24541   0.0   
63041    4614993  671830000  2023-05-29 12:20:37  25.79635 -80.24542   0.0   

          cog  heading vessel_name         imo call_sign  vesse

['model_vessel_type.pkl']

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# 1. Charger les données
df = pd.read_csv("../vessel-total-clean-final.csv", na_values=["", "NA", "\\N", "na", "n"])

# 2. Trier les données par MMSI
df = df.sort_values(by='mmsi')

# 3. Sélection des colonnes utiles
features = ['sog', 'cog', 'heading', 'length', 'width', 'draft']
df = df[features + ['vessel_type']].dropna()

# 4. Séparer X et y
X = df[features]
y = df['vessel_type']

# 5. Diviser en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Prétraitement
num_cols = ['sog', 'cog', 'heading', 'length', 'width', 'draft']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols)
])

# 7. Pipeline avec régression logistique
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', LogisticRegression(random_state=42, max_iter=1000))
])

# 8. GridSearchCV pour tuning
param_grid = {
    'clf__C': [0.1, 1, 10, 100],  # Régularisation inverse (1/C contrôle la pénalisation)
    'clf__penalty': ['l2']  # L2 pour la régularisation de Ridge (standard pour la régression logistique)
}

grid = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

# 9. Évaluation sur l'ensemble de test
print("Best params:", grid.best_params_)
print("Classification report sur l'ensemble de test :")
print(classification_report(y_test, grid.predict(X_test)))

# 10. Sauvegarder le modèle
joblib.dump(grid.best_estimator_, "model_vessel_type_logistic.pkl")


Best params: {'clf__C': 0.1, 'clf__penalty': 'l2'}
Classification report sur l'ensemble de test :
              precision    recall  f1-score   support

          60       0.72      0.92      0.81      6672
          61       0.00      0.00      0.00       389
          70       0.58      0.50      0.54     17492
          71       0.45      0.34      0.39      1255
          74       0.00      0.00      0.00       334
          79       0.00      0.00      0.00      1151
          80       0.54      0.66      0.59     18520
          82       0.00      0.00      0.00       336
          84       0.00      0.00      0.00       277
          89       0.32      0.13      0.19      1432

    accuracy                           0.58     47858
   macro avg       0.26      0.26      0.25     47858
weighted avg       0.54      0.58      0.56     47858



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['model_vessel_type_logistic.pkl']

In [20]:
model_random_forest = joblib.load("model_vessel_type.pkl")

In [22]:
model_logistic = joblib.load("model_vessel_type_logistic.pkl")

In [None]:
import pandas as pd

# Exemple de nouvelles données
new_data = pd.DataFrame({
    'sog': [12.0],
    'cog': [180.0],
    'heading': [190.0],
    'length': [150.0],
    'width': [30.0],
    'draft': [8.0]
})

# Utiliser le modèle directement pour faire une prédiction
prediction = model_random_forest.predict(new_data)
print("Prédiction :", prediction)


Prédiction : [70]


In [None]:
# Exemple de nouvelles données
new_data = pd.DataFrame({
    'sog': [12.0],
    'cog': [180.0],
    'heading': [190.0],
    'length': [150.0],
    'width': [30.0],
    'draft': [8.0]
})

# Utiliser le modèle directement pour faire une prédiction
prediction = model_logistic.predict(new_data)
print("Prédiction :", prediction)


Prédiction : [80]
