In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# 1. Charger les données
df = pd.read_csv("../vessel-total-clean-final.csv", na_values=["", "NA", "\\N", "na", "n"])

df = df.sort_values(by='mmsi')
# 2. Sélection des colonnes utiles
features = ['sog', 'cog', 'heading', 'length', 'width', 'draft']
df = df[features + ['vessel_type']].dropna()

# 3. Encodage de la variable cible si elle est catégorielle
if df['vessel_type'].dtype == object:
    le = LabelEncoder()
    df['vessel_type'] = le.fit_transform(df['vessel_type'])

# 4. Séparer X et y
X = df[features]
y = df['vessel_type']

# 5. Diviser en ensembles d'entraînement et de test stratifiés
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # très important pour préserver la distribution
)

# 6. Prétraitement
num_cols = features
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols)
])

# 7. Pipeline avec RandomForest
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# 8. GridSearchCV pour tuning
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [5, 10, 20]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,  
    scoring='accuracy',
    n_jobs=-1
)
grid.fit(X_train, y_train)

# 9. Évaluation sur l'ensemble de test
y_pred = grid.predict(X_test)
print("Best params:", grid.best_params_)
print("Classification report :")
print(classification_report(y_test, y_pred))

# 10. Sauvegarder le modèle
joblib.dump(grid.best_estimator_, "model_vessel_type.pkl")


Best params: {'clf__max_depth': 20, 'clf__n_estimators': 100}
Classification report :
              precision    recall  f1-score   support

          60       1.00      1.00      1.00      6762
          61       1.00      1.00      1.00       368
          70       1.00      1.00      1.00     17501
          71       1.00      1.00      1.00      1301
          74       1.00      1.00      1.00       318
          79       1.00      1.00      1.00      1109
          80       1.00      1.00      1.00     18514
          82       1.00      1.00      1.00       326
          84       1.00      1.00      1.00       298
          89       1.00      1.00      1.00      1361

    accuracy                           1.00     47858
   macro avg       1.00      1.00      1.00     47858
weighted avg       1.00      1.00      1.00     47858



['model_vessel_type.pkl']

In [None]:
for feature in features:
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    Xf = df[[feature]]
    X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(Xf, y, stratify=y, random_state=42)
    clf = RandomForestClassifier().fit(X_train_f, y_train_f)
    acc = clf.score(X_test_f, y_test_f)
    print(f"Accuracy avec '{feature}' seul : {acc:.4f}")


Accuracy avec 'sog' seul : 0.4394
Accuracy avec 'cog' seul : 0.4299
Accuracy avec 'heading' seul : 0.6039
Accuracy avec 'length' seul : 0.8834
Accuracy avec 'width' seul : 0.6923
Accuracy avec 'draft' seul : 0.8861


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# 1. Charger les données
df = pd.read_csv("../vessel-total-clean-final.csv", na_values=["", "NA", "\\N", "na", "n"])

# 2. Trier les données par MMSI
df = df.sort_values(by='mmsi')

# 3. Sélection des colonnes utiles
features = ['sog', 'cog', 'heading', 'length', 'width', 'draft']
df = df[features + ['vessel_type']].dropna()

# 4. Séparer X et y
X = df[features]
y = df['vessel_type']

# 5. Diviser en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Prétraitement
num_cols = ['sog', 'cog', 'heading', 'length', 'width', 'draft']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols)
])

# 7. Pipeline avec régression logistique
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', LogisticRegression(random_state=42, max_iter=1000))
])

# 8. GridSearchCV pour tuning
param_grid = {
    'clf__C': [0.1, 1, 10, 100],  # Régularisation inverse (1/C contrôle la pénalisation)
    'clf__penalty': ['l2']  # L2 pour la régularisation de Ridge (standard pour la régression logistique)
}

grid = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

# 9. Évaluation sur l'ensemble de test
print("Best params:", grid.best_params_)
print("Classification report sur l'ensemble de test :")
print(classification_report(y_test, grid.predict(X_test)))

# 10. Sauvegarder le modèle
joblib.dump(grid.best_estimator_, "model_vessel_type_logistic.pkl")


Best params: {'clf__C': 0.1, 'clf__penalty': 'l2'}
Classification report sur l'ensemble de test :
              precision    recall  f1-score   support

          60       0.72      0.92      0.81      6672
          61       0.00      0.00      0.00       389
          70       0.58      0.50      0.54     17492
          71       0.45      0.34      0.39      1255
          74       0.00      0.00      0.00       334
          79       0.00      0.00      0.00      1151
          80       0.54      0.66      0.59     18520
          82       0.00      0.00      0.00       336
          84       0.00      0.00      0.00       277
          89       0.32      0.13      0.19      1432

    accuracy                           0.58     47858
   macro avg       0.26      0.26      0.25     47858
weighted avg       0.54      0.58      0.56     47858



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['model_vessel_type_logistic.pkl']

In [20]:
model_random_forest = joblib.load("model_vessel_type.pkl")

In [22]:
model_logistic = joblib.load("model_vessel_type_logistic.pkl")

In [None]:
import pandas as pd

# Exemple de nouvelles données
new_data = pd.DataFrame({
    'sog': [12.0],
    'cog': [180.0],
    'heading': [190.0],
    'length': [150.0],
    'width': [30.0],
    'draft': [8.0]
})

# Utiliser le modèle directement pour faire une prédiction
prediction = model_random_forest.predict(new_data)
print("Prédiction :", prediction)


Prédiction : [70]


In [None]:
# Exemple de nouvelles données
new_data = pd.DataFrame({
    'sog': [12.0],
    'cog': [180.0],
    'heading': [190.0],
    'length': [150.0],
    'width': [30.0],
    'draft': [8.0]
})

# Utiliser le modèle directement pour faire une prédiction
prediction = model_logistic.predict(new_data)
print("Prédiction :", prediction)


Prédiction : [80]
