# Analyse Exploratoire des Données (EDA) - Trafic Maritime

In [1]:
import pandas as pd

df = pd.read_csv('../data/raw/ais_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,mmsi,navigationalstatus,sog,cog,heading,shiptype,width,length,draught
0,0,219019621,Unknown value,0.0,86.0,86.0,Fishing,4.0,9.0,
1,1,265628170,Unknown value,0.0,334.5,,Port tender,8.0,27.0,
2,2,219005719,Unknown value,0.0,208.7,,Fishing,4.0,11.0,
3,3,219028066,Unknown value,0.0,,,Pleasure,3.0,12.0,
4,4,212584000,Moored,0.0,153.0,106.0,Cargo,13.0,99.0,6.3


## Informations sur le jeu de données

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358351 entries, 0 to 358350
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Unnamed: 0          358351 non-null  int64  
 1   mmsi                358351 non-null  int64  
 2   navigationalstatus  358351 non-null  object 
 3   sog                 357893 non-null  float64
 4   cog                 355182 non-null  float64
 5   heading             337737 non-null  float64
 6   shiptype            358351 non-null  object 
 7   width               354640 non-null  float64
 8   length              354608 non-null  float64
 9   draught             332808 non-null  float64
dtypes: float64(6), int64(2), object(2)
memory usage: 27.3+ MB


## Statistiques descriptives

In [3]:
df.describe()

Unnamed: 0.1,Unnamed: 0,mmsi,sog,cog,heading,width,length,draught
count,358351.0,358351.0,357893.0,355182.0,337737.0,354640.0,354608.0,332808.0
mean,186757.775285,293967800.0,12.122554,189.064529,190.076829,19.947854,124.971549,6.571402
std,112181.60187,121386600.0,9.355851,107.588825,107.107604,10.808627,71.268183,2.934392
min,0.0,9112856.0,0.0,0.0,0.0,1.0,2.0,0.4
25%,89587.5,219578000.0,9.2,116.3,120.0,12.0,83.0,4.6
50%,179947.0,248659000.0,11.3,168.7,170.0,17.0,115.0,6.1
75%,283503.5,304665000.0,13.3,300.175,303.0,28.0,181.0,7.9
max,387581.0,992195000.0,214.0,359.9,507.0,78.0,690.0,25.5


## Vérification des valeurs manquantes

In [4]:
df.isnull().sum()

Unnamed: 0                0
mmsi                      0
navigationalstatus        0
sog                     458
cog                    3169
heading               20614
shiptype                  0
width                  3711
length                 3743
draught               25543
dtype: int64

## Mappage des types de navires

In [5]:
def map_vessel_type(vessel_type):
    if 20 <= vessel_type <= 29: return 'Wing in ground'
    if vessel_type == 30: return 'Fishing'
    if 31 <= vessel_type <= 32: return 'Towing'
    if vessel_type == 33: return 'Dredging or underwater ops'
    if vessel_type == 34: return 'Diving ops'
    if vessel_type == 35: return 'Military ops'
    if vessel_type == 36: return 'Sailing'
    if vessel_type == 37: return 'Pleasure Craft'
    if 40 <= vessel_type <= 49: return 'High speed craft'
    if vessel_type == 50: return 'Pilot Vessel'
    if vessel_type == 51: return 'Search and Rescue'
    if vessel_type == 52: return 'Tug'
    if vessel_type == 53: return 'Port Tender'
    if vessel_type == 54: return 'Anti-pollution'
    if vessel_type == 55: return 'Law Enforcement'
    if vessel_type == 58: return 'Medical Transport'
    if 60 <= vessel_type <= 69: return 'Passenger'
    if 70 <= vessel_type <= 79: return 'Cargo'
    if 80 <= vessel_type <= 89: return 'Tanker'
    if 90 <= vessel_type <= 99: return 'Other'
    return 'Not available'

df['VesselType_Desc'] = df['VesselType'].apply(map_vessel_type)
df.head()

KeyError: 'VesselType'

## Visualisations

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')



### Distribution des types de navires

In [None]:
plt.figure(figsize=(12, 8))
sns.countplot(y='VesselType_Desc', data=df, order = df['VesselType_Desc'].value_counts().index)
plt.title('Distribution des types de navires')
plt.show()

### Distribution de la vitesse des navires (SOG)

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df['SOG'], bins=30, kde=True)
plt.title('Distribution de la vitesse des navires (SOG)')
plt.show()

### Carte des positions des navires

In [None]:
plt.figure(figsize=(10, 8))
sns.scatterplot(x='LON', y='LAT', data=df, hue='VesselType_Desc', s=10)
plt.title('Carte des positions des navires')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

# Nettoyage des données et Feature Engineering

## Gestion des valeurs manquantes

In [None]:
# Stratégie simple : suppression des lignes avec des valeurs manquantes
df.dropna(inplace=True)
df.isnull().sum()

## Gestion des outliers

In [None]:
# On filtre les vitesses irréalistes
df = df[df['SOG'] <= 40]

## Conversion du Timestamp

In [None]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

## Création de caractéristiques temporelles

In [None]:
df['hour'] = df['Timestamp'].dt.hour
df['day_of_week'] = df['Timestamp'].dt.dayofweek
df.head()

# Modélisation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

## Préparation des données pour la modélisation

In [None]:
# Sélection des caractéristiques et de la cible
features = ['SOG', 'COG', 'LON', 'LAT', 'hour', 'day_of_week']
target = 'VesselType_Desc'

X = df[features]
y = df[target]

# Division en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Entraînement du modèle

In [None]:
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

## Évaluation du modèle

In [None]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}
')

print(classification_report(y_test, y_pred))