In [114]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Read the raw AIS records.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_csv('C:\\Users\\gaura\\Downloads\\augmented_historical_ais_data.csv')

# Expand the categorical columns into indicator (dummy) columns.
# NOTE(review): none of the resulting indicator columns are selected into X below.
categorical_features = ['PORT OF ORIGIN', 'PORT OF DESTINATION', 'VESSEL TYPE']
data = pd.get_dummies(data, columns=categorical_features)

# Parse TIMESTAMP once, then store it back together with the calendar
# features derived from it. TIMESTAMP itself stays in `data`; it is simply
# not selected as a model feature.
parsed_time = pd.to_datetime(data['TIMESTAMP'])
data = data.assign(
    TIMESTAMP=parsed_time,
    HOUR=parsed_time.dt.hour,
    DAY_OF_WEEK=parsed_time.dt.dayofweek,
    MONTH=parsed_time.dt.month,
)

# Model inputs (course, position, calendar features) and the MMSI label.
feature_columns = ['COURSE', 'LATITUDE', 'LONGITUDE', 'HOUR', 'DAY_OF_WEEK', 'MONTH']
X = data[feature_columns]
y = data['MMSI']

# 80/20 hold-out split, stratified on the label so class proportions are
# kept in both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit a 100-tree random forest (fit() returns the estimator itself).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.8402061855670103
Classification Report:
               precision    recall  f1-score   support

   419000001       0.83      0.85      0.84       291
   419000002       0.85      0.82      0.84       291
   419000003       0.84      0.80      0.82       291
   419000004       0.84      0.89      0.86       291

    accuracy                           0.84      1164
   macro avg       0.84      0.84      0.84      1164
weighted avg       0.84      0.84      0.84      1164



Try different train/test split ratios.

In [100]:

# Select the model inputs and the label from the prepared frame.
# NOTE(review): `data` is not created in this cell; it must already exist in the kernel.
feature_columns = ['COURSE', 'LATITUDE', 'LONGITUDE', 'HOUR', 'DAY_OF_WEEK', 'MONTH']
X = data[feature_columns]
y = data['MMSI']

# Function to train and evaluate the model with different splits
def evaluate_model(test_size, features=None, target=None, n_estimators=100, random_state=42):
    """Train a random forest on a stratified split and score the held-out part.

    Parameters
    ----------
    test_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    features : optional
        Feature matrix. When omitted, the notebook-level ``X`` is used,
        which is what the function always did before this parameter existed.
    target : optional
        Labels; also used for stratification. When omitted, the
        notebook-level ``y`` is used.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 42
        Seed shared by the split and the forest.

    Returns
    -------
    tuple
        ``(accuracy, report)``: the value returned by ``accuracy_score`` and
        the text returned by ``classification_report`` for the test rows.
    """
    # Fall back to the kernel globals only when the caller did not supply
    # data, so existing calls such as evaluate_model(0.15) keep working.
    if features is None:
        features = X
    if target is None:
        target = y

    # Split the data, keeping class proportions equal in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        stratify=target,
    )

    # Initialize and train the RandomForestClassifier
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return accuracy, report

# Hold out 15% of the rows for testing (85:15 split) and show the scores.
results_85_15 = evaluate_model(test_size=0.15)
accuracy_85_15, report_85_15 = results_85_15

print("Results for 85:15 split")
print("Accuracy:", accuracy_85_15)
print("Classification Report:\n", report_85_15)


Results for 85:15 split
Accuracy: 0.9014891179839634
Classification Report:
               precision    recall  f1-score   support

   419000001       0.88      0.92      0.90       218
   419000002       0.91      0.88      0.89       218
   419000003       0.91      0.88      0.90       219
   419000004       0.90      0.93      0.92       218

    accuracy                           0.90       873
   macro avg       0.90      0.90      0.90       873
weighted avg       0.90      0.90      0.90       873



In [90]:
# Report how many rows ended up on each side of the split.
# NOTE(review): X_train / X_test / y_train / y_test are not defined in this
# cell; they must already exist in the kernel from an earlier cell.
print(f"Training data size: {X_train.shape[0]} samples")
print(f"Test data size: {X_test.shape[0]} samples")

# Preview the first rows of the features and labels of each partition.
for preview_heading, preview_X, preview_y in (
    ("\nTraining data samples:", X_train, y_train),
    ("\nTest data samples:", X_test, y_test),
):
    print(preview_heading)
    print(preview_X.head())
    print(preview_y.head())

Training data size: 4652 samples
Test data size: 1164 samples

Training data samples:
        COURSE  LATITUDE  LONGITUDE  HOUR  DAY_OF_WEEK  MONTH
718  -0.946983 -1.532075   0.380557    16            2      6
4941  1.368981  0.318056   0.346502    12            0      6
2992  1.199974 -1.004218  -0.673667     0            2      3
988   0.253456 -1.202705  -1.104624    12            3      4
1023  0.311077  1.138099   1.591460     8            2      4
718     419000001
4941    419000003
2992    419000001
988     419000002
1023    419000002
Name: MMSI, dtype: int64

Test data samples:
        COURSE  LATITUDE  LONGITUDE  HOUR  DAY_OF_WEEK  MONTH
5291  1.188812  1.577028  -0.294635    16            0      4
3136 -0.288729 -1.474323   1.103197     0            5      4
639   0.676011  1.315324  -0.935455    12            3      6
2740  1.425085 -0.363752   0.162972     4            4      6
883  -1.638436 -0.154087  -1.585114     0            0      3
5291    419000004
3136    419000001