# 포트별 DESTINATION 예측 (Soft Voting 앙상블)

In [11]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

## 1. 전처리 함수

In [13]:
def clean_destination(dest):
    if isinstance(dest, str):
        return re.sub(r'\s+', '', dest.strip().upper())
    return dest

def create_sequences(data, seq_length, feature_cols, label_col):
    X, y = [], []
    for i in range(len(data) - seq_length):
        seq = data[feature_cols].iloc[i:i+seq_length].values
        target = data[label_col].iloc[i+seq_length]
        X.append(seq)
        y.append(target)
    return np.array(X), np.array(y)

## 2. 데이터 로딩 및 전처리

In [15]:
df = pd.read_csv("./datasets/all_merged.csv")
df["DESTINATION"] = df["DESTINATION"].apply(clean_destination)
df = df.dropna(subset=["DESTINATION"])

# 전체 DESTINATION에 대해 인코딩
le_global = LabelEncoder()
df["DEST_LABEL"] = le_global.fit_transform(df["DESTINATION"])
num_classes = len(le_global.classes_)

# 사용할 feature 목록
feature_cols = ["SPEED", "COG", "HEADING", "DRAFT", "LAT", "LON"]
seq_length = 10

# 예측 가능한 포트만 필터링
valid_ports = df.groupby("PORT_NAME")["DEST_LABEL"].nunique()
valid_ports = valid_ports[valid_ports > 1].index.tolist()

## 3. 포트별 LSTM + RF 앙상블 학습

In [21]:
ensemble_results = {}

for port in valid_ports:
    port_df = df[df["PORT_NAME"] == port].copy()
    if len(port_df) < seq_length + 10:
        continue

    port_df = port_df.sort_values("TIMESTAMP")
    port_df["DEST_LABEL"] = le_global.transform(port_df["DESTINATION"])  # 동일한 인코더 사용

    # 스케일링
    scaler = MinMaxScaler()
    port_df[feature_cols] = scaler.fit_transform(port_df[feature_cols])

    # 시퀀스 생성
    X_seq, y_seq = create_sequences(port_df, seq_length, feature_cols, "DEST_LABEL")
    if len(np.unique(y_seq)) < 2:
        continue

    # RF 입력용: 마지막 타임스텝
    X_rf = X_seq[:, -1, :]
    split = int(len(X_seq) * 0.8)
    X_train_seq, X_test_seq = X_seq[:split], X_seq[split:]
    X_train_rf, X_test_rf = X_rf[:split], X_rf[split:]
    y_train, y_test = y_seq[:split], y_seq[split:]

    # LSTM 모델 정의
    model_lstm = Sequential([
        LSTM(64, input_shape=(seq_length, len(feature_cols))),
        Dense(num_classes, activation='softmax')
    ])
    model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model_lstm.fit(X_train_seq, y_train, epochs=5, batch_size=64, verbose=0)
    y_pred_lstm_proba = model_lstm.predict(X_test_seq, verbose=0)

    # 랜덤 포레스트 모델
    model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
    model_rf.fit(X_train_rf, y_train)
    y_pred_rf_proba = model_rf.predict_proba(X_test_rf)

    # soft voting 앙상블
    min_len = min(y_pred_lstm_proba.shape[1], y_pred_rf_proba.shape[1])
    avg_proba = (y_pred_lstm_proba[:, :min_len] + y_pred_rf_proba[:, :min_len]) / 2
    y_pred_ensemble = np.argmax(avg_proba, axis=1)

    # 정확도 저장
    acc = accuracy_score(y_test, y_pred_ensemble)
    ensemble_results[port] = round(acc, 4)

  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__in

## 4. 결과 출력

In [23]:
result_df = pd.DataFrame(ensemble_results.items(), columns=["PORT_NAME", "Ensemble_Accuracy"])
print(result_df.sort_values(by="Ensemble_Accuracy", ascending=False))

   PORT_NAME  Ensemble_Accuracy
7      CNSHA             0.0005
0      CNDAG             0.0000
39     KRMAS             0.0000
29     JPTYO             0.0000
30     JPUKB             0.0000
31     JPWAK             0.0000
32     JPYKK             0.0000
33     JPYOK             0.0000
34     KRBNP             0.0000
35     KRINC             0.0000
36     KRKAN             0.0000
37     KRKCN             0.0000
38     KRKPO             0.0000
40     KRPTK             0.0000
27     JPSMZ             0.0000
41     KRUSN             0.0000
42     KRYOS             0.0000
43     PHMNL             0.0000
44     RUNJK             0.0000
45     RUVVO             0.0000
46     TWKEL             0.0000
47     TWKHH             0.0000
48     TWTPE             0.0000
49     VNCLI             0.0000
50     VNHPH             0.0000
28     JPTOS             0.0000
26     JPSDJ             0.0000
1      CNHUA             0.0000
13     JPHKT             0.0000
2      CNLYG             0.0000
3      C

In [27]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# ----------------------- #
# 전처리 함수
# ----------------------- #
def clean_destination(dest):
    if isinstance(dest, str):
        return re.sub(r'\s+', '', dest.strip().upper())
    return dest

def create_sequences(data, seq_length, feature_cols, label_col):
    X, y = [], []
    for i in range(len(data) - seq_length):
        seq = data[feature_cols].iloc[i:i+seq_length].values
        target = data[label_col].iloc[i+seq_length]
        X.append(seq)
        y.append(target)
    return np.array(X), np.array(y)

# ----------------------- #
# 데이터 준비
# ----------------------- #
df = pd.read_csv("./datasets/all_merged.csv")
df["DESTINATION"] = df["DESTINATION"].apply(clean_destination)
df = df.dropna(subset=["DESTINATION"])

le_global = LabelEncoder()
df["DEST_LABEL"] = le_global.fit_transform(df["DESTINATION"])
num_classes = len(le_global.classes_)

feature_cols = ["SPEED", "COG", "HEADING", "DRAFT", "LAT", "LON"]
seq_length = 10

valid_ports = df.groupby("PORT_NAME")["DEST_LABEL"].nunique()
valid_ports = valid_ports[valid_ports > 1].index.tolist()

# ----------------------- #
# 포트별 Stacking 앙상블
# ----------------------- #
stacking_results = {}

for port in valid_ports:
    port_df = df[df["PORT_NAME"] == port].copy()
    if len(port_df) < seq_length + 10:
        continue

    port_df = port_df.sort_values("TIMESTAMP")
    port_df["DEST_LABEL"] = le_global.transform(port_df["DESTINATION"])

    scaler = MinMaxScaler()
    port_df[feature_cols] = scaler.fit_transform(port_df[feature_cols])

    X_seq, y_seq = create_sequences(port_df, seq_length, feature_cols, "DEST_LABEL")
    if len(np.unique(y_seq)) < 2:
        continue

    X_rf = X_seq[:, -1, :]
    split = int(len(X_seq) * 0.8)
    X_train_seq, X_test_seq = X_seq[:split], X_seq[split:]
    X_train_rf, X_test_rf = X_rf[:split], X_rf[split:]
    y_train, y_test = y_seq[:split], y_seq[split:]

    # (1) LSTM 학습
    model_lstm = Sequential([
        LSTM(64, input_shape=(seq_length, len(feature_cols))),
        Dense(num_classes, activation='softmax')
    ])
    model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model_lstm.fit(X_train_seq, y_train, epochs=5, batch_size=64, verbose=0)
    y_pred_lstm_proba = model_lstm.predict(X_test_seq, verbose=0)

    # (2) RandomForest 학습
    model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
    model_rf.fit(X_train_rf, y_train)
    y_pred_rf_proba = model_rf.predict_proba(X_test_rf)

    # (3) Stacking feature 생성
    min_len = min(y_pred_lstm_proba.shape[1], y_pred_rf_proba.shape[1])
    X_meta = np.hstack([y_pred_lstm_proba[:, :min_len], y_pred_rf_proba[:, :min_len]])

    # (4) 메타 모델 학습 및 예측
    meta_model = LogisticRegression(max_iter=1000)
    meta_model.fit(X_meta, y_test)
    y_pred_stacked = meta_model.predict(X_meta)

    # (5) 정확도 저장
    acc = accuracy_score(y_test, y_pred_stacked)
    stacking_results[port] = round(acc, 4)

# ----------------------- #
# 결과 출력
# ----------------------- #
result_df = pd.DataFrame(stacking_results.items(), columns=["PORT_NAME", "Stacking_Accuracy"])
print(result_df.sort_values(by="Stacking_Accuracy", ascending=False))


  super().__init__(**kwargs)
  super().__init__(**kwargs)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 130

In [35]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# ----------------------- #
# 전처리 함수
# ----------------------- #
def clean_destination(dest):
    if isinstance(dest, str):
        return re.sub(r'\s+', '', dest.strip().upper())
    return dest

def create_sequences(data, seq_length, feature_cols, label_col):
    X, y = [], []
    for i in range(len(data) - seq_length):
        seq = data[feature_cols].iloc[i:i+seq_length].values
        target = data[label_col].iloc[i+seq_length]
        X.append(seq)
        y.append(target)
    return np.array(X), np.array(y)

# ----------------------- #
# 데이터 준비
# ----------------------- #
df = pd.read_csv("./datasets/all_merged.csv")
df["DESTINATION"] = df["DESTINATION"].apply(clean_destination)
df = df.dropna(subset=["DESTINATION"])

le_global = LabelEncoder()
df["DEST_LABEL"] = le_global.fit_transform(df["DESTINATION"])
num_classes = len(le_global.classes_)

feature_cols = ["SPEED", "COG", "HEADING", "DRAFT", "LAT", "LON"]
seq_length = 10

valid_ports = df.groupby("PORT_NAME")["DEST_LABEL"].nunique()
valid_ports = valid_ports[valid_ports > 1].index.tolist()

# ----------------------- #
# 포트별 LSTM + XGBoost Stacking 앙상블
# ----------------------- #
stacking_results_xgb = {}

for port in valid_ports:
    port_df = df[df["PORT_NAME"] == port].copy()
    if len(port_df) < seq_length + 10:
        continue

    port_df = port_df.sort_values("TIMESTAMP")
    port_df["DEST_LABEL"] = le_global.transform(port_df["DESTINATION"])

    scaler = MinMaxScaler()
    port_df[feature_cols] = scaler.fit_transform(port_df[feature_cols])

    X_seq, y_seq = create_sequences(port_df, seq_length, feature_cols, "DEST_LABEL")
    if len(np.unique(y_seq)) < 2:
        continue

    X_rf = X_seq[:, -1, :]
    split = int(len(X_seq) * 0.8)
    X_train_seq, X_test_seq = X_seq[:split], X_seq[split:]
    X_train_rf, X_test_rf = X_rf[:split], X_rf[split:]
    y_train, y_test = y_seq[:split], y_seq[split:]

    # (1) LSTM 모델 학습
    model_lstm = Sequential([
        LSTM(64, input_shape=(seq_length, len(feature_cols))),
        Dense(num_classes, activation='softmax')
    ])
    model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model_lstm.fit(X_train_seq, y_train, epochs=5, batch_size=64, verbose=0)
    y_pred_lstm_proba = model_lstm.predict(X_test_seq, verbose=0)

    # (2) RandomForest 모델 학습
    model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
    model_rf.fit(X_train_rf, y_train)
    y_pred_rf_proba = model_rf.predict_proba(X_test_rf)

    # (3) Stacking feature 생성
    min_len = min(y_pred_lstm_proba.shape[1], y_pred_rf_proba.shape[1])
    X_meta = np.hstack([y_pred_lstm_proba[:, :min_len], y_pred_rf_proba[:, :min_len]])

    # (4) y_test 값 확인 및 정리
    print(f"y_test unique values: {np.unique(y_test)}")  # y_test의 유니크 값 확인

    # y_test가 잘못된 클래스를 포함하면, 다시 라벨 인코딩
    y_train_cleaned = le_global.transform(y_train)
    y_test_cleaned = le_global.transform(y_test)

    # (5) XGBoost 메타 모델 학습
    meta_model_xgb = xgb.XGBClassifier(objective="multi:softmax", num_class=num_classes, random_state=42)
    meta_model_xgb.fit(X_meta, y_test_cleaned)
    y_pred_stacked_xgb = meta_model_xgb.predict(X_meta)

    # (6) 정확도 저장
    acc = accuracy_score(y_test_cleaned, y_pred_stacked_xgb)
    stacking_results_xgb[port] = round(acc, 4)

# 결과 출력
result_df_xgb = pd.DataFrame(stacking_results_xgb.items(), columns=["PORT_NAME", "Stacking_XGBoost_Accuracy"])
import ace_tools as tools; tools.display_dataframe_to_user(name="XGBoost Stacking 정확도", dataframe=result_df_xgb.sort_values(by="Stacking_XGBoost_Accuracy", ascending=False))


  super().__init__(**kwargs)


y_test unique values: [127 207]


ValueError: y contains previously unseen labels: 670