<a href="https://colab.research.google.com/github/JasonChuang88/SVR/blob/main/predict_ATA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install hyperopt --quiet

In [1]:
%%capture
# Capture (hide) everything this cell would otherwise print.
# Mount Google Drive at /content/drive; the CSV files loaded later live under that mount.
from google.colab import drive
drive.mount('/content/drive')

In [16]:
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pylab as py
import scipy.stats as stats
from sklearn import metrics, model_selection, preprocessing, svm
from typing import Union, Optional
# diff: Weka uses NORMALIZE, whereas this notebook standardizes with StandardScaler.
# Module-level scaler shared by data_split (fit_transform/transform) and by
# train/algorithm (inverse_transform). Each fit_transform call refits it, so it
# only remembers the data it was fitted on most recently.
sc = preprocessing.StandardScaler()

In [3]:
def data_split(df: Union[pd.DataFrame, pd.Series], split: float = 0.8, features: int = 1, is_dummy: bool = False, scaler: Optional[object] = None):
    """Split data into leading/trailing train and test arrays, optionally scaled.

    The rows are split in their existing order (no shuffling): the first
    ``int(split * len(df))`` rows form the training set, the rest the test set.

    Args:
        df: Data to split; converted with ``to_numpy()``.
        split: Fraction of rows assigned to the training set.
        features: Number of columns each returned array is reshaped to.
        is_dummy: When True the arrays are returned unscaled.
        scaler: Object exposing ``fit_transform`` and ``transform``. Defaults
            to the module-level ``sc``, which is refit on every call, so pass
            a dedicated scaler when the fitted state must be kept.

    Returns:
        ``(train_set, test_set)`` as 2-D arrays of shape ``(-1, features)``.
    """
    values = df.to_numpy()
    cut = int(split * len(values))
    train_set, test_set = np.split(values, [cut])
    train_set = train_set.reshape(-1, features)
    test_set = test_set.reshape(-1, features)
    if is_dummy:
        return train_set, test_set
    if scaler is None:
        # Fall back to the shared notebook scaler (looked up at call time).
        scaler = sc
    # Fit on the training rows only, then apply the same transform to the test rows.
    train_set = scaler.fit_transform(train_set)
    test_set = scaler.transform(test_set)
    return train_set, test_set

In [4]:
def handle_datasets(file_name: str, train_split: float = 0.8, data_dir: str = '/content/drive/MyDrive/Colab Notebooks') -> list[np.ndarray]:
    """Load a CSV and build the train/test feature and target arrays.

    Args:
        file_name: CSV base name (without the ``.csv`` extension).
        train_split: Fraction of rows (in file order) used for training.
        data_dir: Directory containing the CSV. Defaults to the Colab Drive
            folder the notebook was written against.

    Returns:
        ``[x_train, y_train, x_test, y_test]``. The feature arrays hold the
        unscaled dummy-encoded columns followed by the scaled remaining
        columns; the target arrays hold the scaled ``ATA`` column.

    Note:
        The target is passed through ``data_split`` last, so the shared
        scaler ``sc`` is left fitted on ``ATA``; ``train`` and ``algorithm``
        rely on that when they call ``sc.inverse_transform``.
    """
    df = pd.read_csv(f'{data_dir}/{file_name}.csv')
    # drop row if created_at>ATD
    df = df.drop(df[(df['created_at'] > df['ATD'])].index)
    df = df.dropna(subset=['ETA','ETD','ATA','ATD','is_rail'])
    df = df.drop(columns=['scac_code','firms_code'])
    # handle nominal: everything except these five columns is dummy-encoded
    dummy = df.drop(columns=['ATA','ATD','ETA','ETD','created_at'])
    dummy = pd.get_dummies(dummy)
    dummy_train, dummy_test = data_split(dummy, split=train_split, features=dummy.shape[1], is_dummy=True)
    # exclude the target (ATA) and the nominal columns; the rest are scaled input features
    x = df[df.columns.difference(['ATA','is_rail','POD','POL','svc_term_from','svc_term_to','ship_mode','vessel_name'])]
    y = df['ATA']
    x_train, x_test = data_split(x, split=train_split, features=x.shape[1])
    # concatenate dummy variable and numeric variable
    x_train = np.concatenate((dummy_train, x_train), axis=1)
    x_test = np.concatenate((dummy_test, x_test), axis=1)
    # Keep this call last: it leaves the shared scaler fitted on the target.
    y_train, y_test = data_split(y, split=train_split)
    return [x_train, y_train, x_test, y_test]

In [20]:
def _report_metrics(label: str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
    """Print RMSE, MAE and R2 for one data split, prefixed with ``label``."""
    # np.sqrt(MSE) replaces mean_squared_error(squared=False): the `squared`
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred))
    mae = metrics.mean_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    print(f'{label} RMSE: {rmse}, MAE: {mae}, R2 score: {r2}')


def train(datasets: list[np.ndarray], kernel: str = 'rbf') -> None:
    """Fit an SVR, print train/test metrics and plot the test residuals.

    Args:
        datasets: ``[x_train, y_train, x_test, y_test]`` as returned by
            ``handle_datasets``.
        kernel: Kernel name passed to ``svm.SVR``.

    Predictions and targets are mapped back through the shared scaler ``sc``
    before the metrics are computed, so ``sc`` must still be fitted on the
    target column when this function runs.
    """
    x_train, y_train, x_test, y_test = datasets[:4]
    model = svm.SVR(kernel=kernel)
    print(f'SVR kernel: {kernel}')
    model.fit(x_train, y_train.ravel())
    # training
    y_train_pred = sc.inverse_transform(model.predict(x_train).reshape(-1, 1))
    y_train = sc.inverse_transform(y_train)
    _report_metrics('training', y_train, y_train_pred)
    # testing
    y_test_pred = sc.inverse_transform(model.predict(x_test).reshape(-1, 1))
    y_test = sc.inverse_transform(y_test)
    _report_metrics('testing', y_test, y_test_pred)
    residual = y_test - y_test_pred
    plt.clf()
    plt.hist(residual, bins=40, range=(-50,50), facecolor="blue", edgecolor="black", alpha=0.7, density=False)
    plt.xlabel("residual")
    # density=False plots raw counts per bin, not probabilities.
    plt.ylabel("Count")
    plt.show(block=False)

In [38]:
def algorithm(params) -> dict:
    """Hyperopt objective: fit an SVR with ``params`` and return the test MAE.

    Args:
        params: Keyword arguments forwarded to ``svm.SVR``.

    Returns:
        ``{"loss": <test MAE on the inverse-transformed targets>, "status": STATUS_OK}``.

    The dataset is reloaded on every call; besides providing the arrays, that
    call leaves the shared scaler ``sc`` fitted on the target column, which
    the ``inverse_transform`` calls below depend on.
    """
    # Imported locally so the function does not depend on a later cell having run.
    from hyperopt import STATUS_OK
    # options: training_1000, training_10000, CMDU_training_2000, EGLV_training_4000, COSU_training_4000
    file_name = "EGLV_training_4000"
    x_train, y_train, x_test, y_test = handle_datasets(file_name, 0.8)
    model = svm.SVR(**params)
    model.fit(x_train, y_train.ravel())
    # testing
    y_test_pred = sc.inverse_transform(model.predict(x_test).reshape(-1, 1))
    y_test = sc.inverse_transform(y_test)
    # Only MAE is used as the loss; the unused RMSE/R2 computations were dropped
    # (the RMSE one used `squared=False`, which scikit-learn 1.6 removed).
    mae = metrics.mean_absolute_error(y_test, y_test_pred)
    return {"loss": mae, "status": STATUS_OK}

In [28]:
# `hp` is imported here so this cell runs on a fresh kernel before the tuning cell.
from hyperopt import hp

# Hyperparameter search space for svm.SVR. Note that fmin reports hp.choice
# entries as indices into the option lists below, not as the option values.
space = {
    "kernel": hp.choice("kernel", ['linear', 'poly', 'rbf']),
    "gamma": hp.choice("gamma", ['scale', 'auto']),
    "C": hp.quniform("C", 1, 100, 1),
    "max_iter": hp.choice("max_iter", [-1, 1, 10,100,1000]),
    "epsilon": hp.choice("epsilon", [0.001, 0.01, 0.1, 1]),
    "shrinking": hp.choice("shrinking", [True, False]),
}

In [39]:
from hyperopt import tpe, hp, fmin, space_eval, STATUS_OK,Trials
from hyperopt.pyll.base import scope
import warnings
if __name__ == "__main__":
    warnings.filterwarnings('ignore')
    # options: training_1000, training_10000, CMDU_training_2000, EGLV_training_4000, COSU_training_4000
    # Run 100 TPE trials of `algorithm` over `space`, recording every trial.
    trials = Trials()
    best = fmin(fn=algorithm,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials = trials)
    # `best` holds list indices for hp.choice parameters (e.g. 'kernel': 0).
    print("Best: {}".format(best))
    # Decode those indices into the actual SVR argument values.
    print("Best params: {}".format(space_eval(space, best)))
    print(trials.results)
    print(trials.losses())
    print(trials.statuses())
    # Alternative run without tuning: compare the kernels with default settings.
    # file_name = "EGLV_training_4000"
    # data = handle_datasets(file_name=file_name,train_split=0.8)
    # kernel_list = ['linear', 'poly', 'rbf']
    # for kernel in kernel_list:
    #     train(data, kernel=kernel)

100%|██████████| 100/100 [00:08<00:00, 11.47trial/s, best loss: 0.5426957361017452]
Best: {'C': 2.0, 'epsilon': 1, 'gamma': 0, 'kernel': 0, 'max_iter': 4, 'shrinking': 0}
[{'loss': 2.7103509728093176, 'status': 'ok'}, {'loss': 8.422457962828835, 'status': 'ok'}, {'loss': 5.598710220076444, 'status': 'ok'}, {'loss': 14.258548773759292, 'status': 'ok'}, {'loss': 5.5527587291879765, 'status': 'ok'}, {'loss': 2.9897955722254985, 'status': 'ok'}, {'loss': 2.7103509728093176, 'status': 'ok'}, {'loss': 6.516918317850394, 'status': 'ok'}, {'loss': 2.5082061979249106, 'status': 'ok'}, {'loss': 4.419700420026611, 'status': 'ok'}, {'loss': 8.127957558118961, 'status': 'ok'}, {'loss': 11.080831973144676, 'status': 'ok'}, {'loss': 5.797461199975137, 'status': 'ok'}, {'loss': 8.127957558118961, 'status': 'ok'}, {'loss': 3.72347513463791, 'status': 'ok'}, {'loss': 6.3111695744944, 'status': 'ok'}, {'loss': 5.598710220076444, 'status': 'ok'}, {'loss': 3.466659103418548, 'status': 'ok'}, {'loss': 14.25