# Import Library

In [281]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from sklearn import metrics, preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils
import seaborn as sns

In [282]:
def create_model(data, catcols):    
    inputs = []
    outputs = []
    for c in catcols:
        num_unique_values = int(data[c].nunique())
        embed_dim = int(min(np.ceil((num_unique_values)/2), 50))
        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=c)(inp)
        out = layers.SpatialDropout1D(0.3)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)
    
    x = layers.Concatenate()(outputs)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(300, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(100, activation="relu")(x)
    
    y = layers.Dense(1, activation="sigmoid")(x)

    model = Model(inputs=inputs, outputs=y)
    return model

# 시작

In [283]:
cwd = os.getcwd()
print('현재 폴더 경로 : ', cwd)
# 폴더 위치 변경
os.chdir("C:\\Users\hyunj\\Dropbox\\종합설계프로젝트\\중요\\데이터")
print('변경 후 폴더 경로 : ', cwd)
print('현재 폴더 파일 : ', os.listdir())
data = pd.read_csv('Data.csv', encoding='CP949')
print('feature 항목 : ', data.columns)
data
data.info()

현재 폴더 경로 :  C:\Users\hyunj\Dropbox\종합설계프로젝트\중요\데이터
변경 후 폴더 경로 :  C:\Users\hyunj\Dropbox\종합설계프로젝트\중요\데이터
현재 폴더 파일 :  ['Block perspective.xlsx', 'CHE I perspective.xlsx', 'CHE II perspective.xlsx', 'Container perspective.xlsx', 'ContainerPersp.csv', 'ContainerPerspectiv_DStoLD.csv', 'Data.csv', 'FULL', 'Gate perspective.xlsx', 'hanjin.csv', 'ITV perspective.xlsx', 'POD', 'QC perspective.xlsx', 'Vessel perspective.xlsx', '디스코용 파일.csv', '분포', '분포.csv']
feature 항목 :  Index(['Time of DS-QUAYSIDE', 'Time of DS-MOVE', 'Time of DS-YARDSIDE',
       'ContainerID', 'DS-VESSEL-ID', 'LD-VESSEL-ID', 'DS-VESSEL-YEAR',
       'LD-VESSEL-YEAR', 'LD-POD', 'Full', 'Empty', 'Dwell Time'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24892 entries, 0 to 24891
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Time of DS-QUAYSIDE  24892 non-null  int64 
 1   Time of DS-MOVE      24892 non-null  in

In [284]:
# 데이터 Column 명 바꾸기 
data.rename(columns={'Time of DS-QUAYSIDE' : 'T_DS_QUAYSIDE',
                     'Time of DS-MOVE' : 'T_DS_MOVE',
                   'Time of DS-YARDSIDE' : 'T_DS_YARDSIDE'}, inplace=True)
data.drop(axis=1, columns = ['T_DS_QUAYSIDE', 'T_DS_YARDSIDE'], inplace=True)
data

Unnamed: 0,T_DS_MOVE,ContainerID,DS-VESSEL-ID,LD-VESSEL-ID,DS-VESSEL-YEAR,LD-VESSEL-YEAR,LD-POD,Full,Empty,Dwell Time
0,479,BEAU,MAUO,ALAP,2018,2018,CAVAN,1,0,359405
1,518,BMOU,MAUO,ALAP,2018,2018,USSEA,1,0,345321
2,957,BMOU,MAUO,ALAP,2018,2018,USSEA,1,0,344666
3,851,BMOU,MAUO,ALAP,2018,2018,CAVAN,1,0,347916
4,636,BMOU,MHAH,ALAP,2018,2018,USSEA,1,0,256699
...,...,...,...,...,...,...,...,...,...,...
24887,855,TGHU,SFNZ,SFNZ,2018,2018,CNNGB,0,1,32909
24888,576,TGHU,SFNZ,SFNZ,2018,2018,CNNGB,0,1,31494
24889,237,TGHU,SFNZ,SFNZ,2018,2018,CNSHA,0,1,36648
24890,476,TRLU,SFNZ,SFNZ,2018,2018,CNNGB,0,1,35029


In [285]:
data.describe()
print(data.isnull().sum().sum())

0


In [286]:
# Label Encodering
features = [x for x in data.columns if x not in [ "Dwell Time"]]
features

['T_DS_MOVE',
 'ContainerID',
 'DS-VESSEL-ID',
 'LD-VESSEL-ID',
 'DS-VESSEL-YEAR',
 'LD-VESSEL-YEAR',
 'LD-POD',
 'Full',
 'Empty']

In [287]:
for feature in features:
    lbl_enc = preprocessing.LabelEncoder()
    data[feature] = lbl_enc.fit_transform(data[feature].fillna("-1").astype(str).values)

In [288]:
# y값 스케일링
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X = data.iloc[:,:-1]
Y = data.iloc[:,-1]

X_train, X_test, original_y_train, original_y_test = train_test_split(X,Y,test_size=0.2,random_state=120,shuffle=False)
data['Dwell Time'] = scaler.fit_transform(data['Dwell Time'].values.reshape(-1,1))

In [289]:
X = data.iloc[:,:-1]
Y = data.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,random_state=120, shuffle=False)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
model = create_model(data, list(X.columns))
model.summary()

Model: "model_23"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_224 (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 input_225 (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 input_226 (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 input_227 (InputLayer)         [(None, 1)]          0           []                               
                                                                                           

In [290]:
total_columns = list(X.columns)
model.compile(loss='mse', optimizer='adam')
model.fit([X_train.loc[:, feature].values for feature in total_columns], y_train.values,
          epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x17d4c6ddc70>

In [311]:
from sklearn.metrics import mean_squared_error 

y_pred_train = model.predict([X_train.loc[:, feature].values for feature in total_columns])
y_pred_test = model.predict([X_test.loc[:, feature].values for feature in total_columns])



y_pred_train = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
y_pred_test = scaler.inverse_transform(y_pred_test.reshape(-1,1))

mse_train = mean_squared_error(original_y_train.values, y_pred_train)
mse_test = mean_squared_error(original_y_test.values, y_pred_test)
print("Train MSE : ", mse_train)
print("Train RMSE : ", mse_train**0.5)
print("Test MSE : ", mse_test)
print("Test RMSE : ", mse_test**0.5)

mae_train = mean_absolute_error(original_y_train,y_pred_train)
mae_test = mean_absolute_error(original_y_test,y_pred_test)
print('train_MAE: ',mae_train)
print('test_MAE: ',mae_test)

Train MSE :  4698443863.165347
Train RMSE :  68545.19577012927
Test MSE :  57889773211.141045
Test RMSE :  240602.9368298339
train_MAE:  46757.03606606708
test_MAE:  173353.06879754344


In [308]:
y_pred_test.reshape(-1)[:20]

array([628983.2 , 658078.06, 585705.  , 652964.56, 663700.6 , 646617.94,
       645744.8 , 639953.4 , 647878.44, 575926.1 , 886453.25, 899822.06,
       780268.94, 767705.25, 836934.5 , 916985.5 , 793095.5 , 600137.1 ,
       823670.2 , 732517.25], dtype=float32)

In [309]:
original_y_test.values[:20]

array([614285, 498889, 617559, 498306, 496793, 497168, 498325, 513224,
       511690, 604530, 839925, 826751, 661428, 662090, 658385, 629512,
       658965, 596485, 819308, 725222], dtype=int64)

KeyError: 0