In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment
# warnings raised by later cells.
warnings.simplefilter('ignore')
# Turn off the chained-assignment check (pandas' default setting is 'warn').
pd.set_option('mode.chained_assignment', None)

from autogluon.tabular import TabularPredictor, TabularDataset

In [2]:
def seed_everything(seed):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``.
    """
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so this
    # presumably only affects processes launched afterwards — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(42)  # fix the seed

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

# Read the training and test tables from CSV files in the working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_fillna.csv', 'test_fillna.csv')
)

In [4]:
# Replace missing values in these four columns with the column's own mean.
# The filled column is assigned back to `train`: calling
# `train[col].fillna(..., inplace=True)` modifies a column selection in place,
# which under pandas Copy-on-Write acts on a temporary and leaves `train`
# untouched (and the warning about it is silenced earlier in this notebook).
for col in ('U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN'):
    train[col] = train[col].fillna(train[col].mean())

In [5]:
# Replace missing values in the test frame with the *training* column means,
# so the test rows are imputed with the same statistics as the train rows.
# The filled column is assigned back to `test`: calling
# `test[col].fillna(..., inplace=True)` modifies a column selection in place,
# which under pandas Copy-on-Write acts on a temporary and leaves `test`
# untouched.
for col in ('U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN'):
    test[col] = test[col].fillna(train[col].mean())

In [6]:
# Parse the ATA column into pandas datetimes.
train['ATA'] = pd.to_datetime(train['ATA'])

# Expand the timestamp into one derived column per calendar/clock field.
train_ata = train['ATA'].dt
for field in ('year', 'month', 'day', 'hour', 'minute'):
    train[field] = getattr(train_ata, field)

In [7]:
# Parse the ATA column into pandas datetimes.
test['ATA'] = pd.to_datetime(test['ATA'])

# Expand the timestamp into one derived column per calendar/clock field.
test_ata = test['ATA'].dt
for field in ('year', 'month', 'day', 'hour', 'minute'):
    test[field] = getattr(test_ata, field)

In [8]:
# Bind second names to the same DataFrame objects (plain assignment, no copy):
# columns added through train_dist / test_dist also appear on train / test.
train_dist, test_dist = train, test

In [9]:
# Add the element-wise ratio GT / PORT_SIZE as a new column on both frames.
for frame in (train_dist, test_dist):
    frame['GT/Port_Size'] = frame['GT'].div(frame['PORT_SIZE'])

In [10]:
# Mean CI_HOUR per ARI_CO value, computed from the training rows only.
# A single groupby replaces the per-category boolean scans, and the column is
# created as float from the start instead of being initialised with integer 0
# and silently upcast by later float writes.
# NOTE(review): each training row's own CI_HOUR contributes to the mean it
# receives — confirm this is acceptable for validation.
ari_co_group_means = train_dist.groupby('ARI_CO')['CI_HOUR'].mean()

# Kept as a plain list aligned with train_dist['ARI_CO'].unique(), matching the
# object this cell used to leave behind.
ARI_CO_mean = ari_co_group_means.reindex(train_dist['ARI_CO'].unique()).tolist()

# Add the ARI_CO_mean column to both frames. Rows whose ARI_CO is missing or
# was never seen in the training data keep the default value 0.
for frame in (train_dist, test_dist):
    ari_co_keys = frame['ARI_CO']
    frame['ARI_CO_mean'] = ari_co_keys.map(ari_co_group_means).where(
        ari_co_keys.isin(ari_co_group_means.index), 0.0
    )

In [11]:
# Mean CI_HOUR per ARI_PO value, computed from the training rows only.
# A single groupby replaces the per-category boolean scans, and the column is
# created as float from the start instead of being initialised with integer 0
# and silently upcast by later float writes.
# NOTE(review): each training row's own CI_HOUR contributes to the mean it
# receives — confirm this is acceptable for validation.
ari_po_group_means = train_dist.groupby('ARI_PO')['CI_HOUR'].mean()

# Kept as a plain list aligned with train_dist['ARI_PO'].unique(), matching the
# object this cell used to leave behind.
ARI_PO_mean = ari_po_group_means.reindex(train_dist['ARI_PO'].unique()).tolist()

# Add the ARI_PO_mean column to both frames. Rows whose ARI_PO is missing or
# was never seen in the training data keep the default value 0.
for frame in (train_dist, test_dist):
    ari_po_keys = frame['ARI_PO']
    frame['ARI_PO_mean'] = ari_po_keys.map(ari_po_group_means).where(
        ari_po_keys.isin(ari_po_group_means.index), 0.0
    )

In [12]:
from datetime import date
# Day of week (Monday=0 ... Sunday=6) built from the year/month/day columns.
# The dates are assembled and converted in one vectorised step instead of
# constructing a datetime.date and writing through .loc once per row.
for frame in (train_dist, test_dist):
    calendar_day = pd.to_datetime(frame[['year', 'month', 'day']])
    # Cast to float64: the previous row-by-row .loc writes created the column
    # as float, so this keeps the saved values in the same format.
    frame['weekday'] = calendar_day.dt.weekday.astype('float64')

In [16]:
# Write the processed training frame to CSV, leaving out the index column.
train_dist.to_csv(path_or_buf='../DAT/train_pro.csv', index=False)

In [17]:
# Write the processed test frame to CSV, leaving out the index column.
test_dist.to_csv(path_or_buf='../DAT/test_pro.csv', index=False)