In [4]:
# !unzip '/content/drive/MyDrive/HD현대 AI Challenge/HD현대AI챌린지.zip'

In [5]:
!unzip 'open.zip'

Archive:  open.zip
  inflating: open/sample_submission.csv  
  inflating: open/test.csv           
  inflating: open/train.csv          


In [7]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

# Read the raw train/test tables and discard the SAMPLE_ID column right away.
train = pd.read_csv('open/train.csv')
train = train.drop(columns=['SAMPLE_ID'])
test = pd.read_csv('open/test.csv')
test = test.drop(columns=['SAMPLE_ID'])

# Replace the ATA timestamp with derived calendar/clock features.
# The ATA column is removed from each frame (pop) and the derived columns are
# appended at the end in the order listed here.
ata_parts = ('year', 'month', 'day', 'hour', 'minute', 'weekday')
for df in (train, test):
    ata = pd.to_datetime(df.pop('ATA'))
    for part in ata_parts:
        df[part] = getattr(ata.dt, part)

# Label-encode the categorical columns.
#
# Each column gets one LabelEncoder, fitted on the train values (cast to str)
# and extended with a sentinel class for labels that occur only in test.
# Train and test are BOTH transformed with that final class list, so a given
# label always maps to the same integer code in both frames. (Encoding train
# before the sentinel is inserted would shift every test code that sorts after
# the sentinel by one relative to train.)
UNSEEN_LABEL = '-1'
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
encoders = {}  # feature name -> fitted LabelEncoder (including the sentinel class)

for feature in tqdm(categorical_features, desc="Encoding features"):
    # Cast both frames to str up front so the membership test below compares
    # like with like (e.g. NaN becomes 'nan' in train and in test).
    train_labels = train[feature].astype(str)
    test_labels = test[feature].astype(str)

    le = LabelEncoder()
    le.fit(train_labels)
    known_labels = set(le.classes_)

    # Labels never seen in train are collapsed into the sentinel.
    test_labels = test_labels.map(lambda s: s if s in known_labels else UNSEEN_LABEL)

    # Register the sentinel as a class, keeping classes_ sorted; skip the
    # insert if the sentinel already is a real train label to avoid duplicates.
    if UNSEEN_LABEL not in known_labels:
        le_classes = le.classes_.tolist()
        bisect.insort_left(le_classes, UNSEEN_LABEL)
        le.classes_ = np.array(le_classes)

    # Encode both frames with the same, final class list.
    train[feature] = le.transform(train_labels)
    test[feature] = le.transform(test_labels)
    encoders[feature] = le

# # Missing-value handling (left disabled: AutoGluon takes care of this preprocessing as well)
# train.fillna(train.mean(), inplace=True)
# test.fillna(train.mean(), inplace=True)

Encoding features: 100%|██████████| 6/6 [00:01<00:00,  3.07it/s]


In [8]:
# CI_HOUR

In [9]:
!pip install autogluon

Collecting autogluon
  Downloading autogluon-0.8.2-py3-none-any.whl (9.7 kB)
Collecting autogluon.core[all]==0.8.2 (from autogluon)
  Downloading autogluon.core-0.8.2-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.0/224.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.features==0.8.2 (from autogluon)
  Downloading autogluon.features-0.8.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.tabular[all]==0.8.2 (from autogluon)
  Downloading autogluon.tabular-0.8.2-py3-none-any.whl (285 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m285.7/285.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.multimodal==0.8.2 (from autogluon)
  Downloading autogluon.multimodal-0.8.2-py3-none-any.whl (372 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3

In [10]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [11]:
# Train an AutoGluon regressor for CI_HOUR, scored by mean absolute error.
save_path = 'automodel'  # directory where AutoGluon stores the trained models
train_data = TabularDataset(train)

predictor = TabularPredictor(
    label='CI_HOUR',
    problem_type='regression',
    eval_metric='mean_absolute_error',
    path=save_path,
)

# 'best_quality' preset with a 20-minute training budget (time_limit is in seconds).
# fit() is kept as the cell's last expression so its return value is displayed.
predictor.fit(train_data=train_data, presets='best_quality', time_limit=20 * 60)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 1200s
AutoGluon will save models to "automodel/"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Aug 30 11:19:59 UTC 2023
Disk Space Avail:   49.32 GB / 83.96 GB (58.7%)
Train Data Rows:    367441
Train Data Columns: 30
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    12102.84 MB
	Train Data (Original)  Memory Usage: 88.19 MB (0.7% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	S

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7e114db3fbe0>

In [12]:
# List every trained model with its validation score and fit/predict timings.
# score_val is shown negated for the MAE metric (see the output below), so
# values closer to zero are better.
predictor.leaderboard()

                 model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0    LightGBMXT_BAG_L2 -44.511558     164.087631  1119.468996               44.730926         341.780554            2       True          4
1  WeightedEnsemble_L3 -44.511558     164.093781  1123.382717                0.006149           3.913721            3       True          6
2    LightGBMXT_BAG_L1 -50.686939     117.465014   715.212135              117.465014         715.212135            1       True          1
3  WeightedEnsemble_L2 -50.686939     117.479397   718.335683                0.014383           3.123548            2       True          3
4      LightGBM_BAG_L1 -59.023236       1.891692    62.476307                1.891692          62.476307            1       True          2
5      LightGBM_BAG_L2 -76.865855     119.756216   818.513071                0.399511          40.824629            2       True          5


Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT_BAG_L2,-44.511558,164.087631,1119.468996,44.730926,341.780554,2,True,4
1,WeightedEnsemble_L3,-44.511558,164.093781,1123.382717,0.006149,3.913721,3,True,6
2,LightGBMXT_BAG_L1,-50.686939,117.465014,715.212135,117.465014,715.212135,1,True,1
3,WeightedEnsemble_L2,-50.686939,117.479397,718.335683,0.014383,3.123548,2,True,3
4,LightGBM_BAG_L1,-59.023236,1.891692,62.476307,1.891692,62.476307,1,True,2
5,LightGBM_BAG_L2,-76.865855,119.756216,818.513071,0.399511,40.824629,2,True,5


In [13]:
# Predict CI_HOUR for the preprocessed test frame with the fitted predictor.
preds = predictor.predict(test)

In [14]:
# Fill the sample submission's CI_HOUR column with the predictions
# (index-aligned, as with a plain column assignment) and write it out.
submit = pd.read_csv('open/sample_submission.csv').assign(CI_HOUR=preds)
submit.to_csv('open/baseline_submit.csv', index=False)