In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

train = pd.read_csv('drive/MyDrive/HD_AI Challenge/open/train.csv').drop(columns=['SAMPLE_ID'])
test = pd.read_csv('drive/MyDrive/HD_AI Challenge/open/test.csv').drop(columns=['SAMPLE_ID'])

In [2]:
# datetime 컬럼 처리
train['ATA'] = pd.to_datetime(train['ATA'])
test['ATA'] = pd.to_datetime(test['ATA'])

# datetime을 여러 파생 변수로 변환
for df in [train, test]:
    df['year'] = df['ATA'].dt.year
    df['month'] = df['ATA'].dt.month
    df['day'] = df['ATA'].dt.day
    df['hour'] = df['ATA'].dt.hour
    df['minute'] = df['ATA'].dt.minute
    df['weekday'] = df['ATA'].dt.weekday

# datetime 컬럼 제거
train.drop(columns='ATA', inplace=True)
test.drop(columns='ATA', inplace=True)

# Categorical 컬럼 라벨 인코딩
categorical_features = ['ID' ,'SHIPMANAGER']
encoders = {}

for feature in tqdm(categorical_features, desc="Encoding features"):
    le = LabelEncoder()
    train[feature] = le.fit_transform(train[feature].astype(str))
    le_classes_set = set(le.classes_)
    test[feature] = test[feature].map(lambda s: '-1' if s not in le_classes_set else s)
    le_classes = le.classes_.tolist()
    bisect.insort_left(le_classes, '-1')
    le.classes_ = np.array(le_classes)
    test[feature] = le.transform(test[feature].astype(str))
    encoders[feature] = le

# HI_HOUR 중위값으로 인코딩 'ARI_CO', 'ARI_PO','SHIP_TYPE_CATEGORY', 'FLAG' ,'year' ,'month' ,'day' ,'weekday', 'hour'
NONZERO_HOUR_data = train[train['CI_HOUR'] > 0]

# 중앙값 인코딩할 열 목록
columns_to_encode = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'FLAG', 'year', 'weekday']

for col in columns_to_encode:
    # 해당 열에 대한 중앙값 계산
    median_col = NONZERO_HOUR_data.groupby(col)['CI_HOUR'].mean()

    # 훈련 데이터에 중앙값 매핑
    train[col] = train[col].map(median_col)

    # 테스트 데이터에 중앙값 매핑 (전체 중앙값으로 대체)
    test[col] = test[col].map(lambda s: median_col.median() if s not in median_col.index else median_col[s])

# 결측치 처리
train.fillna(train.mean(), inplace=True)
test.fillna(train.mean(), inplace=True)

Encoding features: 100%|██████████| 2/2 [00:00<00:00,  2.36it/s]


In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import math

train['IS_WEEKEND'] = train['weekday'].isin([4,5,6])
train['DIST_ZERO'] = train['DIST'].apply(lambda x: 0 if x == 0 else 1)
train['u*v_speed'] = train.apply(lambda row: math.sqrt(row['U_WIND']**2 + row['V_WIND']**2), axis=1)
train['CI_HOUR_Add'] = train['CI_HOUR'].apply(lambda x: 0 if x == 0 else (1 if x <= 500 else 2))
train['CI_HOUR_500'] = train['CI_HOUR'].apply(lambda x: 0 if x <= 500 else 1)

test['IS_WEEKEND'] = test['weekday'].isin([4,5,6])
test['DIST_ZERO'] = test['DIST'].apply(lambda x: 0 if x == 0 else 1)
test['u*v_speed'] = test.apply(lambda row: math.sqrt(row['U_WIND']**2 + row['V_WIND']**2), axis=1)


In [4]:
# 데이터 분할을 위한 전처리
target = 'CI_HOUR_Add'
x = train.drop(labels=[target], axis = 1)
x = x.drop(labels='CI_HOUR', axis = 1)
x = x.drop(labels='CI_HOUR_500', axis = 1)
y = train.loc[:, target]

from sklearn.preprocessing import StandardScaler

#MIN-MAX 스케일링
scaler = MinMaxScaler()

# train 데이터 스케일링 후 데이터 프라임으로 생성
x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns)

# test 데이터 스케일링 후 데이터 프라임으로 생성
test = pd.DataFrame(scaler.fit_transform(test), columns=test.columns)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import *

from sklearn.neighbors import KNeighborsClassifier


# 데이터분할
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .2)

# XGBoost 모델 생성
model1 = xgb.XGBClassifier()
# 모델 학습
model1.fit(x_train, y_train)

# 예측
pred = model1.predict(x_val)

# 성능 평가
print('Accuracy:', accuracy_score(y_val, pred))
print('=' * 60)
print('Confusion Matrix:')
print(confusion_matrix(y_val, pred))
print('=' * 60)
print('Classification Report:')
print(classification_report(y_val, pred))

Accuracy: 0.9782492218196662
Confusion Matrix:
[[31578     0     0]
 [    6 44898    76]
 [    0  1623   207]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     31578
           1       0.97      1.00      0.98     44980
           2       0.73      0.11      0.20      1830

    accuracy                           0.98     78388
   macro avg       0.90      0.70      0.73     78388
weighted avg       0.97      0.98      0.97     78388



In [6]:
ZERO_HOUR_data = train[train['CI_HOUR_Add'] == 0]

In [7]:
# CI_HOUR가 0이 나온 데이터 분할을 위한 전처리
target = 'CI_HOUR'
x = ZERO_HOUR_data.drop(labels=[target], axis = 1)
x = x.drop(labels='CI_HOUR_Add', axis = 1)
x = x.drop(labels='CI_HOUR_500', axis = 1)
y = ZERO_HOUR_data.loc[:, target]
x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns)

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .2, random_state=42)

In [8]:
# LightGBM 모델 초기화
model2_1 = lgb.LGBMRegressor(
    boosting_type='gbdt',  # Gradient Boosting Decision Tree
    num_leaves=31,         # 트리의 최대 리프 노드 수
    learning_rate=0.05,    # 학습률
    n_estimators=1000      # 트리의 개수 (반복 횟수)
)

# 모델 학습
model2_1.fit(x_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model2_1.predict(x_val)

# 평가 지표 계산 (예: 평균 제곱 오차, MSE)
mse = mean_squared_error(y_val, y_pred)
print("Mean Squared Error:", mse)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2890
[LightGBM] [Info] Number of data points in the train set: 125812, number of used features: 26
Mean Squared Error: 0.0


In [9]:
#모델 2를 위한 데이터 처리
ZERO_HOUR_data1 = train[train['CI_HOUR_Add'] == 1]
ZERO_HOUR_data1

Unnamed: 0,ARI_CO,ARI_PO,SHIP_TYPE_CATEGORY,DIST,ID,BREADTH,BUILT,DEADWEIGHT,DEPTH,DRAUGHT,...,month,day,hour,minute,weekday,IS_WEEKEND,DIST_ZERO,u*v_speed,CI_HOUR_Add,CI_HOUR_500
0,155.873621,155.873621,80.827501,30.881018,24710,30.0,24,24300,10.0,10.0,...,12,17,21,29,85.866161,False,1,0.395712,1,0
4,155.873621,155.873621,80.827501,27.037650,911,50.0,10,116000,20.0,10.0,...,1,26,7,51,108.000334,False,1,3.296559,1,0
5,123.572883,100.838391,115.653917,49.953585,18068,40.0,7,183000,20.0,20.0,...,3,5,18,36,129.740341,False,1,6.728715,1,0
6,102.273391,100.963836,80.827501,42.276281,731,20.0,30,6800,10.0,10.0,...,12,11,3,0,108.000334,False,1,0.395712,1,0
9,75.597910,60.384307,80.827501,101.521598,17774,50.0,7,124000,30.0,20.0,...,11,30,19,29,129.740341,False,1,0.395712,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391929,155.873621,155.873621,80.827501,7.304235,16354,40.0,9,51700,20.0,10.0,...,12,3,22,51,87.707819,False,1,3.914652,1,0
391932,75.597910,56.184249,115.653917,21.866691,20710,40.0,13,169000,20.0,20.0,...,6,1,15,27,99.167606,False,1,0.395712,1,0
391935,155.873621,155.873621,115.653917,5.884603,10196,10.0,12,3160,10.0,10.0,...,10,16,0,36,90.895777,False,1,1.173243,1,0
391936,136.563191,108.471666,115.653917,70.660241,8823,30.0,8,60300,20.0,10.0,...,3,23,22,35,87.707819,False,1,8.699063,1,0


In [10]:
target = 'CI_HOUR'
x = ZERO_HOUR_data1.drop(labels=[target], axis = 1)
x = x.drop(labels='CI_HOUR_Add', axis = 1)
x = x.drop(labels='CI_HOUR_500', axis = 1)
y = ZERO_HOUR_data1.loc[:, target]
y

0           3.450000
4         253.554444
5          68.391389
6          31.700556
9          58.193056
             ...    
391929     23.914444
391932     89.884167
391935    144.061389
391936     41.482222
391937      7.485278
Name: CI_HOUR, Length: 225655, dtype: float64

In [11]:
#모델 2_2
x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns)

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .2, random_state=42)
# LightGBM 모델 초기화
model2_2 = lgb.LGBMRegressor(
    boosting_type='gbdt',  # Gradient Boosting Decision Tree
    num_leaves=31,         # 트리의 최대 리프 노드 수
    learning_rate=0.05,    # 학습률
    n_estimators=1000       # 트리의 개수 (반복 횟수)
)

# 모델 학습
model2_2.fit(x_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model2_2.predict(x_val)

# 평가 지표 계산 (예: 평균 제곱 오차, MSE)
mse = mean_squared_error(y_val, y_pred)
print("Mean Squared Error:", mse)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3160
[LightGBM] [Info] Number of data points in the train set: 180524, number of used features: 28
[LightGBM] [Info] Start training from score 68.681419
Mean Squared Error: 5978.282421618357


In [12]:
#모델 2_3을 위한 데이터 처리
ZERO_HOUR_data2 = train[train['CI_HOUR_Add'] == 2]
ZERO_HOUR_data2

# CI_HOUR가 0이 아닌 데이터 분할을 위한 전처리
target = 'CI_HOUR'
x = ZERO_HOUR_data2.drop(labels=[target], axis = 1)
x = x.drop(labels='CI_HOUR_Add', axis = 1)
x = x.drop(labels='CI_HOUR_500', axis = 1)
y = ZERO_HOUR_data2.loc[:, target]
y

28        1510.709167
35         795.939722
71        1608.799167
82         644.975278
125        671.185833
             ...     
391852     570.612778
391878    1017.009722
391930     994.503333
391931    1163.874722
391933    1095.597222
Name: CI_HOUR, Length: 9019, dtype: float64

In [13]:
#모델 2_3
x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns)

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .2, random_state=42)


In [14]:
#모델 2_3
# LightGBM 모델 초기화
model2_3 = lgb.LGBMRegressor(
    boosting_type='gbdt',  # Gradient Boosting Decision Tree
    num_leaves=31,         # 트리의 최대 리프 노드 수
    learning_rate=0.05,    # 학습률
    n_estimators=1000       # 트리의 개수 (반복 횟수)
)

# 모델 학습
model2_3.fit(x_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model2_3.predict(x_val)

# 평가 지표 계산 (예: 평균 제곱 오차, MSE)
mse = mean_squared_error(y_val, y_pred)
print("Mean Squared Error:", mse)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3082
[LightGBM] [Info] Number of data points in the train set: 7215, number of used features: 27
[LightGBM] [Info] Start training from score 970.274223
Mean Squared Error: 134327.26106952544


In [15]:
# 모델1으로 시간 0 , 1 분류
pred = model1.predict(test)

# 0으로 예측된 데이터 추출
x_val_0 = test[pred == 0]
# 1로 예측된 데이터 추출
x_val_1 = test[pred == 1]
# 2로 예측된 데이터 추출
x_val_2 = test[pred == 2]

#분류된 모델로 2_1 , 2_2
pred2_1 = model2_1.predict(x_val_0)
pred2_2 = model2_2.predict(x_val_1)
pred2_3 = model2_3.predict(x_val_2)

# 대기시간이 500 이하로 예측된 데이터 추출
#x_val_3_1 = x_val_1[pred2_2 == 0]
# 대기시간이 500 초과로 예측된 데이터 추출
#x_val_3_2 = x_val_1[pred2_2 == 1]

#분류된 모델로 3_1 , 3_2
#pred3_1 = model3_1.predict(x_val_3_1)
#pred3_2 = model3_2.predict(x_val_3_2)

In [16]:
df1 = x_val_0.assign(final_predictions =pred2_1)
df2 = x_val_1.assign(final_predictions =pred2_2)
df3 = x_val_2.assign(final_predictions =pred2_3 )

In [17]:

dataframes = [df1, df2, df3]
result = pd.concat(dataframes, axis=0)
result = result.sort_index()

# 'final_predictions' 열의 값이 음수인 경우 0으로 바꾸기
result['final_predictions'] = result['final_predictions'].apply(lambda x: max(0, x))

final_predictions = result['final_predictions']
final_predictions

0          12.948829
1          42.010740
2          11.477643
3          72.372878
4           0.000000
             ...    
220486    123.213556
220487     79.547309
220488     39.541528
220489      0.000000
220490     94.673629
Name: final_predictions, Length: 220491, dtype: float64

In [18]:
submit = pd.read_csv('drive/MyDrive/HD_AI Challenge/open/sample_submission.csv')
submit['CI_HOUR'] = final_predictions
submit.to_csv('drive/MyDrive/HD_AI Challenge/open/baseline_submit.csv', index=False)