In [1]:
import pandas as pd
import numpy as np
import joblib
import math
from keras.models import load_model

import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Bidirectional
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the already-preprocessed training and test frames from disk.
# NOTE(review): joblib.load unpickles the file contents — only load files from a trusted source.
df_train = joblib.load('df_train.pkl')
df_test = joblib.load('df_test.pkl')


In [None]:
# Work on a copy so that df_train itself is not modified by later steps.
df2 = df_train.copy()

# LightGBM 적용

In [4]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import random

# Seed Python's and NumPy's global random generators with the same value
# so that anything drawing from them is repeatable between runs.
seed = 123
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(seed)

# Visibility-band classifier
def classify_visibility(vis1):
    """Map a visibility value to its class label.

    Bands (upper bound exclusive):
        vis1 < 200    -> 1
        vis1 < 500    -> 2
        vis1 < 1000   -> 3
        otherwise     -> 4

    A value that compares False against every bound (e.g. NaN) falls
    through to class 4.
    """
    upper_bounds = (200, 500, 1000)
    for label, bound in enumerate(upper_bounds, start=1):
        if vis1 < bound:
            return label
    return len(upper_bounds) + 1

# Feature columns used for both training and prediction.
columns = [
    'year', 'month', 'day', 'time', 'minute', 'stn_id',
    'ws10_deg', 'ws10_ms', 'ta', 're', 'hm', 'sun10', 'ts', 'dew_point',
]

# Training features and target (the raw `vis1` value) come from df2;
# only the feature columns are taken from df_test.
X_train, y_train = df2[columns], df2['vis1']
X_test = df_test[columns]

# Wrap the training features and target in a LightGBM dataset.
train_data = lgb.Dataset(data=X_train, label=y_train)

# Booster configuration: GBDT regression, evaluated with RMSE, logging silenced.
params = dict(
    objective='regression',
    metric='rmse',
    boosting='gbdt',
    learning_rate=0.1,
    num_leaves=31,
    verbose=-1,
)

# Train for a fixed 1000 boosting rounds; no validation set or early-stopping
# callback is passed.
model = lgb.train(params=params, train_set=train_data, num_boost_round=1000)

# Predict vis1 for the test features.
# NOTE(review): no early stopping is configured above, so `best_iteration`
# presumably holds no tuned value and all trained rounds are used — confirm
# against the LightGBM documentation.
predicted_vis1 = model.predict(X_test, num_iteration=model.best_iteration)

# Convert every predicted vis1 value into its visibility class (1-4).
predicted_classes = np.array([classify_visibility(vis) for vis in predicted_vis1])

# Store the labels on df_test with a single direct column assignment.
# The previous version first created the column as NaN and then wrote through
# `.loc[:, 'class']`; depending on the pandas version that either replaced the
# column (integer labels) or wrote in place into the float column (4.0, 1.0, ...).
# Direct assignment yields integer labels on every version.
df_test['class'] = predicted_classes

# No fallback for missing labels is needed: classify_visibility returns a label
# for every input and every row of df_test has just been assigned one, so the
# old "fill NaN with a random class 1-3" step could never select any row.

# Show the class distribution of the predictions.
print(df_test['class'].value_counts())

4    52379
1      107
3       47
2       27
Name: class, dtype: int64


In [5]:
# df_test.to_csv('pred_E.csv') # 경로 설정후 지희한테 전달

In [6]:
# Alias, not a copy: pred_E and df_test refer to the same DataFrame object.
pred_E = df_test

In [7]:
# Display the prediction frame (features plus the new `class` column).
pred_E

Unnamed: 0,year,month,day,time,minute,stn_id,ws10_deg,ws10_ms,ta,re,hm,sun10,ts,dew_point,class
210241,3,1,1,0,0,4,0.741039,0.412844,0.437931,0.0,0.472579,0.0,0.086162,-19.467553,4
210242,3,1,1,0,10,4,0.778272,0.376147,0.434483,0.0,0.487748,0.0,0.080940,-19.467968,4
210243,3,1,1,0,20,4,0.799944,0.376147,0.431034,0.0,0.508751,0.0,0.078329,-19.467215,4
210244,3,1,1,0,30,4,0.763545,0.403670,0.434483,0.0,0.499417,0.0,0.080940,-19.465634,4
210245,3,1,1,0,40,4,0.754932,0.449541,0.434483,0.0,0.507585,0.0,0.080940,-19.464000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262796,3,12,31,23,10,4,0.941373,0.559633,0.520690,0.0,0.691949,0.0,0.177546,-19.340921,4
262797,3,12,31,23,20,4,0.931370,0.532110,0.524138,0.0,0.693116,0.0,0.177546,-19.337239,4
262798,3,12,31,23,30,4,0.936649,0.486239,0.520690,0.0,0.701284,0.0,0.174935,-19.339054,4
262799,3,12,31,23,40,4,0.894693,0.339450,0.513793,0.0,0.705951,0.0,0.164491,-19.345017,4


# 모델 합치기

In [None]:
#### 여기까지하고 지희한테 보내기

In [107]:
# Load fog_test.csv.
# NOTE(review): index_col=1 uses the second CSV column as the index, while the
# prediction file read further down uses index_col=0 — confirm that 1 is intended.
df_original = pd.read_csv("fog_test.csv", index_col=1)

# Turn the -99 placeholder in the class column into NaN.
# The result is assigned back to the frame instead of calling
# `replace(..., inplace=True)` on the selected column: that chained in-place
# form operates on an intermediate object, raises a warning on recent pandas
# and leaves df_original unchanged once Copy-on-Write is active.
df_original['fog_test.class'] = df_original['fog_test.class'].replace(-99, float('nan'))

In [108]:
# pred_A = pd.read_csv('C:\\Users\\Hong_PC\\Documents\\카카오톡 받은 파일\\240371.csv', index_col=0)
# Read the prediction file whose `class` column is merged into the 'C' stations below.
# NOTE(review): absolute, machine-specific Windows path — this cell only runs on
# the original author's machine; consider a path relative to a configurable data directory.
pred_C = pd.read_csv('C:\\Users\\Hong_PC\\Documents\\카카오톡 받은 파일\\240371_2번지역.csv', index_col=0)

In [109]:
# Copy each region's predicted classes into the matching rows of df_original.
# A row's region is the first character of `fog_test.stn_id`. Values are written
# by position (`.values`), so every prediction frame must contain exactly as many
# rows, in the same order, as the rows selected for its region.
# Only regions C and E are merged here. Regions A, B and D are disabled; to enable
# one, add its (letter, frame) pair below — note that the disabled line for A read
# the column `fog_test.class` from pred_A rather than `class`.
for _region, _pred in (('C', pred_C), ('E', pred_E)):
    df_original.loc[df_original['fog_test.stn_id'].str[0] == _region, 'fog_test.class'] = _pred['class'].values



In [110]:
# Display the frame after the region predictions have been written into `fog_test.class`.
df_original

Unnamed: 0_level_0,Unnamed: 0,fog_test.month,fog_test.day,fog_test.time,fog_test.minute,fog_test.stn_id,fog_test.ws10_deg,fog_test.ws10_ms,fog_test.ta,fog_test.re,fog_test.hm,fog_test.sun10,fog_test.ts,fog_test.class
fog_test.year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
L,1,1,1,0,0,AI,329.5,0.6,-2.5,0.0,87.5,0.0,-1.6,
L,2,1,1,0,10,AI,321.8,1.2,-2.5,0.0,88.2,0.0,-1.6,
L,3,1,1,0,20,AI,0.4,0.4,-2.5,0.0,88.6,0.0,-1.7,
L,4,1,1,0,30,AI,323.6,0.7,-2.6,0.0,88.7,0.0,-1.6,
L,5,1,1,0,40,AI,208.4,0.2,-2.6,0.0,89.0,0.0,-1.6,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L,262796,12,31,23,10,ED,338.8,6.1,4.5,0.0,71.3,0.0,1.9,4.0
L,262797,12,31,23,20,ED,335.2,5.8,4.6,0.0,71.4,0.0,1.9,4.0
L,262798,12,31,23,30,ED,337.1,5.3,4.5,0.0,72.1,0.0,1.8,4.0
L,262799,12,31,23,40,ED,322.0,3.7,4.3,0.0,72.5,0.0,1.4,4.0


In [111]:
# Any class that is still missing is set to 4.
# Assign the filled column back instead of `fillna(..., inplace=True)` on the
# selected column: the chained in-place form acts on an intermediate object,
# warns on recent pandas and does not update df_original under Copy-on-Write.
df_original['fog_test.class'] = df_original['fog_test.class'].fillna(4)

In [112]:
# Count how many rows fall into each class in the final `fog_test.class` column.
df_original['fog_test.class'].value_counts()

4.0    262519
1.0       205
3.0        50
2.0        26
Name: fog_test.class, dtype: int64

In [102]:
# Write the merged frame, including its index, to 240371.csv.
# NOTE(review): this cell's execution count (102) is lower than that of the merge
# and fill cells above (107-112), so the saved file may predate them — re-run this
# cell last, ideally after Restart & Run All.
df_original.to_csv('240371.csv', index=True)