In [4]:
import pandas as pd
import numpy as np
import joblib
import math
from keras.models import load_model
import random
import tensorflow as tf

import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Bidirectional
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the already-preprocessed train and test data.
# NOTE(review): joblib.load unpickles arbitrary objects — only load .pkl files from a trusted source.
df_train = joblib.load('df_train.pkl')
df_test = joblib.load('df_test.pkl')


In [7]:
# Work on a copy so that df_train itself is left untouched.
df2 = df_train.copy()

# Bi-directional LSTM 모델링(분류)

In [None]:
# Bi-directional LSTM classifier: two stacked BiLSTM layers, each followed by dropout,
# and a 4-way softmax output.
# NOTE(review): sequence_length, columns, X_train, y_train and class_weight_dict are not defined
# above this cell in the visible notebook — confirm they exist (and that y_train is one-hot
# encoded, as categorical_crossentropy expects) before running from a fresh kernel.
model = Sequential([
    Bidirectional(
        LSTM(32, activation='tanh', return_sequences=True),
        input_shape=(sequence_length, len(columns)),
    ),
    Dropout(0.3),
    Bidirectional(LSTM(16, activation='tanh')),
    Dropout(0.3),
    Dense(4, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping: give up after 5 epochs without a val_loss improvement and
# restore the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

# Train with class weights; validation_split=0.3 reserves 30% of the samples for validation.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.3,
    class_weight=class_weight_dict,
    callbacks=[early_stopping],
)

# Plot the per-epoch training history.
pd.DataFrame.from_dict(history.history).plot()

In [None]:
# Sliding-window builder for inference data (no target column involved).
def create_sequences_for_test(data, sequence_length):
    """Return every contiguous window of `sequence_length` rows taken from `data`.

    Windows advance one row at a time, so `len(data) - sequence_length + 1` windows
    are produced. When `data` has fewer rows than `sequence_length`, the result is an
    empty array.
    """
    last_start = len(data) - sequence_length
    windows = [data[start:start + sequence_length] for start in range(last_start + 1)]
    return np.array(windows)

# Build the inference windows from the test features.
X_test_sequences = create_sequences_for_test(test_data, sequence_length)

# Run the classifier; the softmax output is a probability distribution per window.
predictions = model.predict(X_test_sequences)

# argmax gives a 0-based position; +1 shifts it to a 1-based class label.
predicted_classes = predictions.argmax(axis=1) + 1

# Store the predictions from row sequence_length-1 onwards; the rows before it
# keep the NaN placeholder.
df_test['class'] = np.nan
first_predicted_row = sequence_length - 1
class_col = df_test.columns.get_loc('class')
df_test.iloc[first_predicted_row:, class_col] = predicted_classes

# Check the distribution of predicted classes.
df_test['class'].value_counts()

In [None]:
# NOTE(review): this binds pred_E to the same DataFrame object as df_test (no copy is made), so any
# later in-place change to df_test (e.g. rewriting its 'class' column) is also visible through pred_E.
pred_E = df_test

# Bi-directional LSTM 모델링(회귀)

In [None]:
# Draft regression model, disabled by wrapping it in a string literal; nothing in it is executed.
'''
# 양방향 LSTM 모델 구성
model = Sequential()
model.add(Bidirectional(LSTM(50, return_sequences=True), input_shape=(sequence_length, X_train.shape[-1])))
model.add(Dropout(0.2))  # Dropout 추가
model.add(Bidirectional(LSTM(50)))
model.add(Dropout(0.2))  # Dropout 추가
model.add(Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mse')

# Early Stopping 콜백 설정
early_stopping = EarlyStopping(monitor='val_loss', 
                               patience=3, 
                               verbose=1, # 로그출력
                               restore_best_weights=True)

# 모델 학습
history = model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=64, callbacks=[early_stopping])
'''

In [8]:
# Seed the Python, NumPy and TensorFlow random number generators with one shared value.
seed = 123
for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
    set_seed(seed)

# Sliding-window builder for supervised training data.
def create_sequences(data, sequence_length, target_column):
    """Split a 2-D array into (feature window, next-row target) training pairs.

    Parameters
    ----------
    data : array-like of shape (n_rows, n_columns)
        Rows in time order; one of the columns holds the target.
    sequence_length : int
        Number of consecutive rows in each feature window.
    target_column : int
        Column index of the target (negative indices allowed). This column is
        removed from the features, so the target never leaks into the inputs.

    Returns
    -------
    (sequences, targets) : tuple of np.ndarray
        `sequences[i]` holds the feature columns of rows `i .. i + sequence_length - 1`;
        `targets[i]` is the target value of the row immediately after that window
        (row `i + sequence_length`). There are `len(data) - sequence_length` pairs;
        both arrays are empty when `data` is too short.

    Raises
    ------
    IndexError
        If `target_column` is out of bounds for `data`.
    """
    data = np.asarray(data)
    # Drop exactly the target column. The previous `[:, :-1]` slice always dropped the last
    # column, which is only correct when the target happens to be last.
    features = np.delete(data, target_column, axis=1)

    n_windows = len(data) - sequence_length
    sequences = [features[i:i + sequence_length] for i in range(n_windows)]
    targets = [data[i + sequence_length, target_column] for i in range(n_windows)]
    return np.array(sequences), np.array(targets)

# Feature columns used by the model; 'vis1' is appended for the training data only.
columns = ['year', 'month', 'day', 'time', 'minute', 'stn_id', 'ws10_deg', 'ws10_ms', 'ta', 're', 'hm', 'sun10', 'ts', 'dew_point']
train_data = df2[columns + ['vis1']].values
test_data = df_test[columns].values

# Number of consecutive rows per window.
sequence_length = 6

# Training pairs: the trailing 'vis1' column is split off as the target.
X_train, y_train = create_sequences(train_data, sequence_length, -1)

# test_data has no target column, so it must not go through create_sequences: that call
# stripped 'dew_point' as if it were the target (13 features instead of the 14 used for
# training) and also skipped the final window. Window the full feature matrix directly.
X_test = np.array([
    test_data[start:start + sequence_length]
    for start in range(len(test_data) - sequence_length + 1)
])

assert X_test.shape[1:] == X_train.shape[1:], (
    "train and test windows must share the same (sequence_length, n_features) shape"
)

In [9]:
# Bi-directional LSTM regressor: two stacked BiLSTM layers, each followed by dropout,
# and a single linear output unit.
# NOTE(review): the training log recorded below this cell shows an MSE in the 1e8 range;
# presumably the vis1 target is unscaled — confirm and consider scaling it.
n_features = X_train.shape[-1]  # len(columns) would be an alternative here
model = Sequential([
    Bidirectional(
        LSTM(32, activation='tanh', return_sequences=True),
        input_shape=(sequence_length, n_features),
    ),
    Dropout(0.2),
    Bidirectional(LSTM(16, activation='tanh')),
    Dropout(0.2),
    Dense(1),
])
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

# Early stopping: give up after 3 epochs without a val_loss improvement (logged via verbose=1)
# and restore the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
    restore_best_weights=True,
)

# Train; validation_split=0.2 reserves 20% of the samples for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
1278/5919 [=====>........................] - ETA: 47s - loss: 736118272.0000 - mae: 22964.5859

In [None]:
# Plot the per-epoch values recorded in the training history.
history_frame = pd.DataFrame.from_dict(history.history)
history_frame.plot()
plt.show()

# Sliding-window builder for inference data (no target column involved).
def create_sequences_for_test(data, sequence_length):
    """Return every contiguous window of `sequence_length` rows taken from `data`.

    Windows advance one row at a time, so `len(data) - sequence_length + 1` windows
    are produced. When `data` has fewer rows than `sequence_length`, the result is an
    empty array.
    """
    last_start = len(data) - sequence_length
    windows = [data[start:start + sequence_length] for start in range(last_start + 1)]
    return np.array(windows)

# Build the inference windows from the test features
X_test_sequences = create_sequences_for_test(test_data, sequence_length)

# Run the trained model on every window to get the predicted vis1 values
predicted_vis1 = model.predict(X_test_sequences)

# Visibility-band classifier.
def classify_visibility(vis1):
    """Map a visibility value to its class label.

    Returns 1 for values below 200, 2 for values below 500, 3 for values below 1000,
    and 4 for everything else (including values that compare false against every
    bound, such as NaN).
    """
    upper_bounds = (200, 500, 1000)
    for label, bound in enumerate(upper_bounds, start=1):
        if vis1 < bound:
            return label
    return 4

# Convert every predicted vis1 value into its visibility class.
predicted_classes = np.array(list(map(classify_visibility, predicted_vis1.ravel())))

# Write the predictions into df_test['class'], starting at row sequence_length-1;
# rows outside that span keep the NaN placeholder.
df_test['class'] = np.nan
first_row = sequence_length - 1
class_col = df_test.columns.get_loc('class')
df_test.iloc[first_row:first_row + len(predicted_classes), class_col] = predicted_classes

# Rows that are still NaN each receive a random class drawn (with replacement) from 1, 2, 3.
# NOTE(review): class 4 is excluded from this draw — confirm that is intended.
nan_indices = df_test.index[df_test['class'].isna().to_numpy()]
num_nan = len(nan_indices)
random_classes = np.random.choice([1, 2, 3], size=num_nan, replace=True)
df_test.loc[nan_indices, 'class'] = random_classes

# Show the resulting class distribution.
print(df_test['class'].value_counts())

# 모델 합치기

In [None]:
#### 여기까지하고 지희한테 보내기

In [107]:
# Load fog_test.csv; the file's second column is used as the index (index_col=1).
df_original = pd.read_csv("fog_test.csv", index_col=1)

# Turn the -99 placeholder in the class column into NaN.
# The result is assigned back instead of calling replace(..., inplace=True) on the selected
# column: that chained in-place form operates on a temporary under pandas Copy-on-Write
# (and warns in recent pandas 2.x), leaving df_original unchanged.
df_original['fog_test.class'] = df_original['fog_test.class'].replace(-99, float('nan'))

In [108]:
# Load the region-C predictions from CSV, using the first column as the index
# (the region-A load below is currently commented out).
# NOTE(review): this is an absolute, machine-specific Windows path — the cell will not run on
# another machine; consider a path relative to a configurable data directory.
# pred_A = pd.read_csv('C:\\Users\\Hong_PC\\Documents\\카카오톡 받은 파일\\240371.csv', index_col=0)
pred_C = pd.read_csv('C:\\Users\\Hong_PC\\Documents\\카카오톡 받은 파일\\240371_2번지역.csv', index_col=0)

In [109]:
# Copy only the predicted class values into df_original, region by region. A row's region is
# the first character of its station id, so take care to pair each prefix with the right frame.
# Regions A, B and D are currently disabled; add their (prefix, frame) pairs below to re-enable
# them (the disabled region-A line read the 'fog_test.class' column instead of 'class').
station_region = df_original['fog_test.stn_id'].str[0]
for region, region_pred in (('C', pred_C), ('E', pred_E)):
    df_original.loc[station_region == region, 'fog_test.class'] = region_pred['class'].values


In [110]:
# Display the merged frame to check the filled-in class column.
df_original

Unnamed: 0_level_0,Unnamed: 0,fog_test.month,fog_test.day,fog_test.time,fog_test.minute,fog_test.stn_id,fog_test.ws10_deg,fog_test.ws10_ms,fog_test.ta,fog_test.re,fog_test.hm,fog_test.sun10,fog_test.ts,fog_test.class
fog_test.year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
L,1,1,1,0,0,AI,329.5,0.6,-2.5,0.0,87.5,0.0,-1.6,
L,2,1,1,0,10,AI,321.8,1.2,-2.5,0.0,88.2,0.0,-1.6,
L,3,1,1,0,20,AI,0.4,0.4,-2.5,0.0,88.6,0.0,-1.7,
L,4,1,1,0,30,AI,323.6,0.7,-2.6,0.0,88.7,0.0,-1.6,
L,5,1,1,0,40,AI,208.4,0.2,-2.6,0.0,89.0,0.0,-1.6,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L,262796,12,31,23,10,ED,338.8,6.1,4.5,0.0,71.3,0.0,1.9,4.0
L,262797,12,31,23,20,ED,335.2,5.8,4.6,0.0,71.4,0.0,1.9,4.0
L,262798,12,31,23,30,ED,337.1,5.3,4.5,0.0,72.1,0.0,1.8,4.0
L,262799,12,31,23,40,ED,322.0,3.7,4.3,0.0,72.5,0.0,1.4,4.0


In [111]:
# Any row still missing a class is filled with 4.
# The result is assigned back instead of calling fillna(..., inplace=True) on the selected
# column: that chained in-place form operates on a temporary under pandas Copy-on-Write
# (and warns in recent pandas 2.x), leaving df_original unchanged.
df_original['fog_test.class'] = df_original['fog_test.class'].fillna(4)

In [112]:
# Check the final class distribution.
df_original['fog_test.class'].value_counts()

4.0    262519
1.0       205
3.0        50
2.0        26
Name: fog_test.class, dtype: int64

In [102]:
# Write the final frame, index included, to 240371.csv.
# NOTE(review): this cell's execution count (102) is lower than those of the cells above it
# (107-112), so the saved file may predate them — re-run the notebook top to bottom before
# relying on the file.
df_original.to_csv('240371.csv', index=True)