# 모듈 불러오기

Ref1)
import gc
gc.collect()

중간중간에 불필요한 메모리를 정리해 준다. (https://blog.naver.com/pica4star/221443758311)

Ref2)
import pickle

pickle은 프로그램상에서 사용하고 있는 데이터를 파일형태로 저장한다.
(https://blog.naver.com/wjdwngkdsla/221978274816)
(https://blog.naver.com/mania9899/221624931960)

Ref3)
import seaborn as sns

seaborn은 시각화 라이브러리이다. 그리고 이걸 import 해 줘야 에폭 단위로 학습이 진행됐다.
(https://blog.naver.com/tkdzma8080/221793003678)




In [None]:
# 파일관리 및 파일선택

import os
import pickle
import random
import gc

# 시각화
import seaborn as sns
import matplotlib.pyplot as plt


import numpy as np
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import SeparableConv2D, Input, Conv2D, Add, BatchNormalization, concatenate, AveragePooling2D, add, MaxPooling2D, Conv2DTranspose, Activation, Dropout, ZeroPadding2D, LeakyReLU
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger


# Seed every random number generator used in this notebook (NumPy, Python's
# built-in `random`, and TensorFlow) with the same fixed value.
SEED = 30
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

# Evaluation metric 정의

In [None]:
def mae(y_true, y_pred):
    """Return the mean absolute error over entries whose true value is >= 0.1.

    Both arguments are converted to NumPy arrays and flattened, so any pair of
    array-likes with the same number of elements is accepted.  Entries whose
    true value is below 0.1 are ignored.  If no entry qualifies, the mean is
    taken over an empty selection and NumPy yields ``nan``.
    """
    truth = np.array(y_true).ravel()
    guess = np.array(y_pred).ravel()

    # Only score positions where the ground truth reaches the 0.1 threshold.
    scored = truth >= 0.1
    abs_err = np.abs(truth[scored] - guess[scored])
    return np.mean(abs_err)

def fscore(y_true, y_pred):
    """Return the F1 score of the binarised ">= 0.1" labels.

    Both arguments are converted to NumPy arrays and flattened.  Positions
    whose true value is negative are dropped first; the remaining true and
    predicted values are each turned into 0/1 labels (1 when the value is
    >= 0.1) and passed to ``f1_score``.
    """
    truth = np.array(y_true).ravel()
    guess = np.array(y_pred).ravel()

    # Negative ground-truth values are excluded from scoring.
    valid = truth >= 0
    truth_label = np.where(truth[valid] >= 0.1, 1, 0)
    guess_label = np.where(guess[valid] >= 0.1, 1, 0)

    return f1_score(truth_label, guess_label)

def maeOverFscore(y_true, y_pred):
    """Return ``mae(y_true, y_pred)`` divided by ``fscore(y_true, y_pred)``.

    A constant of 1e-07 is added to the denominator so that an F1 score of
    zero does not cause a division by zero.
    """
    error = mae(y_true, y_pred)
    f1 = fscore(y_true, y_pred)
    return error / (f1 + 1e-07)

# 데이터 전처리

Ref1)

pickle.dump protocol

파이썬 3.6을 쓴다면 프로토콜 4를 써야 할 것 같다.
프로토콜이 음수 또는 HIGHEST_PROTOCOL로 지정되면 사용 가능한 최고 프로토콜 버전이 사용됩니다.
(https://ko.coder.work/so/python/73422)

In [None]:
# Directories containing the raw training / test files read by make_dataset.
dir_train = 'data/train/'
dir_test = 'data/test/'
# Minimum number of pixels with a target value >= 0.1 that a training sample
# must have in order to be kept by make_dataset.
UPPER = 50

def _dump_pickle(obj, path):
    """Serialize *obj* to the file at *path* using pickle protocol 4."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=4)


def make_dataset(dir_train, dir_test, UPPER):
    """Convert directories of NumPy files into pickled train/test arrays.

    Every entry of ``dir_train`` is loaded with ``np.load`` and indexed as a
    3-D array whose last channel is the target.  A sample is kept only when
    the target sum is not negative and at least ``UPPER`` target pixels are
    >= 0.1.  The kept inputs (all channels but the last) and targets are
    written to ``data/train{UPPER}.pickle`` and ``data/train_y{UPPER}.pickle``.
    Every entry of ``dir_test`` is loaded unchanged and written to
    ``data/test.pickle``.

    Parameters
    ----------
    dir_train, dir_test : str
        Directories to read; a trailing path separator is optional.
    UPPER : int
        Minimum number of target pixels >= 0.1 for a training sample to be
        kept.

    Returns
    -------
    None
        The results are only written to disk (output paths are fixed under
        ``data/`` relative to the current working directory).
    """
    # train dataset
    train = []
    train_y = []

    for name in os.listdir(dir_train):
        # os.path.join works whether or not dir_train ends with a separator.
        npy = np.load(os.path.join(dir_train, name))
        target = npy[:, :, -1]

        # Drop samples with missing values, detected here as a negative sum
        # of the target channel.
        # NOTE(review): a few negative entries could be hidden by large
        # positive values; `(target < 0).any()` may be the intended check.
        if target.sum() < 0:
            continue

        # Keep only images with at least UPPER pixels whose target is >= 0.1.
        if (target >= 0.1).sum() >= UPPER:
            train.append(npy[:, :, :-1])
            train_y.append(target)

    train = np.array(train)
    train_y = np.array(train_y)

    # Persist the training arrays and release them before the test files are
    # loaded, so both sets are not held in memory at the same time.
    _dump_pickle(train, f'data/train{UPPER}.pickle')
    _dump_pickle(train_y, f'data/train_y{UPPER}.pickle')

    del train
    del train_y

    # test dataset
    # NOTE(review): os.listdir returns entries in arbitrary order; confirm
    # that whatever maps predictions back to files uses the same order.
    test = np.array(
        [np.load(os.path.join(dir_test, name)) for name in os.listdir(dir_test)]
    )

    _dump_pickle(test, 'data/test.pickle')
    del test
    
# Pass the UPPER constant instead of repeating its value as a literal, so the
# threshold is defined in one place.
make_dataset(dir_train, dir_test, UPPER)

# Data 불러오기

In [None]:
# Training inputs: load the pickled array and keep only channels 0-9.
with open('data/train50.pickle', 'rb') as f:
    train = pickle.load(f)[:, :, :, :10]

# Training targets: load, then add a trailing channel axis of size 1.
with open('data/train_y50.pickle', 'rb') as f:
    train_y = pickle.load(f)
train_y = np.reshape(train_y, (train_y.shape[0], 40, 40, 1))

# Test inputs: same channel selection as the training inputs.
with open('data/test.pickle', 'rb') as f:
    TEST = pickle.load(f)[:, :, :, :10]

# 탐색적 자료분석 (Exploratory Data Analysis)

In [None]:
# Build the summed "v" and "h" images: each is the pixel-wise sum of the
# corresponding group of channels.
def show_img(img):
    """Return *img* with the v-channel sum and h-channel sum appended.

    Parameters
    ----------
    img : numpy.ndarray
        Array indexed as (height, width, channel) with at least 9 channels.
        Any height and width are accepted.

    Returns
    -------
    numpy.ndarray
        A new array with two extra trailing channels: first the pixel-wise
        sum of channels 0, 2, 4, 5, 7 ("v"), then the pixel-wise sum of
        channels 1, 3, 6, 8 ("h").  Despite its name, this function does not
        display anything.
    """
    v_channels = (0, 2, 4, 5, 7)
    h_channels = (1, 3, 6, 8)

    # The built-in sum adds the channels one by one starting from 0; the new
    # axis turns each (height, width) sum into a (height, width, 1) channel
    # without assuming a fixed image size.
    ch15_v = sum(img[:, :, c] for c in v_channels)[..., np.newaxis]
    ch15_h = sum(img[:, :, c] for c in h_channels)[..., np.newaxis]

    return np.concatenate([img, ch15_v, ch15_h], -1)


# Pick one file from the training directory at random and append the summed
# v / h channels to it.
image_dir = os.listdir('data/train/')
image_sample = np.load(os.path.join('data/train/', random.choice(image_dir)))
image_sample = show_img(image_sample)

# Reversed 'RdBu' colormap.  plt.get_cmap is used because
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and later removed.
color_map = plt.get_cmap('RdBu')
color_map = color_map.reversed()
plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 10))

# Panels 1-9: channels 0-8 of the sample.
for i in range(9):
    plt.subplot(2, 6, i + 1)
    plt.imshow(image_sample[:, :, i], cmap=color_map)
    plt.title(f'ch_{i}', fontdict={'fontsize': 16})

# Panel 10: third channel from the end, i.e. the last channel of the raw file
# now that show_img has appended two channels; plotted as 'rain'.
plt.subplot(2, 6, 10)
plt.imshow(image_sample[:, :, -3], cmap=color_map)
plt.title('rain', fontdict={'fontsize': 16})

# Panels 11-12: the two channels appended by show_img.
plt.subplot(2, 6, 11)
plt.imshow(image_sample[:, :, -2], cmap=color_map)
plt.title('v_sum', fontdict={'fontsize': 16})

plt.subplot(2, 6, 12)
plt.imshow(image_sample[:, :, -1], cmap=color_map)
plt.title('h_sum', fontdict={'fontsize': 16})

plt.subplots_adjust(top=0.5)
plt.show()


In [None]:
# Flatten the inputs so that every pixel becomes one row and every channel
# one column.
train2 = train.reshape(-1, train.shape[3])
# Flatten the targets the same way and apply log(y + 1).
train_y2 = np.log(train_y.reshape(-1, train_y.shape[3]) + 1)
# Append the transformed target as the last column.
train2 = np.concatenate([train2, train_y2], -1)

df_corr = pd.DataFrame(train2).reset_index(drop=True)
del train2, train_y2

# Subsample the rows: start at row 400 and take every 1600th row after it
# (with 40x40 images that is one pixel per image), then renumber the index
# and give the columns readable names.
df_corr = (
    df_corr.iloc[400::1600]
    .reset_index(drop=True)
    .rename(columns=dict(enumerate(
        ['v1', 'h1', 'v2', 'h2', 'v3', 'v4', 'h4', 'v5', 'h5', 'surface', 'target']
    )))
)