# 생육 기간 예측 프로젝트

## 개요

- 한 쌍의 이미지를 입력 값으로 받아 작물의 생육 기간을 예측하는 모델 개발
  - 현재는 성장 기간 예측만 진행하지만 회차가 진행되며 환경 변수를 추가로 제공할 예정
- <a href='https://dacon.io/competitions/official/235851/overview/description' target='_blank'>데이콘</a> 데이터 사용
  - 훈련용 이미지 : 총 753개
    - 청경채 : 353개
    - 적상추 : 400개
  - 테스트 이미지 : 총 307개
    - 청경채 : 139개
    - 적상추 : 168개
  - 작물별 이미지 2장씩을 조합하여 2장의 이미지간 경과일 기준으로 학습 및 평가 진행
- 모델 평가 기준 : RMSE(Root Mean Squared Error)

## 구글 드라이브 연결

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 사용 Package 선언

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch import optim
from torch import nn
from torchvision.transforms import ToTensor
from torchvision import transforms
from tqdm.auto import tqdm

import os
import random
import pandas as pd
import numpy as np
from glob import glob
from PIL import Image

## 기본 세팅

#### - seed 고정

In [None]:
# Fix every random seed
def seed_everything(seed):
  """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

  Args:
    seed: Integer seed handed to every generator.
  """
  seeders = (
      random.seed,
      np.random.seed,
      torch.manual_seed,
      torch.cuda.manual_seed,
      torch.cuda.manual_seed_all,  # covers the multi-GPU case
  )
  for apply_seed in seeders:
    apply_seed(seed)

  # Give up cuDNN autotuning in exchange for run-to-run reproducibility.
  cudnn = torch.backends.cudnn
  cudnn.deterministic = True
  cudnn.benchmark = False


random_seed = 2048
seed_everything(random_seed)

#### - 파일 Path

 - 데이터 경로는 본인의 환경에 맞추어 변경

In [None]:
# Dataset locations (change them to match your own environment).
# NOTE: both values end with '/' because later code appends sub-paths by plain string concatenation.
train_path = '/content/drive/MyDrive/Colab Notebooks/DL PROJECT/Data/train_dataset/'
test_path = '/content/drive/MyDrive/Colab Notebooks/DL PROJECT/Data/test_dataset/'

#### - 모델 특성

In [None]:
# Training configuration
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU when one is available
lr = 1e-5  # learning rate given to the optimizer
epochs = 10  # number of passes over the training set
batch_size = 64  # batch size of the train and test loaders
valid_batch_size = 50  # batch size of the validation loader

## 데이터 관련 함수 정의 및 데이터셋 선언

In [None]:
# 각 파일에서 날짜 추출(Train 데이터)
def extract_day(file_name):
  day = int(file_name.split('.')[-2][-2:])
  return day

In [None]:
# Build the array of days that matches a list of image paths
def make_day_array(image_pathes):
  """Return a NumPy array with `extract_day` of every path, in input order."""
  days = list(map(extract_day, image_pathes))
  return np.array(days)

In [None]:
# Build the image path lists
def make_image_path_array(root_path=None):
  """Collect the .png paths of both species plus a per-directory label for each image.

  Args:
    root_path: Prefix (including the trailing '/') of the folder that holds
      `BC_RESIZE/` and `LT_RESIZE/`. When None, `./BC/` and `./LT/` under the
      current working directory are scanned instead.

  Returns:
    A tuple `(bc_image_path, lt_image_path, bc_dir, lt_dir)` of lists. The
    `*_dir` lists are aligned with the path lists and hold labels such as
    'bc0', 'bc1', ... / 'lt0', 'lt1', ... numbered by sorted directory order.
  """
  # Locate the sub-directories of each species.
  if root_path is None:
    bc_directories = glob('./BC/*')
    lt_directories = glob('./LT/*')
  else:
    bc_directories = glob(root_path + 'BC_RESIZE/*')
    lt_directories = glob(root_path + 'LT_RESIZE/*')

  def _collect(directories, prefix):
    """Return (image paths, directory labels) for one species."""
    image_paths = []
    dir_labels = []
    # glob() yields entries in arbitrary order; sorting keeps the labels and the
    # row order identical between runs so the seeded shuffle stays reproducible.
    for dir_idx, directory in enumerate(sorted(directories)):
      images = sorted(glob(directory + '/*.png'))
      image_paths.extend(images)
      dir_labels.extend([prefix + str(dir_idx)] * len(images))
    return image_paths, dir_labels

  bc_image_path, bc_dir = _collect(bc_directories, 'bc')
  lt_image_path, lt_dir = _collect(lt_directories, 'lt')

  return bc_image_path, lt_image_path, bc_dir, lt_dir

In [None]:
# Build the per-image dataframe
def make_dataframe(root_path=None):
  """Return one dataframe describing every image of both species.

  Args:
    root_path: Forwarded unchanged to `make_image_path_array`.

  Returns:
    DataFrame with the columns `file_name`, `day`, `dir` and `species`
    ('bc' rows first, then 'lt' rows) and a fresh 0..n-1 index.
  """
  bc_paths, lt_paths, bc_dirs, lt_dirs = make_image_path_array(root_path)

  def species_frame(paths, dirs, species):
    # Path, day parsed from the path, directory label, then the species tag.
    frame = pd.DataFrame({
        "file_name": paths,
        "day": make_day_array(paths),
        "dir": dirs,
    })
    return frame.assign(species=species)

  frames = [
      species_frame(bc_paths, bc_dirs, 'bc'),
      species_frame(lt_paths, lt_dirs, 'lt'),
  ]
  return pd.concat(frames).reset_index(drop=True)

In [None]:
# Turn the dataframe into (before, after, delta) rows by
# enumerating every image pair inside each directory
def make_combination(species, data_frame):
  """Build all ordered image pairs of one species whose day strictly increases.

  Args:
    species: Value of the `species` column to select (e.g. 'bc' or 'lt').
    data_frame: DataFrame with `species` and `dir` columns whose first column
      is the image path and whose second column is the day.

  Returns:
    DataFrame with the columns `before_file_path`, `after_file_path`,
    `time_delta` (after day minus before day, always > 0) and `species`.
  """
  before_file_path = []
  after_file_path = []
  time_delta = []

  species_dirs = data_frame[data_frame['species'] == species]['dir'].unique()
  for dir_name in tqdm(species_dirs):
    # Filter once per directory instead of three times per candidate pair.
    dir_rows = data_frame[data_frame['dir'] == dir_name]
    file_names = dir_rows.iloc[:, 0].tolist()
    days = [int(day) for day in dir_rows.iloc[:, 1]]

    # Every row may act as "before": rows are not guaranteed to be sorted by
    # day, so skipping the last row could silently drop valid pairs.
    for before_name, before_day in zip(file_names, days):
      for after_name, after_day in zip(file_names, days):
        if after_day > before_day:
          before_file_path.append(before_name)
          after_file_path.append(after_name)
          time_delta.append(after_day - before_day)

  combination_df = pd.DataFrame({
      'before_file_path' : before_file_path,
      'after_file_path' : after_file_path,
      'time_delta' : time_delta,
  })

  combination_df['species'] = species

  return combination_df

In [None]:
# Dataset definition
class KistDataset(Dataset):
  """Dataset of (before, after) image pairs listed in a combination dataframe.

  Args:
    combination_df: DataFrame with `before_file_path` and `after_file_path`
      columns, plus a `time_delta` column unless `is_test` is truthy.
    is_test: When truthy, items are `(before, after)` tensors without a label
      and without random augmentation. Otherwise items are
      `(before, after, time_delta)` with random augmentation applied.
  """

  def __init__(self, combination_df, is_test=None):
    self.combination_df = combination_df
    # PIL image -> tensor
    self.transform = transforms.Compose([
      transforms.ToTensor()
    ])
    # Channel normalization (the mean/std commonly used with ImageNet-pretrained
    # torchvision models); it has to be applied at inference time as well.
    self.normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    # Training pipeline: normalization followed by random augmentation
    self.transform2 = transforms.Compose([
      self.normalize,
      transforms.RandomHorizontalFlip(p=0.5),
      transforms.RandomVerticalFlip(p=0.5),
      transforms.RandomAffine((-20, 20)),
      transforms.RandomRotation(degrees=(0, 90)),
    ])
    self.is_test = is_test

  def _load_image(self, path):
    """Open `path` as a 3-channel image and return it as a tensor."""
    # The context manager releases the file handle; convert() loads the pixel
    # data and guarantees the three channels that Normalize expects.
    with Image.open(path) as image:
      return self.transform(image.convert('RGB'))

  def __getitem__(self, idx):
    row = self.combination_df.iloc[idx]
    before_image = self._load_image(row['before_file_path'])
    after_image = self._load_image(row['after_file_path'])

    if self.is_test:
      # Bug fix: test images used to skip normalization, so the model saw a
      # different input distribution at inference than during training.
      return self.normalize(before_image), self.normalize(after_image)

    before_image = self.transform2(before_image)
    after_image = self.transform2(after_image)
    time_delta = row['time_delta']
    return before_image, after_image, time_delta

  def __len__(self):
    return len(self.combination_df)

## 이미지 Resize

- torchvision의 transforms을 통해 resize가 가능하다.
- 하지만, 이미지의 크기가 커서 학습 진행마다 resize하는 시간이 길게 걸린다.
- 따라서, `mobilenet_v2`의 입력 사이즈인 (224, 224)로 훈련, 테스트 데이터를 조절한다.
  - **추후 모델 변경시 입력 사이즈에 맞게 조절한다.**
- 필요에 따라 아래 코드 실행

#### - Resize 디렉토리 생성

In [None]:
# Train bc_resize 디렉토리
# os.mkdir(train_path+'BC_RESIZE')
# for bc in os.listdir(train_path+'BC/'):
#     os.mkdir(train_path+'BC_RESIZE/'+bc)

In [None]:
# Train lt_resize 디렉토리
# os.mkdir(train_path+'LT_RESIZE')
# for lt in os.listdir(train_path+'LT/'):
#     os.mkdir(train_path+'LT_RESIZE/'+lt)

In [None]:
# Test bc_resize 디렉토리
# os.mkdir(test_path+'BC_RESIZE')
# for dir in os.listdir(test_path+'BC/'):
#     os.mkdir(test_path+'BC_RESIZE/'+dir)

In [None]:
# Test lt_resize 디렉토리
# os.mkdir(test_path+'LT_RESIZE')
# for dir in os.listdir(test_path+'LT/'):
#     os.mkdir(test_path+'LT_RESIZE/'+dir)

#### - Resize 이미지 저장

In [None]:
# Train bc resize 저장
# for bc in glob(train_path+'BC/*'):
#     bc_num = bc[-5:]
#     print(bc_num)
#     for img in os.listdir(train_path+'BC/'+bc_num):
#         img_re = Image.open(train_path+'BC/'+bc_num+'/'+img)
#         img_re = img_re.resize((224, 224))
#         img_re.save(train_path+'BC_RESIZE/'+bc_num+'/'+img)

In [None]:
# Train lt resize 저장
# for lt in glob(train_path+'LT/*'):
#     lt_num = lt[-5:]
#     print(lt_num)
#     for img in os.listdir(train_path+'LT/'+lt_num):
#         img_re = Image.open(train_path+'LT/'+lt_num+'/'+img)
#         img_re = img_re.resize((224, 224))
#         img_re.save(train_path+'LT_RESIZE/'+lt_num+'/'+img)

In [None]:
# Test bc resize 저장
# for dir in os.listdir(test_path+'BC/'):
#     print(dir)
#     for img in os.listdir(test_path+'BC/'+ dir):
#         img_re = Image.open(test_path+'BC/'+dir+'/'+img)
#         img_re = img_re.resize((224, 224))
#         img_re.save(test_path+'BC_RESIZE/'+dir+'/'+img)

In [None]:
# Test lt resize 저장
# for dir in os.listdir(test_path+'LT/'):
#     print(dir)
#     for img in os.listdir(test_path+'LT/'+ dir):
#         img_re = Image.open(test_path+'LT/'+dir+'/'+img)
#         img_re = img_re.resize((224, 224))
#         img_re.save(test_path+'LT_RESIZE/'+dir+'/'+img)

## 모델 선언

### - 모델 구현

In [None]:
import torch
from torch import nn
from torchvision.models import mobilenet_v2

# Regress a single growth-period value from one image
class CompareCNN(nn.Module):
  """MobileNetV2 followed by a linear layer mapping its 1000 outputs to one value."""

  def __init__(self):
    super().__init__()
    # NOTE(review): recent torchvision releases prefer `weights=` over
    # `pretrained=` — confirm against the installed version.
    self.mobile_net = mobilenet_v2(pretrained=True)
    self.fc_layer = nn.Linear(1000, 1)

  def forward(self, input):
    """Return the regression output for a batch of images."""
    features = self.mobile_net(input)
    return self.fc_layer(features)

# Score the before and after images with separate networks and output their difference
class CompareNet(nn.Module):
  """Pair of independent `CompareCNN` networks whose outputs are subtracted."""

  def __init__(self):
    super().__init__()
    self.before_net = CompareCNN()
    self.after_net = CompareCNN()

  def forward(self, before_input, after_input):
    """Return `before_net(before_input) - after_net(after_input)`."""
    before_score = self.before_net(before_input)
    after_score = self.after_net(after_input)
    return before_score - after_score

### - 데이터 준비

- 전체 훈련용 데이터 생성

In [None]:
# Dataframe describing every training image
total_dataframe = make_dataframe(root_path=train_path)
# Training pairs for bok choy ('bc')
bc_combination = make_combination('bc', total_dataframe)
# Training pairs for red lettuce ('lt')
lt_combination = make_combination('lt', total_dataframe)

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

- 청경채 6812개, 적상추 7801개
- 총 14613개

In [None]:
# Show the shape of each species' pair table and the total number of pairs
bc_combination.shape, lt_combination.shape, len(bc_combination) + len(lt_combination)

((6812, 4), (7801, 4), 14613)

- 데이터를 sample을 통해 무작위로 섞어준다.

In [None]:
# Shuffle the rows: drawing len(df) rows with a fixed random_state yields a seeded permutation
bc_combination = bc_combination.sample(len(bc_combination), random_state=random_seed)
lt_combination = lt_combination.sample(len(lt_combination), random_state=random_seed)

- Train : Validation = 0.9 : 0.1
  - 청경채(bc) : Train(6132 개), Valid(680 개)
  - 적상추(lt) : Train(7021 개), Valid(780 개)

In [None]:
# Number of training rows per species (train : valid = 0.9 : 0.1)
bc_train_size = 6132
# Bug fix: this was 7201, which left only 600 of the 7801 rows for validation
# instead of the intended 780.
lt_train_size = 7021

# Split bok choy pairs into train / valid
bc_train = bc_combination.iloc[:bc_train_size]
bc_valid = bc_combination.iloc[bc_train_size:]
# Split red lettuce pairs into train / valid
lt_train = lt_combination.iloc[:lt_train_size]
lt_valid = lt_combination.iloc[lt_train_size:]

# Merge both species into a single train set and a single valid set
train_set = pd.concat([bc_train, lt_train])
valid_set = pd.concat([bc_valid, lt_valid])

# Wrap the dataframes as datasets
train_dataset = KistDataset(train_set)
valid_dataset = KistDataset(valid_set)

# Data loaders; only the training loader reshuffles every epoch
train_data_loader = DataLoader(train_dataset,
                               batch_size = batch_size,
                               shuffle = True)
valid_data_loader = DataLoader(valid_dataset,
                               batch_size = valid_batch_size)

### - 모델 구성

In [None]:
# Instantiate the model and move it to the selected device
model = CompareNet().to(device)
# Adam optimizer over all model parameters
optimizer = optim.Adam(model.parameters(), lr=lr)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth


  0%|          | 0.00/13.6M [00:00<?, ?B/s]

### - 모델 학습

In [None]:
# Train for `epochs` epochs; after each epoch report the validation MAE and
# overwrite the checkpoint file with the current weights.
for epoch in tqdm(range(epochs)):
  # Dropout / BatchNorm layers must be in training mode while fitting.
  model.train()
  # Wrap the loader itself (not enumerate) so tqdm knows the number of steps.
  for step, (before_image, after_image, time_delta) in enumerate(tqdm(train_data_loader)):
    before_image = before_image.to(device)
    after_image = after_image.to(device)
    time_delta = time_delta.to(device)

    optimizer.zero_grad()
    logit = model(before_image, after_image)
    # Average over the samples actually in the batch; dividing by the nominal
    # batch size under-reported the loss of a short final batch.
    train_loss = torch.abs(logit.squeeze(1).float() - time_delta.float()).mean()
    train_loss.backward()
    optimizer.step()

    if step % 15 == 0:
      print('\n=====================loss=====================')
      print(f'\n=====================EPOCH:{epoch}=====================')
      print(f'\n=====================step:{step}=====================')
      print('MAE_loss : ', train_loss.detach().cpu().numpy())

  # Validation: switch to inference behavior (no dropout, running BatchNorm stats).
  model.eval()
  abs_error_sum = 0.0
  sample_count = 0
  with torch.no_grad():
    for valid_before, valid_after, time_delta in tqdm(valid_data_loader):
      valid_before = valid_before.to(device)
      valid_after = valid_after.to(device)
      valid_time_delta = time_delta.to(device)

      logit = model(valid_before, valid_after)
      # Accumulate the absolute error and the sample count so the reported
      # value is the true per-sample MAE even when the last batch is short.
      abs_error_sum += torch.abs(logit.squeeze(1).float() - valid_time_delta.float()).sum().item()
      sample_count += valid_time_delta.numel()

  print(f'VALIDATION_LOSS MAE : {abs_error_sum / max(sample_count, 1)}')
  checkpoint = {
        'model': model.state_dict(),
  }
  torch.save(checkpoint, 'checkpoint_128.pt')

  0%|          | 0/10 [00:00<?, ?it/s]

0it [00:00, ?it/s]




MAE_loss :  15.491607



MAE_loss :  15.346284



MAE_loss :  15.71446



MAE_loss :  14.994125



MAE_loss :  14.544198



MAE_loss :  12.850525



MAE_loss :  13.526178



MAE_loss :  12.397675



MAE_loss :  16.06599



MAE_loss :  13.107269



MAE_loss :  13.738126



MAE_loss :  14.410624



MAE_loss :  13.796077



MAE_loss :  11.2327


  0%|          | 0/26 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 12.465831756591797


0it [00:00, ?it/s]




MAE_loss :  9.802456



MAE_loss :  12.949774



MAE_loss :  7.703787



MAE_loss :  9.167379



MAE_loss :  8.409913



MAE_loss :  6.286083



MAE_loss :  7.405017



MAE_loss :  3.0826778



MAE_loss :  2.8092084



MAE_loss :  2.811863



MAE_loss :  2.1500654



MAE_loss :  2.217747



MAE_loss :  2.06371



MAE_loss :  2.9219496


  0%|          | 0/26 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 3.401888370513916


0it [00:00, ?it/s]




MAE_loss :  2.6669827



MAE_loss :  3.6220388



MAE_loss :  2.4307826



MAE_loss :  2.0418124



MAE_loss :  2.4349785



MAE_loss :  3.1147974



MAE_loss :  2.0330858



MAE_loss :  2.3527312



MAE_loss :  1.9440337



MAE_loss :  2.024662



MAE_loss :  2.1239066



MAE_loss :  2.7107701



MAE_loss :  2.435749



MAE_loss :  2.022049


  0%|          | 0/26 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 2.9643445014953613


0it [00:00, ?it/s]




MAE_loss :  3.0423875



MAE_loss :  2.5096054



MAE_loss :  2.3270442



MAE_loss :  1.8153615



MAE_loss :  1.7874441



MAE_loss :  2.0214887



MAE_loss :  1.8226719



MAE_loss :  2.0799582



MAE_loss :  1.9494612



MAE_loss :  2.422011



MAE_loss :  1.8624972



MAE_loss :  1.5478035



MAE_loss :  1.6978226



MAE_loss :  1.6150848


  0%|          | 0/26 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 2.651411533355713


0it [00:00, ?it/s]




MAE_loss :  1.7848313



MAE_loss :  1.81567



MAE_loss :  1.7705257



MAE_loss :  2.111545



MAE_loss :  1.7883557



MAE_loss :  2.0035446



MAE_loss :  2.1275454



MAE_loss :  1.6934255



MAE_loss :  1.8397739



MAE_loss :  2.178767



MAE_loss :  1.7215583



MAE_loss :  2.2546625



MAE_loss :  1.9911007



MAE_loss :  1.9768764


  0%|          | 0/26 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 3.049731731414795


0it [00:00, ?it/s]




MAE_loss :  1.8699377



MAE_loss :  1.9603734



MAE_loss :  1.6233087



MAE_loss :  1.5003939



MAE_loss :  1.9289318



MAE_loss :  1.2158666



MAE_loss :  1.4702995



MAE_loss :  2.313602



MAE_loss :  2.4969556



MAE_loss :  1.2733371



MAE_loss :  1.4022565



MAE_loss :  2.0750132



MAE_loss :  1.4252915



MAE_loss :  1.2750878


  0%|          | 0/26 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 3.078289270401001


0it [00:00, ?it/s]




MAE_loss :  2.1586692



MAE_loss :  1.7993351



MAE_loss :  2.1376352



MAE_loss :  1.5696335



MAE_loss :  1.5012294



MAE_loss :  1.5131962



MAE_loss :  1.8878493



MAE_loss :  1.6041307



MAE_loss :  2.1127987



MAE_loss :  2.2041159



MAE_loss :  1.8921112



MAE_loss :  2.2509294



MAE_loss :  1.4352578



MAE_loss :  1.3551966


  0%|          | 0/26 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 2.7115519046783447


0it [00:00, ?it/s]




MAE_loss :  2.5623717



MAE_loss :  5.557169



MAE_loss :  1.8346488



MAE_loss :  1.3889068



MAE_loss :  1.7108493



MAE_loss :  1.5137851



MAE_loss :  1.8523412



MAE_loss :  1.9347569



MAE_loss :  1.7641785



MAE_loss :  1.9609947



MAE_loss :  1.7382106



MAE_loss :  2.1781595



MAE_loss :  1.864669



MAE_loss :  1.9391097


  0%|          | 0/26 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 2.623138427734375


0it [00:00, ?it/s]




MAE_loss :  1.6890792



MAE_loss :  1.5732067



MAE_loss :  1.2429699



MAE_loss :  1.4488509



MAE_loss :  2.5181081



MAE_loss :  1.7194632



MAE_loss :  1.3018043



MAE_loss :  2.8432662



MAE_loss :  2.3118522



MAE_loss :  1.4888415



MAE_loss :  1.915811



MAE_loss :  1.7156537



MAE_loss :  1.8762546



MAE_loss :  1.4426095


  0%|          | 0/26 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 2.7431488037109375


0it [00:00, ?it/s]




MAE_loss :  1.5507158



MAE_loss :  1.2578428



MAE_loss :  1.6003792



MAE_loss :  1.5257568



MAE_loss :  1.4748442



MAE_loss :  1.4502277



MAE_loss :  1.7389481



MAE_loss :  1.4230248



MAE_loss :  1.4503875



MAE_loss :  1.5414975



MAE_loss :  1.4076717



MAE_loss :  1.5956385



MAE_loss :  1.7285793



MAE_loss :  1.6107864


  0%|          | 0/26 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 2.5961647033691406


## 테스트 데이터 예측

### - 테스트 데이터 준비

In [None]:
# Load the list of test pairs
test_set = pd.read_csv(test_path + 'test_data.csv')
# Directory of each image: the 2nd underscore-separated token of its name gives the
# '<token>_RESIZE' folder and the 3rd token gives the sub-directory inside it
test_set['l_root'] = test_set['before_file_path'].map(lambda x : test_path + x.split('_')[1] + '_RESIZE/' + x.split('_')[2])
test_set['r_root'] = test_set['after_file_path'].map(lambda x : test_path + x.split('_')[1] + '_RESIZE/' + x.split('_')[2])
# Replace the bare image names with full paths to the resized .png files
test_set['before_file_path'] = test_set['l_root'] + '/' + test_set['before_file_path'] + '.png'
test_set['after_file_path'] = test_set['r_root'] + '/' + test_set['after_file_path'] + '.png'
# Test-mode dataset (items carry no label) and its loader
test_dataset = KistDataset(test_set, is_test = True)
test_data_loader = DataLoader(test_dataset, batch_size = batch_size)

### - 테스트 데이터 예측

In [None]:
# Predict the time delta of every test pair
test_value = []
# Bug fix: put Dropout / BatchNorm into inference mode; without this the
# predictions are stochastic and depend on the composition of each batch.
model.eval()
with torch.no_grad():
  for test_before, test_after in tqdm(test_data_loader):
    test_before = test_before.to(device)
    test_after = test_after.to(device)
    logit = model(test_before, test_after)
    value = logit.squeeze(1).detach().cpu().float()

    # Appends one 0-d tensor per sample
    test_value.extend(value)

  0%|          | 0/62 [00:00<?, ?it/s]

In [None]:
# Show the number of predictions and the first five values
len(test_value), test_value[:5]

(3960,
 [tensor(10.4355),
  tensor(17.8528),
  tensor(-7.4413),
  tensor(-4.3439),
  tensor(14.7588)])

- 결과는 간단하게 numpy.array형태로 저장해둔다.

In [None]:
# Convert the list of 0-d tensors into a single NumPy array
test_value = np.array(test_value)

In [None]:
# Write the predictions to 'test_value.npy' in the current working directory
np.save('test_value.npy', test_value)

In [None]:
# Read the saved file back to check the round trip
t = np.load('test_value.npy')

In [None]:
# Compare the first five reloaded values with the in-memory ones
t[:5], test_value[:5]

(array([10.435497 , 17.852795 , -7.4413376, -4.343938 , 14.758828 ],
       dtype=float32),
 array([10.435497 , 17.852795 , -7.4413376, -4.343938 , 14.758828 ],
       dtype=float32))