# 생육 기간 예측 프로젝트

## 개요

- 한 쌍의 이미지를 입력 값으로 받아 작물의 생육 기간을 예측하는 모델 개발
  - 현재는 성장 기간 예측만 진행하지만 회차가 진행되며 환경 변수를 추가로 제공할 예정
- <a href='https://dacon.io/competitions/official/235851/overview/description' target='_blank'>데이콘</a> 데이터 사용
  - 훈련용 이미지 : 총 753개
    - 청경채 : 353개
    - 적상추 : 400개
  - 테스트 이미지 : 총 307개
    - 청경채 : 139개
    - 적상추 : 168개
  - 작물별 이미지 2장씩을 조합하여 2장의 이미지간 경과일 기준으로 학습 및 평가 진행
- 모델 평가 기준 : RMSE(Root Mean Squared Error)

## 구글 드라이브 연결

In [None]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 사용 Package 선언

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch import optim
from torch import nn
from torchvision.transforms import ToTensor
from torchvision import transforms
from tqdm.auto import tqdm

import os
import random
import pandas as pd
import numpy as np
from glob import glob
from PIL import Image

## 기본 세팅

In [None]:
# Fix the random seeds
def seed_everything(seed):
  """Seed every random number generator this notebook relies on.

  Applies the same seed to Python's `random`, NumPy and PyTorch (CPU, the
  current CUDA device and all CUDA devices), and sets cuDNN to deterministic
  mode with benchmark autotuning switched off.

  Args:
    seed: Integer seed applied to every generator.
  """
  random.seed(seed)
  np.random.seed(seed)

  # manual_seed_all covers the multi-GPU case.
  torch_seeders = (
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
  )
  for apply_seed in torch_seeders:
    apply_seed(seed)

  cudnn = torch.backends.cudnn
  cudnn.benchmark = False
  cudnn.deterministic = True

# Seed all generators once, before any data sampling or model creation.
seed_everything(2048)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
lr = 1e-5  # learning rate handed to the Adam optimizer
epochs = 10  # number of passes over the training DataLoader
batch_size = 64  # batch size of the training DataLoader
valid_batch_size = 50  # batch size of the validation DataLoader

##   
---
```
데이터 경로는 본인의 환경에 맞추어 변경
```
---
##  

## 데이터 관련 함수 정의 및 데이터셋 선언

In [None]:
# Extract the day from a file name (train data)
def extract_day(file_name):
  """Return the day number encoded at the end of an image file name.

  The name is split on '.', and the last two characters of the second-to-last
  piece (the text right before the extension) are parsed as an integer.

  Args:
    file_name: File name or path containing at least one '.'.

  Returns:
    The parsed day as an int.
  """
  pieces = file_name.split('.')
  stem = pieces[-2]
  day_digits = stem[-2:]
  return int(day_digits)

# Build the array of days for a list of image paths
def make_day_array(image_pathes):
  """Return a NumPy array holding the day extracted from each image path.

  Args:
    image_pathes: Iterable of image file paths, each understood by
      `extract_day`.

  Returns:
    np.ndarray with one day per input path, in input order.
  """
  days = []
  for path in image_pathes:
    days.append(extract_day(path))
  return np.array(days)

# Build the image path lists
def make_image_path_array(root_path = None):
  """Collect the .png image paths of the 'BC' and 'LT' image folders.

  With `root_path=None` the sub-directories of ./BC and ./LT are scanned.
  Otherwise the sub-directories of <root_path>/BC_RESIZE and
  <root_path>/LT_RESIZE are scanned. `root_path` may be given with or without
  a trailing path separator.

  Directories and files are visited in sorted order. `glob` itself makes no
  ordering promise, and the seeded pair sampling done later on these lists is
  only reproducible if the lists are always in the same order.

  Args:
    root_path: Directory that contains BC_RESIZE/ and LT_RESIZE/, or None to
      use ./BC and ./LT relative to the working directory.

  Returns:
    Tuple `(bc_image_path, lt_image_path)` of lists of .png file paths.
  """
  # Decide which directory patterns to scan
  if root_path is None:
    bc_pattern = './BC/*'
    lt_pattern = './LT/*'
  else:
    # os.path.join inserts the separator only when it is missing.
    bc_pattern = os.path.join(root_path, 'BC_RESIZE', '*')
    lt_pattern = os.path.join(root_path, 'LT_RESIZE', '*')

  def collect_pngs(directory_pattern):
    # Gather the .png files of every directory matching the pattern.
    png_paths = []
    for directory in sorted(glob(directory_pattern)):
      png_paths.extend(sorted(glob(os.path.join(directory, '*.png'))))
    return png_paths

  bc_image_path = collect_pngs(bc_pattern)
  lt_image_path = collect_pngs(lt_pattern)

  return bc_image_path, lt_image_path

# Build one dataframe row per image
def make_dataframe(root_path=None):
  """Build a dataframe listing every image with its day and species.

  Image paths come from `make_image_path_array(root_path)` and the matching
  days from `make_day_array`. The 'bc' rows are placed before the 'lt' rows
  and the index is renumbered from 0.

  Args:
    root_path: Forwarded unchanged to `make_image_path_array`.

  Returns:
    pd.DataFrame with the columns `file_name`, `day` and `species`
    ('bc' or 'lt').
  """
  bc_paths, lt_paths = make_image_path_array(root_path)

  per_species_frames = []
  for species_tag, paths in (('bc', bc_paths), ('lt', lt_paths)):
    # One frame per species: path + day, then the species label column.
    frame = pd.DataFrame({"file_name": paths,
                          "day" : make_day_array(paths)})
    frame['species'] = species_tag
    per_species_frames.append(frame)

  return pd.concat(per_species_frames).reset_index(drop=True)

# Turn the image dataframe into (before, after, delta) pairs
def make_combination(length, species, data_frame):
  """Randomly draw `length` image pairs of one species.

  Each draw samples two rows of the given species. The row with the smaller
  day becomes "before", the row with the larger day becomes "after", and
  `time_delta` is `after day - before day` (never negative). When both rows
  share the same day, the first sampled row is used for both sides and the
  delta is 0.

  Args:
    length: Number of pairs to generate.
    species: Value of the `species` column to draw from (e.g. 'bc', 'lt').
    data_frame: Frame with the columns `file_name`, `day` and `species`.

  Returns:
    pd.DataFrame with the columns `before_file_path`, `after_file_path`,
    `time_delta` and `species`, one row per pair.

  Raises:
    ValueError: From `DataFrame.sample` when `length > 0` and the species has
      fewer than two rows.
  """
  # The species subset is identical for every draw, so filter it once instead
  # of re-filtering the whole frame on each iteration.
  candidates = data_frame[data_frame['species'] == species]

  before_file_path = []
  after_file_path = []
  time_delta = []

  for _ in range(length):
    pair = candidates.sample(2)
    days = pair['day']
    # iloc[0] keeps the first sampled row when the two days tie.
    before = pair[days == days.min()].iloc[0]
    after = pair[days == days.max()].iloc[0]

    before_file_path.append(before['file_name'])
    after_file_path.append(after['file_name'])
    time_delta.append(int(after['day'] - before['day']))

  combination_df = pd.DataFrame({
      'before_file_path' : before_file_path,
      'after_file_path' : after_file_path,
      'time_delta' : time_delta,
  })

  combination_df['species'] = species

  return combination_df

# Dataset definition
class KistDataset(Dataset):
  """Dataset that serves before/after image pairs listed in a dataframe.

  Every row of `combination_df` must provide `before_file_path` and
  `after_file_path`. Unless the dataset is in test mode, it must also provide
  `time_delta`. Images are opened with PIL and converted with `ToTensor`
  only; no resizing is done at load time.

  Args:
    combination_df: Dataframe with one image pair per row.
    is_test: When truthy, items are `(before, after)` without a target.
  """
  def __init__(self, combination_df, is_test=None):
    self.is_test = is_test
    self.combination_df = combination_df
    # Tensor conversion is the only preprocessing step.
    pipeline_steps = [transforms.ToTensor()]
    self.transform = transforms.Compose(pipeline_steps)

  def __len__(self):
    """Return the number of image pairs."""
    return len(self.combination_df)

  def __getitem__(self, idx):
    """Load the pair at position `idx`.

    Returns:
      `(before, after)` in test mode, otherwise
      `(before, after, time_delta)`.
    """
    row = self.combination_df.iloc[idx]
    before_raw = Image.open(row['before_file_path'])
    after_raw = Image.open(row['after_file_path'])

    before_image = self.transform(before_raw)
    after_image = self.transform(after_raw)

    if not self.is_test:
      return before_image, after_image, row['time_delta']
    return before_image, after_image

## 이미지 Resize

- torchvision의 transforms을 통해 resize가 가능하다.
- 하지만, 이미지의 크기가 커서 학습 진행마다 resize하는 시간이 길게 걸린다.
- 따라서, `mobilenet_v2`의 입력 사이즈인 (224, 224)로 훈련, 테스트 데이터를 조절한다.
  - **추후 모델 변경시 입력 사이즈에 맞게 조절한다.**
- 필요에 따라 아래 코드 변경하여 실행
- Resize(224, 224) : <a href='https://drive.google.com/file/d/1YAiw-7hJP9PPy8oIslJuzMq9AMjK81XN/view?usp=sharing' target='_blank'>구글 드라이브</a>

In [None]:
# Train bc_resize 디렉토리
# train_root = './data/train_dataset/'
# os.mkdir(train_root+'BC_RESIZE')
# for bc in os.listdir(train_root+'BC/'):
#     os.mkdir(train_root+'BC_RESIZE/'+bc)

In [None]:
# Train lt_resize 디렉토리
# train_root = './data/train_dataset/'
# os.mkdir(train_root+'LT_RESIZE')
# for lt in os.listdir(train_root+'LT/'):
#     os.mkdir(train_root+'LT_RESIZE/'+lt)

In [None]:
# Test bc_resize 디렉토리
# test_root = './data/test_dataset/'
# os.mkdir(test_root+'BC_RESIZE')
# for dir in os.listdir(test_root+'BC/'):
#     os.mkdir(test_root+'BC_RESIZE/'+dir)

In [None]:
# Test lt_resize 디렉토리
# test_root = './data/test_dataset/'
# os.mkdir(test_root+'LT_RESIZE')
# for dir in os.listdir(test_root+'LT/'):
#     os.mkdir(test_root+'LT_RESIZE/'+dir)

In [None]:
# Train bc resize 저장
# train_root = './data/train_dataset/'
# for bc in glob(train_root+'BC/*'):
#     bc_num = bc[-5:]
#     print(bc_num)
#     for img in os.listdir(train_root+'BC/'+bc_num):
#         img_re = Image.open(train_root+'BC/'+bc_num+'/'+img)
#         img_re = img_re.resize((224, 224))
#         img_re.save(train_root+'BC_RESIZE/'+bc_num+'/'+img)

In [None]:
# Train lt resize 저장
# train_root = './data/train_dataset/'
# for lt in glob(train_root+'LT/*'):
#     lt_num = lt[-5:]
#     print(lt_num)
#     for img in os.listdir(train_root+'LT/'+lt_num):
#         img_re = Image.open(train_root+'LT/'+lt_num+'/'+img)
#         img_re = img_re.resize((224, 224))
#         img_re.save(train_root+'LT_RESIZE/'+lt_num+'/'+img)

In [None]:
# Test bc resize 저장
# test_root = './data/test_dataset/'
# for dir in os.listdir(test_root+'BC/'):
#     print(dir)
#     for img in os.listdir(test_root+'BC/'+ dir):
#         img_re = Image.open(test_root+'BC/'+dir+'/'+img)
#         img_re = img_re.resize((224, 224))
#         img_re.save(test_root+'BC_RESIZE/'+dir+'/'+img)

In [None]:
# Test lt resize 저장
# test_root = './data/test_dataset/'
# for dir in os.listdir(test_root+'LT/'):
#     print(dir)
#     for img in os.listdir(test_root+'LT/'+ dir):
#         img_re = Image.open(test_root+'LT/'+dir+'/'+img)
#         img_re = img_re.resize((224, 224))
#         img_re.save(test_root+'LT_RESIZE/'+dir+'/'+img)

## 모델 선언

### - 모델 구현

In [None]:
import torch
from torch import nn
from torchvision.models import mobilenet_v2

# Regress a single growth score from one image
class CompareCNN(nn.Module):
  """MobileNetV2 backbone followed by a linear layer producing one value.

  The backbone is created with `pretrained=True`. Its 1000-dimensional output
  is fed to `nn.Linear(1000, 1)`, so each input image yields one scalar.
  """
  def __init__(self):
    super().__init__()
    self.mobile_net = mobilenet_v2(pretrained=True)
    self.fc_layer = nn.Linear(1000, 1)

  def forward(self, input):
    """Return the (batch, 1) regression output for a batch of images."""
    return self.fc_layer(self.mobile_net(input))

# Score the before and after images and return their difference
class CompareNet(nn.Module):
  """Siamese-style wrapper: one `CompareCNN` scores both images.

  The same `before_net` instance (shared weights) is applied to the before
  and the after image, and the model returns `before score - after score`.

  NOTE(review): the training target built by `make_combination` is
  `after day - before day`, so with this sign convention the shared network
  presumably learns a score that decreases as the plant ages. Confirm before
  interpreting the raw per-image scores.
  """
  def __init__(self):
    super().__init__()
    # A single network is reused for both inputs; no separate after-network.
    self.before_net = CompareCNN()

  def forward(self, before_input, after_input):
    """Return `before_net(before_input) - before_net(after_input)`."""
    before_score = self.before_net(before_input)
    after_score = self.before_net(after_input)
    return before_score - after_score

### - 데이터 준비

In [None]:
# Dataframe of every training image (path, day, species)
total_dataframe = make_dataframe(root_path='/content/drive/MyDrive/Colab Notebooks/data/train_dataset/')
# Generate 5000 random 'bc' image pairs
bc_combination = make_combination(5000, 'bc', total_dataframe)
# Generate 5000 random 'lt' image pairs
lt_combination = make_combination(5000, 'lt', total_dataframe)
# Split 'bc' pairs: first 4500 for training, last 500 for validation
# NOTE(review): both parts are sampled from the same total_dataframe, so the
# same image files can appear in training and validation pairs.
bc_train = bc_combination.iloc[:4500]
bc_valid = bc_combination.iloc[4500:]
# Split 'lt' pairs: first 4500 for training, last 500 for validation
lt_train = lt_combination.iloc[:4500]
lt_valid = lt_combination.iloc[4500:]

# Concatenate both species into the final train / validation sets
train_set = pd.concat([bc_train, lt_train])
valid_set = pd.concat([bc_valid, lt_valid])

# Wrap the train / validation frames in Dataset objects
train_dataset = KistDataset(train_set)
valid_dataset = KistDataset(valid_set)

# Create the model on the selected device
model = CompareNet().to(device)
# Set up the optimizer
optimizer = optim.Adam(model.parameters(), lr=lr)

# Create the DataLoaders (only the training loader shuffles)
train_data_loader = DataLoader(train_dataset,
                               batch_size = batch_size,
                               shuffle = True)
valid_data_loader = DataLoader(valid_dataset,
                               batch_size = valid_batch_size)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth


  0%|          | 0.00/13.6M [00:00<?, ?B/s]

### - 모델 학습

In [None]:
# Train for `epochs` epochs; after each epoch report the validation MAE and
# overwrite the checkpoint file with the current weights.
for epoch in tqdm(range(epochs)):
  # Training mode: dropout active, batch-norm statistics updated.
  model.train()
  # Wrap the loader (not enumerate) so tqdm knows the number of batches.
  for step, (before_image, after_image, time_delta) in enumerate(tqdm(train_data_loader)):
    before_image = before_image.to(device)
    after_image = after_image.to(device)
    time_delta = time_delta.to(device)

    optimizer.zero_grad()
    logit = model(before_image, after_image)
    # Mean absolute error over the samples actually in this batch; dividing
    # by the configured batch size would under-scale the last partial batch.
    train_loss = torch.mean(torch.abs(logit.squeeze(1).float() - time_delta.float()))
    train_loss.backward()
    optimizer.step()

    if step % 15 == 0:
      print('\n=====================loss=====================')
      print(f'\n=====================EPOCH:{epoch}=====================')
      print(f'\n=====================step:{step}=====================')
      print('MAE_loss : ', train_loss.detach().cpu().numpy())

  # Evaluation mode: dropout disabled, batch-norm uses its running statistics
  # and is not updated by validation batches.
  model.eval()
  valid_abs_error = 0.0
  valid_count = 0
  with torch.no_grad():
    for valid_before, valid_after, time_delta in tqdm(valid_data_loader):
      valid_before = valid_before.to(device)
      valid_after = valid_after.to(device)
      valid_time_delta = time_delta.to(device)

      logit = model(valid_before, valid_after)
      # Accumulate the summed error and the sample count so the epoch MAE is
      # weighted correctly even if the last batch is smaller.
      valid_abs_error += torch.sum(torch.abs(logit.squeeze(1).float() - valid_time_delta.float())).item()
      valid_count += valid_time_delta.size(0)

  # max(..., 1) avoids a division by zero on an empty validation loader.
  print(f'VALIDATION_LOSS MAE : {valid_abs_error / max(valid_count, 1)}')
  checkpoint = {
        'model': model.state_dict(),
  }
  torch.save(checkpoint, 'checkpoint_128.pt')

  0%|          | 0/10 [00:00<?, ?it/s]

0it [00:00, ?it/s]




MAE_loss :  12.967581



MAE_loss :  12.482145



MAE_loss :  12.30397



MAE_loss :  13.949727



MAE_loss :  12.088702



MAE_loss :  13.679344



MAE_loss :  11.973982



MAE_loss :  12.530972



MAE_loss :  12.704079



MAE_loss :  10.220896


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 12.438105583190918


0it [00:00, ?it/s]




MAE_loss :  10.93224



MAE_loss :  10.138224



MAE_loss :  10.687585



MAE_loss :  9.787713



MAE_loss :  9.069434



MAE_loss :  8.692747



MAE_loss :  7.64851



MAE_loss :  5.189163



MAE_loss :  4.53138



MAE_loss :  3.6164427


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 6.218888759613037


0it [00:00, ?it/s]




MAE_loss :  2.6727195



MAE_loss :  2.753472



MAE_loss :  2.479315



MAE_loss :  3.6511116



MAE_loss :  2.7347198



MAE_loss :  2.7021549



MAE_loss :  2.984307



MAE_loss :  3.126693



MAE_loss :  1.7314959



MAE_loss :  1.7963612


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 5.0130414962768555


0it [00:00, ?it/s]




MAE_loss :  3.9647346



MAE_loss :  1.8545438



MAE_loss :  2.18733



MAE_loss :  1.7638526



MAE_loss :  1.5444818



MAE_loss :  1.4443831



MAE_loss :  1.5500358



MAE_loss :  1.6916814



MAE_loss :  1.2798468



MAE_loss :  1.507843


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 4.419803142547607


0it [00:00, ?it/s]




MAE_loss :  1.4585402



MAE_loss :  1.7654293



MAE_loss :  1.2347202



MAE_loss :  1.9013665



MAE_loss :  2.3086805



MAE_loss :  1.2403208



MAE_loss :  1.8034217



MAE_loss :  1.441324



MAE_loss :  1.1805241



MAE_loss :  1.4735377


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 4.214031219482422


0it [00:00, ?it/s]




MAE_loss :  1.6350124



MAE_loss :  1.1687597



MAE_loss :  1.517611



MAE_loss :  1.8568029



MAE_loss :  1.2754463



MAE_loss :  1.4637691



MAE_loss :  2.1572561



MAE_loss :  1.7904874



MAE_loss :  1.278141



MAE_loss :  1.4395111


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 4.91057825088501


0it [00:00, ?it/s]




MAE_loss :  2.1985383



MAE_loss :  1.7424774



MAE_loss :  1.413871



MAE_loss :  1.4778397



MAE_loss :  1.5158852



MAE_loss :  1.0137241



MAE_loss :  1.7735059



MAE_loss :  1.2304485



MAE_loss :  1.0698495



MAE_loss :  1.7174056


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 4.4719743728637695


0it [00:00, ?it/s]




MAE_loss :  1.6019961



MAE_loss :  1.1973047



MAE_loss :  1.1369879



MAE_loss :  1.386455



MAE_loss :  1.6231084



MAE_loss :  1.3759525



MAE_loss :  1.1650186



MAE_loss :  0.9681053



MAE_loss :  1.5737517



MAE_loss :  1.4050698


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 4.953733921051025


0it [00:00, ?it/s]




MAE_loss :  1.30844



MAE_loss :  1.7683239



MAE_loss :  1.6638706



MAE_loss :  2.1905046



MAE_loss :  1.0891696



MAE_loss :  1.7568222



MAE_loss :  1.2232933



MAE_loss :  1.1529934



MAE_loss :  1.3424315



MAE_loss :  1.6338558


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 4.678656578063965


0it [00:00, ?it/s]




MAE_loss :  1.2499087



MAE_loss :  1.8831718



MAE_loss :  1.8943495



MAE_loss :  0.91302574



MAE_loss :  2.0378098



MAE_loss :  1.1871966



MAE_loss :  2.441084



MAE_loss :  1.0238857



MAE_loss :  1.1413982



MAE_loss :  0.9215268


  0%|          | 0/20 [00:00<?, ?it/s]

VALIDATION_LOSS MAE : 4.601240158081055


## 테스트 데이터 예측

### - 테스트 데이터 준비

In [None]:
# Load the table of test pairs (columns before_file_path / after_file_path).
test_set = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/test_dataset/test_data.csv')
# Directory of each image: <test root>/<2nd '_' field>_RESIZE/<3rd '_' field>.
# NOTE(review): assumes the names in the CSV have at least three
# '_'-separated fields — confirm against test_data.csv.
test_set['l_root'] = test_set['before_file_path'].map(lambda x : '/content/drive/MyDrive/Colab Notebooks/data/test_dataset/' + x.split('_')[1] + '_RESIZE/' + x.split('_')[2])
test_set['r_root'] = test_set['after_file_path'].map(lambda x : '/content/drive/MyDrive/Colab Notebooks/data/test_dataset/' + x.split('_')[1] + '_RESIZE/' + x.split('_')[2])
# Replace the bare names with full paths: <directory>/<name>.png
test_set['before_file_path'] = test_set['l_root'] + '/' + test_set['before_file_path'] + '.png'
test_set['after_file_path'] = test_set['r_root'] + '/' + test_set['after_file_path'] + '.png'
# is_test=True makes the dataset yield (before, after) without a target.
test_dataset = KistDataset(test_set, is_test = True)
# No shuffling, so predictions stay in the row order of test_set.
test_data_loader = DataLoader(test_dataset, batch_size = 64)

### - 테스트 데이터 예측

In [None]:
# Predict the day difference for every test pair, in loader order.
test_value = []
# Inference must run in evaluation mode so dropout is disabled and batch-norm
# uses its running statistics instead of per-batch statistics.
model.eval()
with torch.no_grad():
  for test_before, test_after in tqdm(test_data_loader):
    test_before = test_before.to(device)
    test_after = test_after.to(device)
    logit = model(test_before, test_after)
    # (batch, 1) -> (batch,) on the CPU as float32
    value = logit.squeeze(1).detach().cpu().float()

    # Extending with a 1-D tensor appends one 0-dim tensor per sample.
    test_value.extend(value)

  0%|          | 0/62 [00:00<?, ?it/s]

In [None]:
len(test_value), test_value[:5]

(3960,
 [tensor(16.3052),
  tensor(23.0813),
  tensor(0.4986),
  tensor(0.3185),
  tensor(25.4677)])

- 결과는 간단하게 numpy.array형태로 저장해둔다.

In [None]:
# Convert the list of per-sample predictions into a single NumPy array.
test_value = np.array(test_value)

In [None]:
# Write the prediction array to test_value.npy in the working directory.
np.save('test_value.npy', test_value)

In [None]:
# Load the saved NumPy array back from disk
t = np.load('test_value.npy')

In [None]:
t[:5], test_value[:5]

(array([16.305199  , 23.081339  ,  0.49857903,  0.31848335, 25.467714  ],
       dtype=float32),
 array([16.305199  , 23.081339  ,  0.49857903,  0.31848335, 25.467714  ],
       dtype=float32))