## preprocessing

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Dense, Flatten, MaxPooling2D
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras import Input

import numpy as np
import matplotlib.pyplot as plt

In [None]:
import splitfolders
import os   # 리눅스를 파이썬에서 쓰고 싶을 때
import cv2
import matplotlib.cm as cm
import pickle
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from sklearn.model_selection import train_test_split
import pandas as pd

## 데이터 로드 및 탐색

In [None]:
# Delete the files inside every class folder of the train/val/test split
# (the folders themselves are kept) — presumably to clear the output of an
# earlier run before the split is regenerated below.
!rm /content/dataset/test/casual/*
!rm /content/dataset/test/dandy/*
!rm /content/dataset/test/sports/*
!rm /content/dataset/train/casual/*
!rm /content/dataset/train/dandy/*
!rm /content/dataset/train/sports/*
!rm /content/dataset/val/casual/*
!rm /content/dataset/val/dandy/*
!rm /content/dataset/val/sports/*

## 데이터 로드

In [None]:
# Extract the image archive quietly (-q); the next cell reads the images
# from /content/all, so the archive is assumed to unpack there — TODO confirm.
!unzip -q all.zip

### 데이터 전처리

In [None]:
# Root folder of the raw images.
folder_path = '/content/all'
# Every entry of the root folder is treated as a class label by the cells below.
label_names = os.listdir(folder_path)

# Bare expression so the notebook displays the list.
label_names

['dandy', 'sports', 'casual']

In [None]:
# Map each label (an entry of folder_path) to the full paths of the files
# stored in that label's folder.
dataset = {
    label: [
        folder_path + '/' + label + '/' + filename
        for filename in os.listdir(folder_path + '/' + label + '/')
    ]
    for label in os.listdir(folder_path)
}

### resize, reshape

In [None]:
# Output root for the resized/padded images written by the loop below.
!mkdir resized

In [None]:
# One sub-folder per class label; the resize loop writes to resized/<label>/<file name>.
!mkdir resized/casual
!mkdir resized/dandy
!mkdir resized/sports

In [None]:
# Side length, in pixels, of the square images written to resized/<label>/.
TARGET_SIZE = 224

# Scale every image so that its longer side is TARGET_SIZE pixels (keeping the
# aspect ratio), centre it on a black TARGET_SIZE x TARGET_SIZE canvas and save
# it under the same file name in resized/<label>/.
for label, filenames in dataset.items():
    for filename in filenames:
        img = cv2.imread(filename)
        if img is None:
            # cv2.imread returns None instead of raising when a file cannot be
            # decoded; report and skip it rather than crash on img.shape.
            print('skipping unreadable image: {0}'.format(filename))
            continue

        ############### scale the longer side to TARGET_SIZE ###############
        # img.shape = (h, w, c) = (height, width, color)
        percent = TARGET_SIZE / max(img.shape[0], img.shape[1])

        # The same factor on both axes preserves the aspect ratio, so after
        # this call neither side is larger than TARGET_SIZE.
        img = cv2.resize(img, dsize=(0, 0), fx=percent, fy=percent, interpolation=cv2.INTER_LINEAR)
        h, w = img.shape[0], img.shape[1]

        ############### zero(black) padding ###############
        # Offsets that centre the picture: half of the unused space per axis,
        # i.e. [ margin/2 | picture | margin/2 ].
        # Each offset is clamped on its own so a negative value on one axis
        # can never leave the other axis unclamped.
        w_x = max((TARGET_SIZE - w) / 2, 0)
        h_y = max((TARGET_SIZE - h) / 2, 0)

        M = np.float32([[1, 0, w_x], [0, 1, h_y]])  # 2x3 translation matrix
        # Translate onto the square canvas; the uncovered border stays black.
        img_re = cv2.warpAffine(img, M, (TARGET_SIZE, TARGET_SIZE))

        # Keep the original file name, grouped by label.
        cv2.imwrite('resized/{0}/{1}'.format(label, filename.split("/")[-1]), img_re)

In [None]:
splitfolders.ratio('resized', output='dataset', seed=77, ratio=(0.6, 0.2, 0.2))
# ratio is given in (train, val, test) order.
# The class folders under 'resized' are split automatically; the later cells
# read the result from the train, val and test folders under 'dataset'.

Copying files: 8551 files [00:00, 9567.14 files/s] 


In [None]:
# Training split: map each label (class folder name) to the full paths of
# the files inside that folder.
folder_path = '/content/dataset/train'
dataset_train = {
    label: [
        folder_path + '/' + label + '/' + filename
        for filename in os.listdir(folder_path + '/' + label + '/')
    ]
    for label in os.listdir(folder_path)
}

# dataset_train

In [None]:
# Test split: map each label (class folder name) to the full paths of the
# files inside that folder.
folder_path = '/content/dataset/test'
dataset_test = {
    label: [
        folder_path + '/' + label + '/' + filename
        for filename in os.listdir(folder_path + '/' + label + '/')
    ]
    for label in os.listdir(folder_path)
}

# dataset_test

In [None]:
# Validation split: map each label (class folder name) to the full paths of
# the files inside that folder.
folder_path = '/content/dataset/val'
dataset_val = {
    label: [
        folder_path + '/' + label + '/' + filename
        for filename in os.listdir(folder_path + '/' + label + '/')
    ]
    for label in os.listdir(folder_path)
}

# dataset_val

## label to index (dictionary로 지정)

In [None]:
# Integer class index for each label name; a label's position in the tuple
# is its index.
label2index = {name: index for index, name in enumerate(('dandy', 'sports', 'casual'))}

In [None]:
# Read every training image into an array (x_train) and record the integer
# class index of its label (y_train); both lists follow the same order.
x_train = [
    cv2.imread(filename)  # file path -> image array
    for filenames in dataset_train.values()
    for filename in filenames
]
y_train = [
    label2index[label]  # label name -> class index
    for label, filenames in dataset_train.items()
    for _ in filenames
]

In [None]:
# Read every test image into an array (x_test) and record the integer class
# index of its label (y_test); both lists follow the same order.
x_test = [
    cv2.imread(filename)  # file path -> image array
    for filenames in dataset_test.values()
    for filename in filenames
]
y_test = [
    label2index[label]  # label name -> class index
    for label, filenames in dataset_test.items()
    for _ in filenames
]

In [None]:
# Read every validation image into an array (x_val) and record the integer
# class index of its label (y_val); both lists follow the same order.
x_val = [
    cv2.imread(filename)  # file path -> image array
    for filenames in dataset_val.values()
    for filename in filenames
]
y_val = [
    label2index[label]  # label name -> class index
    for label, filenames in dataset_val.items()
    for _ in filenames
]

## array로 형변환

In [None]:
# array로 형변환
x_train, y_train = np.array(x_train), np.array(y_train)
x_test, y_test = np.array(x_test), np.array(y_test)
x_val, y_val = np.array(x_val), np.array(y_val)

In [None]:
# Convert the image arrays to float32 before the mean image is subtracted from
# them in place further down. Each name is rebound right after its own
# conversion, so only one split is held in both dtypes at a time.
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_val = x_val.astype('float32')

In [None]:
# Bare expression: displays the shapes of all six arrays as a sanity check.
x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_val.shape, y_val.shape

((6384, 224, 224, 3),
 (6384,),
 (1713, 224, 224, 3),
 (1713,),
 (1709, 224, 224, 3),
 (1709,))

In [None]:
# Number of training files per class, to check the class balance.
len(dataset_train['casual']), len(dataset_train['sports']), len(dataset_train['dandy'])

(2418, 1922, 2044)

## 증강 (Augmentation)

In [None]:
# Random transformations the generator applies when it produces images.
augmentation_options = {
    'rotation_range': 40,       # rotate by a random angle
    'width_shift_range': 0.2,   # random horizontal shift within the given range
    'height_shift_range': 0.2,  # random vertical shift within the given range
    'shear_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
    'vertical_flip': True,
}
datagen = ImageDataGenerator(**augmentation_options)

In [None]:
# Point folder_path back at the training split: the augmentation cell below
# reads from and writes into this folder.
folder_path = '/content/dataset/train'
# Bare expression so the notebook displays the path.
folder_path

'/content/dataset/train'

In [None]:
###################### 각 라벨별 증강 ######################
for label in  os.listdir(folder_path):
    label_path = folder_path + '/' + label + '/'
    for filename in os.listdir(label_path): 
        filepath = label_path + filename

        img = load_img(filepath)
        x = img_to_array(img)
        x = x.reshape((1,) + x.shape)    

        i = 0           
        for batch in datagen.flow(x, batch_size=1,
                                save_to_dir=label_path, save_prefix=label, save_format='jpg'):
            i += 1
            if i > 2:  
                break  

## Zero Centering

In [None]:
def zero_mean(image):
    """Return the mean of *image* taken along its first axis.

    Despite the name, nothing is centred here: the function only computes
    the mean that callers subtract from their data afterwards.
    """
    mean_along_first_axis = np.mean(image, axis=0)
    return mean_along_first_axis

In [None]:
# Mean image computed over the first axis of the training images only.
zero_mean_img = zero_mean(x_train)

In [None]:
# Subtract the training mean image from every training image, in place.
x_train -= zero_mean_img

In [None]:
# Validation and test images are shifted by the same training mean image,
# not by means of their own.
x_val -= zero_mean_img
x_test -= zero_mean_img