# 서로 다른 모델과 분할된 데이터, 그 결과에 대한 병합
---

## 개요
승순님이 만들어주신 100개의 chunk파일을 적절하게 사용하여 최소 3개의 모델을 학습시킨다. 각 모델의 성능을 병합하는 앙상블 기법을 사용하여 최종 결과를 산출한다.  

## 목차
* 데이터 압축해제 및 확인하기
* 파이프라인 구축하기
* 모델 구조 설계하기
* 각각의 모델 학습하기
* 학습 결과 병합하기

### STEP 1. 데이터 압축해제 및 확인하기
승순님이 만들어 놓은 청크 파일을 압축해제하고, 파일이 어떻게 구성되어 있는지 확인한다.

In [1]:
import os
import zipfile

compressed_file_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data/simple_shuffle_data.zip'
gz_file_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data/compressed_simple'

In [2]:
def extract_zip(current_file_path, extract_file_path):
    with zipfile.ZipFile(current_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_file_path)
        
        
extract_zip(compressed_file_path, gz_file_path)

print('All Done!')

KeyboardInterrupt: 

In [None]:
import gzip
import shutil

def extract_gz(extract_file_path):
    for i in range(100):
        compressed_file_path = os.path.join(gz_file_path,f'train_k{i}.gz')
#         file_name = os.path.basename(compressed_file_path)
        extract_path = os.path.join(extract_file_path, os.path.splitext(file_name)[0])

        with gzip.open(compressed_file_path, 'rb') as f_in:
            with open(extract_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            
extract_to_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data/simple'

data = extract_gz(extract_to_path)

print('All Done!')

In [None]:
import pandas as pd
import os

file_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data/simple/train_k0'
data_df = pd.read_csv(file_path)
print(data_df.head())

# STEP 2. 파이프라인 구축하기
압축해제된 데이터들 중 학습에 사용할 데이터를 핸들링하여 모델에 바로 입력할 수 있도록 준비한다.

In [5]:
import os
import pandas as pd
import ast
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.inception_v3 import preprocess_input
import pickle

def data_pipeline(dir_path, pkl_path, file_idx):
    data_df = pd.DataFrame()
    
    data = pd.read_csv(os.path.join(dir_path, f'train_k{file_idx}'))
#     print(data.head())
    
#     drawing = []
#     for i in data['drawing']:
#         drawing.append(ast.literal_eval(i))
    
    for draw_idx in data['drawing']:
        img = Image.new('L', (256, 256), color='white')
        draw = ImageDraw.Draw(img)

        for stroke in ast.literal_eval(draw_idx):
            for i in range(len(stroke[0])-1):
                draw.line([stroke[0][i], stroke[1][i], stroke[0][i+1], stroke[1][i+1]], fill='black', width=3)

        img = img.resize((28, 28))

        image_array = img_to_array(img)
        image_array = preprocess_input(image_array)
        
        data_df = data_df.append({'img': tf.convert_to_tensor(image_array, dtype=tf.float32)}, ignore_index=True)
     
    
    print(data_df['img'].head())
       
    
    with open(pkl_path, 'rb') as f:
        label_names = pickle.load(f)
         
    for i in data['y']:
        data_df = data_df.append({'label': label_names[i]}, ignore_index=True)
#         labels.append(label_names[i])
        
    print(data['y'].head()


root_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data'
dir_path = os.path.join(root_path, 'simple')
pkl_path = os.path.join(root_path, 'label_names.pkl')
test = data_pipeline(dir_path, pkl_path, 50)

SyntaxError: invalid syntax (2595014484.py, line 50)

In [None]:
import os
import pandas as pd
import ast
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.inception_v3 import preprocess_input
import pickle

def data_pipeline(dir_path, pkl_path, file_idx):
    # 데이터와 레이블을 담을 리스트 생성
    data_list = []
    label_list = []

    # 데이터 읽기
    data = pd.read_csv(os.path.join(dir_path, f'train_k{file_idx}'))
    drawing = data['drawing'].apply(ast.literal_eval).tolist()  # drawing 열을 리스트로 변환
    
    with open(pkl_path, 'rb') as f:
        label_names = pickle.load(f)

    for draw_idx, label_idx in zip(drawing, data['y']):
        img = Image.new('L', (256, 256), color='white')
        draw = ImageDraw.Draw(img)

        for stroke in draw_idx:
            for i in range(len(stroke[0]) - 1):
                draw.line([stroke[0][i], stroke[1][i], stroke[0][i+1], stroke[1][i+1]], fill='black', width=3)

        img = img.resize((28, 28))

        image_array = img_to_array(img)
        image_array = preprocess_input(image_array)

        data_list.append(tf.convert_to_tensor(image_array, dtype=tf.float32))
        label_list.append(label_names[label_idx])

    # DataFrame 생성
    data_df = pd.DataFrame({'img': data_list, 'label': label_list})

    return data_df


root_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data'
dir_path = os.path.join(root_path, 'simple')
pkl_path = os.path.join(root_path, 'label_names.pkl')
test = data_pipeline(dir_path, pkl_path, 50)