# 0207-Data preprocessing
- 대용량 데이터를 효율적으로 처리할수 있는 파이프라인 구축 및 데이터 전처리 시도 1


# try 1 : pyconkr 2019 대용량 데이터 핸들링 관련 자료 참고
- 데이터 타입 변경

In [5]:
# import libraries
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

## 데이터 파일명 수정
- train_simplified 데이터 폴더명은 각 레이블로 되어있는데, \_없이 공백으로 파일명이 작성되어있음
    - e.g. alarm clock.csv
- 이 부분 전처리 필요

In [4]:
import zipfile

# Function to extract and rename folders
def extract_and_rename_folders(zip_file_path, extract_path):
    # Extract the zip file
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    
    # Get a list of folders in the extracted path
    extracted_folders = os.listdir(extract_path)
    
    # Iterate over the extracted folders
    for folder_name in extracted_folders:
        old_path = os.path.join(extract_path, folder_name)
        # Check if the folder name contains spaces
        if ' ' in folder_name:
            # New folder name with spaces removed
            new_folder_name = folder_name.replace(' ', '_')
            new_path = os.path.join(extract_path, new_folder_name)
            # Rename the folder
            os.rename(old_path, new_path)
            print(f"Renamed folder '{folder_name}' to '{new_folder_name}'")
        else:
            print(f"No spaces found in folder '{folder_name}', skipping renaming")

In [5]:
# Path to the train_simplified.zip file
train_simplified_zip_path = '../data/train_simplified.zip'
# Directory where you want to extract the zip file
extract_path = '../data/train_simplified'

# Extract and rename folders in the zip file
extract_and_rename_folders(train_simplified_zip_path, extract_path)

Renamed folder 'tennis racquet.csv' to 'tennis_racquet.csv'
No spaces found in folder 'spreadsheet.csv', skipping renaming
No spaces found in folder 'scissors.csv', skipping renaming
No spaces found in folder 'belt.csv', skipping renaming
No spaces found in folder 'whale.csv', skipping renaming
No spaces found in folder 'table.csv', skipping renaming
No spaces found in folder 'moustache.csv', skipping renaming
No spaces found in folder 'envelope.csv', skipping renaming
Renamed folder 'washing machine.csv' to 'washing_machine.csv'
No spaces found in folder 'camel.csv', skipping renaming
No spaces found in folder 'lighthouse.csv', skipping renaming
No spaces found in folder 'scorpion.csv', skipping renaming
No spaces found in folder 'pig.csv', skipping renaming
No spaces found in folder 'snake.csv', skipping renaming
No spaces found in folder 'stitches.csv', skipping renaming
No spaces found in folder 'trombone.csv', skipping renaming
No spaces found in folder 'cup.csv', skipping renamin

In [8]:
# 현재 작업 디렉토리의 주소를 얻음
current_directory = os.getcwd()

# 상위 폴더의 주소
ROOT_PATH = os.path.dirname(current_directory)

print('상위폴더 주소', ROOT_PATH)

상위폴더 주소 /aiffel/aiffel/Sidethon


In [12]:
TRAIN_PATH = ROOT_PATH + '/data/train_simplified'
print(TRAIN_PATH)

/aiffel/aiffel/Sidethon/data/train_simplified


In [10]:
TEST_PATH = ROOT_PATH + '/data/test_simplified.csv'
print(TEST_PATH)

/aiffel/aiffel/Sidethon/data/test_simplified.csv


In [13]:
train_filenames = tf.io.gfile.glob(TRAIN_PATH)
test_filename = tf.io.gfile.glob(TEST_PATH)

train_count = len(train_filenames)
test_count = len(test_filename)
print("train:", train_count)
print("test:", test_count)

train: 1
test: 1


In [18]:
print(train_filenames[:4])
print(test_filename)

['/aiffel/aiffel/Sidethon/data/train_simplified/snowman.csv', '/aiffel/aiffel/Sidethon/data/train_simplified/potato.csv', '/aiffel/aiffel/Sidethon/data/train_simplified/bear.csv', '/aiffel/aiffel/Sidethon/data/train_simplified/matches.csv']
['/aiffel/aiffel/Sidethon/data/test_simplified.csv']


- 각 csv 파일명이 변수에 담겨있음
- 각 파일에서 필요한 컬럼 정보만 추출해서 데이터프레임에 담고 이 데이터프레임으로 데이터셋을 만든다
    - 이 과정에서 각 데이터에 적합한 dtype을 찾아서 해당 타입으로 변환하는 방법을 시도해본다

## check_dtypes() 함수
- 각 컬럼 내 데이터의 최소, 최대 범위를 계산해 적절한 Data Type을 찾아내는 함수
- 각 컬럼의 데이터 형식을 모를때 자동으로 체크하여 사용하고 데이터를 불러올때 체크된 데이터 형식으로 데이터를 불러옴
- 형식을 지정하지 않으면 가장 메모리를 많이 차지하는 방식으로 데이터를 불러옴 -> 형식을 지정해서 데이터를 불러오면 메모리를 줄여서 큰 데이터도 불러올수 있음
    - 데이터 형식 크기 비교: Object > complex > datetime64, float64, int64 > float32, int32 > …

In [25]:
def check_dtypes(file_path):
    print(file_path)
    tmp = pd.read_csv(file_path, nrows=0)
    col_dtypes = {}
    print(tmp.columns)
    for col in tmp.columns:
        df = pd.read_csv(file_path, usecols=[col])
        dtype = df[col].dtype
        
        if dtype == 'int' or dtype == 'float':
            c_min = df[col].min()
            c_max = df[col].max()
        elif dtype == 'object':
            n_unique = df[col].nunique()
            threshold = n_unique / df.shape[0]
            
        if dtype == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                col_dtype = 'int8'
            elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:  
                col_dtype = 'uint8'  
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                col_dtype = 'int16'
            elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:  
                col_dtype = 'uint16'  
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                col_dtype = 'int32'
            elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:  
                col_dtype = 'uint32' 
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                col_dtype = 'int64'
            elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:  
                col_dtype = 'uint64'  
            
        elif dtype == 'float':
            # ERROR occured When using float32 in feather, parquet
#             if c_min > np.finfo(np.float18).min and c_max < np.finfo(np.float18).max:
#                 col_dtype = 'float18'
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                col_dtype = 'float32'
            else:
                col_dtype = 'float64'
                
        elif dtype == 'object':
            if threshold > 0.7 :
                col_dtype = 'object'
            else:
                col_dtype = 'category'
        
        col_dtypes[col] = col_dtype
    
    return col_dtypes

In [26]:
# 테스트 : train데이터 경로 한개를 가져와 테스트
col_dtypes = check_dtypes(train_filenames[0])
col_dtypes

/aiffel/aiffel/Sidethon/data/train_simplified/snowman.csv
Index(['countrycode', 'drawing', 'key_id', 'recognized', 'timestamp', 'word'], dtype='object')


{'countrycode': 'category',
 'drawing': 'object',
 'key_id': 'int64',
 'recognized': 'int64',
 'timestamp': 'object',
 'word': 'category'}

1. 원하는 컬럼만 추출 - key_id, drawing, word(label)
2. key_id int32형식으로 변환

---


# try 2 : tf.data의 AUTOTUNE, prefetch 기능 확인후 적용 시도

In [27]:
# # preprocess one csv file
# # Function to preprocess data and create a TensorFlow dataset
# def preprocess_data(file_path, columns_to_use, batch_size):
#     # Read CSV file with selected columns
#     df = pd.read_csv(file_path, usecols=columns_to_use)
    
#     # Convert 'object' columns to 'category' data type
#     for col in df.select_dtypes(include=['object']).columns:
#         df[col] = df[col].astype('category')
        
#     # Convert 'int64' columns to 'int32' data type
#     for col in df.select_dtypes(include=['int64']).columns:
#         df[col] = df[col].astype('int32')
    
#     # Convert DataFrame to TensorFlow dataset
#     dataset = tf.data.Dataset.from_tensor_slices(dict(df))
    
#     # Shuffle and batch the dataset
#     dataset = dataset.shuffle(buffer_size=10000).batch(batch_size)
    
#     # Prefetch and autotune for performance optimization
#     dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    
#     return dataset

In [28]:
# Function to preprocess data from multiple CSV files in a folder and create a TensorFlow dataset
def preprocess_data_from_folder(file_paths, columns_to_use, batch_size):
        # Initialize an empty list to store datasets for each CSV file
    datasets = []
    
    # Iterate over each CSV file
    for file_path in file_paths:
        # Read CSV file with selected columns
        df = pd.read_csv(file_path, usecols=columns_to_use)
        
        # Convert 'object' columns to 'category' data type
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = df[col].astype('category')
            
        # Convert 'int64' columns to 'int32' data type
        for col in df.select_dtypes(include=['int64']).columns:
            df[col] = df[col].astype('int32')
        
        # Convert DataFrame to TensorFlow dataset
        dataset = tf.data.Dataset.from_tensor_slices(dict(df))
        
        # Append the dataset to the list
        datasets.append(dataset)
    
    # Concatenate datasets from all CSV files into one dataset
    combined_dataset = tf.data.experimental.sample_from_datasets(datasets)
    
    # Shuffle and batch the combined dataset
    combined_dataset = combined_dataset.shuffle(buffer_size=10000).batch(batch_size)
    
    # Prefetch and autotune for performance optimization
    combined_dataset = combined_dataset.prefetch(tf.data.experimental.AUTOTUNE)
    
    return combined_dataset

In [None]:
# Specify the folder path, columns to use, and batch size
# folder_path = '/aiffel/aiffel/Sidethon/data/train_simplified/'
columns_to_use = ['key_id', 'drawing', 'word']
batch_size = 32

# Preprocess data from folder and create TensorFlow dataset
train_dataset = preprocess_data_from_folder(train_filenames, columns_to_use, batch_size)

# Iterate over the dataset (for demonstration purposes)
for batch in train_dataset.take(1):
    # Accessing a batch of data
    print("Batch:", batch)

커널이 죽는 문제 발생!

---

# try 3 : tf.data 방식 변경

아래 방식으로 변경
- `tf.data.Dataset.list_files` : to create a dataset of file paths in the folder
- `tf.data.experimental.CsvDataset` : to read and parse CSV records directly within the TensorFlow pipeline
- `tf.cast`: to optimize data types within the pipeline
- `num_parallel_calls` : to process each CSV file in parallel using map


In [22]:
def preprocess_data_from_folder(folder_path, columns_to_use, batch_size):
    # Create a dataset of file paths in the folder
    file_paths_dataset = tf.data.Dataset.list_files(folder_path + '/*.csv')
    
    # Function to process a single CSV file
    def process_file(file_path):
        # Read CSV file
        records = tf.data.experimental.CsvDataset(file_path, record_defaults=[tf.string]*len(columns_to_use), header=True)
        
        # Parse CSV records
        parsed_records = records.map(lambda *x: dict(zip(columns_to_use, x)))
        print(parsed_records)
        
        # Convert data types
        for col in parsed_records:
            if col in ['int64', 'int32']:
                parsed_records[col] = tf.cast(parsed_records[col], tf.int32)
            elif col == 'object':
                parsed_records[col] = tf.cast(parsed_records[col], tf.string)
            else:
                parsed_records[col] = tf.cast(parsed_records[col], tf.float32)
        
        return parsed_records
    
    # Process each CSV file in parallel
    datasets = file_paths_dataset.map(process_file, num_parallel_calls=tf.data.AUTOTUNE)
    
    # Concatenate datasets from all CSV files into one dataset
    combined_dataset = tf.data.experimental.sample_from_datasets(datasets)
    
    # Shuffle and batch the combined dataset
    combined_dataset = combined_dataset.shuffle(buffer_size=10000).batch(batch_size)
    
    # Prefetch for performance optimization
    combined_dataset = combined_dataset.prefetch(tf.data.AUTOTUNE)
    
    return combined_dataset

In [23]:
# Specify the folder path, columns to use, and batch size
columns_to_use = ['key_id', 'drawing', 'word']
batch_size = 32

# Preprocess data from folder and create TensorFlow dataset
train_dataset = preprocess_data_from_folder(TRAIN_PATH, columns_to_use, batch_size)

# Iterate over the dataset (for demonstration purposes)
for batch in train_dataset.take(1):
    # Accessing a batch of data
    print("Batch:", batch)

<MapDataset shapes: {key_id: (), drawing: (), word: ()}, types: {key_id: tf.string, drawing: tf.string, word: tf.string}>


TypeError: in user code:

    /tmp/ipykernel_152/1188280286.py:21 process_file  *
        parsed_records[col] = tf.cast(parsed_records[col], tf.float32)

    TypeError: 'MapDataset' object is not subscriptable


- process_file 함수 cast 방식변경

In [20]:
def preprocess_data_from_folder(folder_path, columns_to_use, batch_size):
    # Create a dataset of file paths in the folder
    file_paths_dataset = tf.data.Dataset.list_files(folder_path + '/*.csv')
    
    # Check if any files matched the pattern
    if file_paths_dataset.cardinality().numpy() == 0:
        raise ValueError(f"No CSV files found in the folder: {folder_path}")
    
    # Function to process a single CSV file
    def process_file(file_path):
        # Read CSV file
        records = tf.data.experimental.CsvDataset(file_path, record_defaults=[tf.string]*len(columns_to_use), header=True)
        
        # Parse CSV records
        parsed_records = records.map(lambda *x: dict(zip(columns_to_use, x)))
        
        # Convert data types
        parsed_records = {col: tf.cast(parsed_records[col], tf.float32) if col != 'word' else parsed_records[col] for col in columns_to_use}
        
        return parsed_records
    
    # Process each CSV file in parallel
    datasets = file_paths_dataset.map(process_file, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    
    # Concatenate datasets from all CSV files into one dataset
    combined_dataset = tf.data.experimental.sample_from_datasets(datasets)
    
    # Shuffle and batch the combined dataset
    combined_dataset = combined_dataset.shuffle(buffer_size=10000).batch(batch_size)
    
    # Prefetch for performance optimization
    combined_dataset = combined_dataset.prefetch(tf.data.experimental.AUTOTUNE)
    
    return combined_dataset

In [21]:
# Specify the folder path, columns to use, and batch size
columns_to_use = ['key_id', 'drawing', 'word']
batch_size = 32

# Preprocess data from folder and create TensorFlow dataset
train_dataset = preprocess_data_from_folder(TRAIN_PATH, columns_to_use, batch_size)

# Iterate over the dataset (for demonstration purposes)
for batch in train_dataset.take(1):
    # Accessing a batch of data
    print("Batch:", batch)

TypeError: in user code:

    /tmp/ipykernel_152/2068426100.py:18 process_file  *
        parsed_records = {col: tf.cast(parsed_records[col], tf.float32) if col != 'word' else parsed_records[col] for col in columns_to_use}

    TypeError: 'MapDataset' object is not subscriptable


- GPT가 만든 코드를 그대로 사용하니 어느 부분에서 문제가 생겼는지 이해하기가 어려움
- tf.data docs를 참고해서 이해가 되는 내용 기반으로 코드르 수정해봄
