This notebook reads .csv files, merges them, and saves the result.  
The merged .csv file will be saved in 'merged_dataset'.

In [1]:
import os
import glob

import shutil
import zipfile

import pandas as pd
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### copy & unzip Dataset

In [3]:
# 소스 파일 경로
source_file_path = '/content/drive/MyDrive/archive.zip'

# 대상 파일 경로
target_file_path = '/content/archive.zip'

# 파일 복사
shutil.copyfile(source_file_path, target_file_path)

'/content/archive.zip'

In [4]:
# 언집할 폴더 경로
target_folder_path = '/content'

# ZIP 파일 언집
with zipfile.ZipFile(target_file_path, 'r') as zip_ref:
    zip_ref.extractall(target_folder_path)

### merge & save csv

In [5]:
def resd_csv(dir_path):
    colname = ['Hour' , 'Minute' , 'Second', 'microsecond', 'Horiz', 'Vert']

    # 디렉토리 내의 모든 CSV 파일 경로를 가져오기
    file_paths = glob.glob(dir_path + '/*.csv')

    # 파일 경로를 순서대로 정렬
    file_paths = sorted(file_paths)

    # 파일들을 담을 빈 DataFrame 생성
    combined_data = pd.DataFrame()

    # acc 파일만 불러오기 (파일명에 'acc'가 포함된 파일들)
    acc_file_paths = [file_path for file_path in file_paths if 'acc' in file_path]

    # 각 파일을 순회하며 데이터를 불러온 뒤 빈 DataFrame에 추가
    for file_path in acc_file_paths:
        if dir_path == '/content/ieee-phm-2012-data-challenge-dataset-master/Full_Test_Set/Bearing1_4':
            # Split the values in the single column into separate columns using the 'str.split()' function
            # Bearing1_4의 경우 한열에 모든 열이 들어 있다 ';'으로 구분됨
            df = pd.read_csv(file_path, sep=';', names=colname, header=None)
        else:
            df = pd.read_csv(file_path, names=colname, header=None)  # 첫 번째 행을 인덱스로 사용하지 않음
        combined_data = pd.concat([combined_data, df], ignore_index=True, axis=0)
    return combined_data

In [6]:
def convert_timestamp(df):
    # Convert timestamp columns to integers and create a new DataFrame with these values
    timestamp_integers = df[['Hour' , 'Minute' , 'Second', 'microsecond']].astype(int)
    timestamp_integers.columns = ['hour', 'minute', 'second', 'microsecond']

    # Combine the integer values to form a string in the format 'HH:MM:SS.microseconds'
    df['timestamp'] = timestamp_integers.apply(lambda x: f"{x['hour']:02d}:{x['minute']:02d}:{x['second']:02d}.{x['microsecond']:06d}", axis=1)

    # Convert the 'timestamp' column to a Pandas datetime object
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%H:%M:%S.%f')

    # Remove the date part from the 'timestamp' column
    df['timestamp'] = df['timestamp'].dt.time

    df = df.drop(['Hour' , 'Minute' , 'Second', 'microsecond'], axis=1)
    df = df.reindex(columns=['timestamp', 'Horiz', 'Vert'])

    return df

In [7]:
def get_second_level_folders(directory):
    second_level_folders = []
    for root, dirs, files in os.walk(directory):
        if root != directory:  # Exclude the top-level directory
            for folder_name in dirs:
                second_level_folders.append(os.path.join(root, folder_name))
    return second_level_folders

In [8]:
def save_csv(path):
    df = resd_csv(path)
    df = convert_timestamp(df)
    if not os.path.exists('/content/drive/MyDrive/merged_dataset'):
        os.makedirs('/content/drive/MyDrive/merged_dataset')
    file_name = '_'.join([path.split('/')[3], path.split('/')[4]])
    df.to_csv(f'/content/drive/MyDrive/merged_dataset/{file_name}.csv', index=False)

In [9]:
directory_path = '/content/ieee-phm-2012-data-challenge-dataset-master'
second_level_folders = get_second_level_folders(directory_path)

In [10]:
for bearing in tqdm(second_level_folders):
    save_csv(bearing)

100%|██████████| 28/28 [1:30:56<00:00, 194.89s/it]
