# I. 데이터 전처리

## 0. Spark Session 생성

In [2]:
pip install pyspark==3.1.2

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pyspark
print(pyspark.__version__)

3.1.2


In [4]:
from pyspark.sql import SparkSession

# SparkSession 생성
spark = SparkSession.builder \
    .appName("HDFS File TEST") \
    .config("spark.hadoop.fs.defaultFS", HDFS_CONFIG["defaultFS"]) \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()

In [None]:
spark.stop()

## 1. JSON, WAV 파일 DataFrame으로 변환(테스트, 파일 1개)

### 1) JSON 파일 DataFrame으로 변환

In [2]:
import config

In [3]:
# HDFS에서 JSON 파일 읽기
file_path = f"{config.HDFS_BASE_PATH}/label_data/1.Car/1.horn_of_car/1.car_horn_1.json"

In [22]:
df_test = spark.read.json(file_path, multiLine= True)

In [25]:
df_test.printSchema()

root
 |-- annotations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- area: struct (nullable = true)
 |    |    |    |-- end: double (nullable = true)
 |    |    |    |-- start: double (nullable = true)
 |    |    |-- categories: struct (nullable = true)
 |    |    |    |-- category_01: string (nullable = true)
 |    |    |    |-- category_02: string (nullable = true)
 |    |    |    |-- category_03: string (nullable = true)
 |    |    |-- decibel: long (nullable = true)
 |    |    |-- labelName: string (nullable = true)
 |    |    |-- soundQuality: string (nullable = true)
 |    |    |-- subCategory: string (nullable = true)
 |-- audio: struct (nullable = true)
 |    |-- bitRate: string (nullable = true)
 |    |-- duration: double (nullable = true)
 |    |-- fileFormat: string (nullable = true)
 |    |-- fileName: string (nullable = true)
 |    |-- fileSize: long (nullable = true)
 |    |-- recodingType: string (nullable = true)
 |    |-- sample

In [26]:
df_test.show()

+--------------------+--------------------+-----------------------------+--------------------+--------------------+
|         annotations|               audio|                  environment|                info|             license|
+--------------------+--------------------+-----------------------------+--------------------+--------------------+
|[{{4.88, 3.45}, {...|{705kbps, 9.2, wa...|{갤럭시탭S6, 자연적, 제작,...|{IMR, 2021-09-04,...|{CC 0, https://ww...|
+--------------------+--------------------+-----------------------------+--------------------+--------------------+



In [27]:
from pyspark.sql.functions import col, explode

# annotations 배열을 개별 행으로 변환
df_flattened = df_test.withColumn("annotation", explode(col("annotations")))

# 구조체 내부의 필드를 개별 컬럼으로 변환
df_flattened = df_flattened.select(
    # annotation 내부 필드
    col("annotation.area.start").alias("area_start"),
    col("annotation.area.end").alias("area_end"),
    col("annotation.categories.category_01").alias("category_01"),
    col("annotation.categories.category_02").alias("category_02"),
    col("annotation.categories.category_03").alias("category_03"),
    col("annotation.decibel").alias("decibel"),
    col("annotation.labelName").alias("labelName"),
    col("annotation.soundQuality").alias("soundQuality"),
    col("annotation.subCategory").alias("subCategory"),

    # audio 내부 필드
    col("audio.bitRate").alias("bitRate"),
    col("audio.duration").alias("duration"),
    col("audio.fileFormat").alias("fileFormat"),
    col("audio.fileName").alias("fileName"),
    col("audio.fileSize").alias("fileSize"),
    col("audio.recodingType").alias("recodingType"),
    col("audio.sampleRate").alias("sampleRate"),

    # environment 내부 필드
    col("environment.acqDevice").alias("acqDevice"),
    col("environment.acqMethod").alias("acqMethod"),
    col("environment.acqType").alias("acqType"),
    col("environment.areaUse").alias("areaUse"),
    col("environment.dayNight").alias("dayNight"),
    col("environment.direction").alias("direction"),
    col("environment.distance").alias("distance"),
    col("environment.district").alias("district"),
    col("environment.gps.latitude").alias("latitude"),
    col("environment.gps.longitude").alias("longitude"),
    col("environment.micClass").alias("micClass"),
    col("environment.obstacle").alias("obstacle"),
    col("environment.place").alias("place"),
    col("environment.recordingTime").alias("recordingTime"),
    col("environment.urban").alias("urban"),
    col("environment.weather").alias("weather"),

    # info 내부 필드
    col("info.contributor").alias("contributor"),
    col("info.dateCreated").alias("dateCreated"),
    col("info.description").alias("description"),
    col("info.uri").alias("uri"),
    col("info.version").alias("version"),
    col("info.year").alias("year"),

    # license 내부 필드
    col("license.name").alias("license_name"),
    col("license.url").alias("license_url")
)

# 결과 출력 (줄임 없이)
df_flattened.show(truncate=False)

25/02/14 13:25:37 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+--------+-----------+-----------+-----------+-------+----------------+------------+-----------+-------+--------+----------+--------------+--------+------------+----------+----------+---------+-------+--------+--------+---------+--------+--------+--------+---------+--------+--------+------+-------------+----------+-------+-----------+-----------+----------------+-----------------------+-------+----+------------+-----------------------+
|area_start|area_end|category_01|category_02|category_03|decibel|labelName       |soundQuality|subCategory|bitRate|duration|fileFormat|fileName      |fileSize|recodingType|sampleRate|acqDevice |acqMethod|acqType|areaUse |dayNight|direction|distance|district|latitude|longitude|micClass|obstacle|place |recordingTime|urban     |weather|contributor|dateCreated|description     |uri                    |version|year|license_name|license_url            |
+----------+--------+-----------+-----------+-----------+-------+----------------+------------+---

In [28]:
display(df_flattened.toPandas())

Unnamed: 0,area_start,area_end,category_01,category_02,category_03,decibel,labelName,soundQuality,subCategory,bitRate,...,urban,weather,contributor,dateCreated,description,uri,version,year,license_name,license_url
0,3.45,4.88,교통소음,자동차,차량경적,72,1.자동차_1_1.wav,정상,소형차경적,705kbps,...,서울특별시,맑음,IMR,2021-09-04,도시 소리 데이터,https://www.aihub.or.kr,1.0,2021,CC 0,https://www.aihub.or.kr


### 2) WAV 파일 DataFrame으로 변환

In [5]:
import pyspark
from pyspark.sql import SparkSession
import io
from scipy.io import wavfile
import librosa
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType

# HDFS 경로 설정
hdfs_path = f"{config.HDFS_BASE_PATH}/raw_data/1.Car/1.horn_of_car/1.car_horn_9_1.wav"

# HDFS에서 WAV 파일 읽기 (binaryFile 포맷 사용)
binary_df = spark.read.format("binaryFile").load(hdfs_path)

# 바이너리 데이터 추출
binary_data = binary_df.select("content").collect()[0][0]

# 바이너리 데이터를 메모리 파일로 변환
audio_bytes = io.BytesIO(binary_data)

# scipy로 WAV 파일 읽기
sr, audio = wavfile.read(audio_bytes)
print("WAV 파일 샘플링 레이트 (scipy):", sr)

# librosa를 사용해 WAV 파일을 리샘플링
audio_librosa, sr_librosa = librosa.load(audio_bytes, sr=None)
print("librosa로 처리한 샘플링 레이트:", sr_librosa)

# MFCC 추출
mfcc = librosa.feature.mfcc(y=audio_librosa, sr=sr_librosa, n_mfcc=50)  # 13개의 MFCC 특징 추출

# MFCC 데이터 프레임으로 변환
# 각 MFCC 값을 배열로 변환
mfcc_list = mfcc.T.tolist()  # MFCC는 2D 배열이므로 이를 각 행별로 리스트로 변환

# Spark DataFrame으로 변환
df_mfcc = spark.createDataFrame([(i, *mfcc_list[i]) for i in range(len(mfcc_list))], 
                                ['index'] + [f'mfcc_{i+1}' for i in range(13)])

df_mfcc.show()

WAV 파일 샘플링 레이트 (scipy): 44100
librosa로 처리한 샘플링 레이트: 44100
+-----+-------------------+------------------+-------------------+------------------+--------------------+------------------+-------------------+------------------+--------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+-------------------+-------------------

In [14]:
display(df_mfcc.toPandas())

Unnamed: 0,index,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13
0,0,-238.612213,133.487045,-1.284238,22.370865,22.564083,6.518252,-3.192569,1.113762,-7.715192,-3.464986,2.922217,7.885314,1.067608
1,1,-234.819626,152.233673,-10.859320,34.342796,19.500042,10.937912,1.760601,13.675949,-4.060371,-0.560500,-1.439846,-0.002022,-2.775231
2,2,-254.145233,165.326401,-27.593834,47.074913,6.318587,17.216387,-1.539557,20.285751,-3.986101,4.778692,-5.949762,-1.714797,-5.803444
3,3,-253.305832,159.803238,-33.335686,42.717648,4.379756,20.070423,1.405201,16.080278,-5.679544,3.018893,-3.229465,5.645140,-6.073312
4,4,-245.418915,153.193115,-32.850662,44.121162,0.510546,17.596771,-4.350383,5.619840,-7.632173,-1.984774,-6.154417,4.396658,-13.112883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,915,-224.675110,166.235565,-21.254143,35.928074,9.500803,16.222107,4.120435,23.599682,7.030793,9.776043,4.211914,3.731602,0.523003
916,916,-216.340363,174.400177,-18.497841,32.879936,12.142467,19.131794,0.880060,14.683958,7.002939,8.431215,0.941998,4.359944,-2.096164
917,917,-220.770462,165.227661,-12.930012,33.392899,8.737799,18.818878,-3.677454,12.477207,5.245459,8.977969,-4.575911,-3.837185,-8.276455
918,918,-220.862579,161.087646,-15.604593,41.479973,11.880829,19.706432,-1.011476,15.012062,2.033504,9.690787,-2.105361,0.087690,-10.177952


In [18]:
import pyspark
from pyspark.sql import SparkSession
import io
from scipy.io import wavfile
import librosa
import numpy as np
import os  # os 모듈을 추가하여 경로에서 파일 이름만 추출
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType

# HDFS 경로 설정
hdfs_path = f"{config.HDFS_BASE_PATH}/raw_data/1.Car/1.horn_of_car/1.car_horn_9_1.wav"

# HDFS에서 WAV 파일 읽기 (binaryFile 포맷 사용)
binary_df = spark.read.format("binaryFile").load(hdfs_path)

# 바이너리 데이터 추출
binary_data = binary_df.select("content").collect()[0][0]
file_path = binary_df.select("path").collect()[0][0]  # 전체 경로를 추출

# 파일 이름만 추출 (HDFS 경로에서 파일 이름만 분리)
file_name = os.path.basename(file_path)

# 바이너리 데이터를 메모리 파일로 변환
audio_bytes = io.BytesIO(binary_data)

# scipy로 WAV 파일 읽기
sr, audio = wavfile.read(audio_bytes)
print("WAV 파일 샘플링 레이트 (scipy):", sr)

# librosa를 사용해 WAV 파일을 리샘플링
audio_librosa, sr_librosa = librosa.load(audio_bytes, sr=None)
print("librosa로 처리한 샘플링 레이트:", sr_librosa)

# MFCC 추출
mfcc = librosa.feature.mfcc(y=audio_librosa, sr=sr_librosa, n_mfcc=13)  # 13개의 MFCC 특징 추출

# MFCC 평균 계산
mfcc_mean = np.mean(mfcc, axis=1).astype(float)  # (13, ) 크기의 평균 MFCC 값 배열, numpy float32에서 float로 변환

# 파일 이름과 평균 MFCC 값을 DataFrame에 넣기
df_mfcc_mean = spark.createDataFrame([(file_name, *mfcc_mean.tolist())],  # numpy 배열을 리스트로 변환
                                     ['file_name'] + [f'mfcc_{i+1}' for i in range(13)])

df_mfcc_mean.show()


WAV 파일 샘플링 레이트 (scipy): 44100
librosa로 처리한 샘플링 레이트: 44100
+------------------+-------------------+-----------------+-------------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+-----------------+-------------------+-----------------+------------------+
|         file_name|             mfcc_1|           mfcc_2|             mfcc_3|            mfcc_4|           mfcc_5|            mfcc_6|            mfcc_7|           mfcc_8|           mfcc_9|          mfcc_10|            mfcc_11|          mfcc_12|           mfcc_13|
+------------------+-------------------+-----------------+-------------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+-----------------+-------------------+-----------------+------------------+
|1.car_horn_9_1.wav|-251.55934143066406|173.8914794921875|-30.198183059692383|45.578636169433594|8.823283195495605|17.660017013549805|-4.3403968811

In [19]:
display(df_mfcc_mean.toPandas())

Unnamed: 0,file_name,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13
0,1.car_horn_9_1.wav,-251.559341,173.891479,-30.198183,45.578636,8.823283,17.660017,-4.340397,10.337169,4.226048,1.150458,-1.714232,2.508801,-1.459536


## 2. 데이터 전처리 - JSON

### 1) Json 파일 로드 및 DataFrame으로 변환

In [4]:
hdfs_path = f"{config.HDFS_BASE_PATH}"

# JSON 파일만 불러오기
df = spark.read.option("recursiveFileLookup", "true") \
               .option("pathGlobFilter", "*.json") \
               .json(hdfs_path, multiLine=True)

df.show(5)

25/02/17 12:11:33 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
25/02/17 12:11:48 WARN TaskSetManager: Stage 0 contains a task of very large size (2196 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+--------------------+--------------------+-----------------------------+--------------------+--------------------+
|         annotations|               audio|                  environment|                info|             license|
+--------------------+--------------------+-----------------------------+--------------------+--------------------+
|[{{7.55, 3.85}, {...|{1411kbps, 9.548,...|{갤럭시탭S6, 자연적, 제작,...|{IMR, 2021-09-04,...|{CC 0, https://ww...|
|[{{8.06, 2.45}, {...|{1411kbps, 10.064...|  {갤럭시S6Tab, 자연적, 제...|{IMR, 2021-09-04,...|{CC 0, https://ww...|
|[{{8.59, 4.02}, {...|{1411kbps, 11.066...|  {갤럭시S6Tab, 자연적, 제...|{IMR, 2021-09-04,...|{CC 0, https://ww...|
|[{{16.425, 2.0}, ...|{1411kbps, 18.425...|    {갤럭시TabS6Lite, 자연...|{IMR, 2021-09-04,...|{CC 0, https://ww...|
|[{{23.86, 7.44}, ...|{1411kbps, 28.611...|{갤럭시탭S6, 자연적, 제작,...|{IMR, 2021-09-04,...|{CC 0, https://ww...|
+--------------------+--------------------+-----------------------------+--------------------+---------------

### 2) JSON 데이터 구조 변환 (Flattening)

In [5]:
from pyspark.sql.functions import col

def flatten_json(df):
    """
    중첩된 JSON 구조를 평탄화하는 함수
    """
    # annotations 배열 내 요소들을 별도 컬럼으로 분리
    df = df.select(
        col("annotations.area.start").alias("area_start"),
        col("annotations.area.end").alias("area_end"),
        col("annotations.categories.category_01").alias("category_01"),
        col("annotations.categories.category_02").alias("category_02"),
        col("annotations.categories.category_03").alias("category_03"),
        col("annotations.decibel").alias("decibel"),
        col("annotations.labelName").alias("labelName"),
        col("annotations.soundQuality").alias("soundQuality"),
        col("annotations.subCategory").alias("subCategory"),
        col("audio.bitRate").alias("bitRate"),
        col("audio.duration").alias("duration"),
        col("audio.fileFormat").alias("fileFormat"),
        col("audio.fileName").alias("fileName"),
        col("audio.fileSize").alias("fileSize"),
        col("audio.recodingType").alias("recodingType"),
        col("audio.sampleRate").alias("sampleRate"),
        col("environment.acqDevice").alias("acqDevice"),
        col("environment.acqMethod").alias("acqMethod"),
        col("environment.acqType").alias("acqType"),
        col("environment.areaUse").alias("areaUse"),
        col("environment.dayNight").alias("dayNight"),
        col("environment.direction").alias("direction"),
        col("environment.distance").alias("distance"),
        col("environment.district").alias("district"),
        col("environment.gps.latitude").alias("latitude"),
        col("environment.gps.longitude").alias("longitude"),
        col("environment.micClass").alias("micClass"),
        col("environment.obstacle").alias("obstacle"),
        col("environment.place").alias("place"),
        col("environment.recordingTime").alias("recordingTime"),
        col("environment.urban").alias("urban"),
        col("environment.weather").alias("weather"),
        col("info.contributor").alias("contributor"),
        col("info.dateCreated").alias("dateCreated"),
        col("info.description").alias("description"),
        col("info.uri").alias("uri"),
        col("info.version").alias("version"),
        col("info.year").alias("year"),
        col("license.name").alias("license_name"),
        col("license.url").alias("license_url"),
    )
    return df

# Flatten 적용
df_flattened = flatten_json(df)
df_flattened.show(5, truncate=False)

25/02/17 12:59:52 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+--------+-----------+------------+--------------+-------+-------------------------+------------+--------------+--------+--------+----------+---------------------+--------+------------+----------+---------------+---------+-------+------------+--------+---------+--------+----------+--------+---------+--------+--------+------+-------------+----------+-------+-----------+-----------+----------------+-----------------------+-------+----+------------+-----------------------+
|area_start|area_end|category_01|category_02 |category_03   |decibel|labelName                |soundQuality|subCategory   |bitRate |duration|fileFormat|fileName             |fileSize|recodingType|sampleRate|acqDevice      |acqMethod|acqType|areaUse     |dayNight|direction|distance|district  |latitude|longitude|micClass|obstacle|place |recordingTime|urban     |weather|contributor|dateCreated|description     |uri                    |version|year|license_name|license_url            |
+----------+--------+-------

In [31]:
import pandas as pd

# 전체 열, 행 표시 설정
# pd.set_option("display.max_columns", None)  # 모든 컬럼 출력
# pd.set_option("display.max_rows", None)  # 모든 행 출력
# pd.set_option("display.width", 200)  # 출력 너비 조정

# DataFrame을 Pandas로 변환하여 출력
display(df_flattened.toPandas().head(5))

                                                                                

Unnamed: 0,area_start,area_end,category_01,category_02,category_03,decibel,labelName,soundQuality,subCategory,bitRate,...,urban,weather,contributor,dateCreated,description,uri,version,year,license_name,license_url
0,[3.85],[7.55],[교통소음],[이륜자동차],[이륜차주행음],[71],[2.이륜자동차_1867_1.wav],[노이즈],[이륜차주행음],1411kbps,...,서울특별시,맑음,IMR,2021-09-04,도시 소리 데이터,https://www.aihub.or.kr,1.0,2021,CC 0,https://www.aihub.or.kr
1,[2.45],[8.06],[교통소음],[이륜자동차],[이륜차주행음],[120],[2.이륜자동차_2117_1.wav],[정상],[이륜차주행음],1411kbps,...,서울특별시,맑음,IMR,2021-09-04,도시 소리 데이터,https://www.aihub.or.kr,1.0,2021,CC 0,https://www.aihub.or.kr
2,[4.02],[8.59],[교통소음],[이륜자동차],[이륜차주행음],[125],[2.이륜자동차_1964_1.wav],[정상],[이륜차주행음],1411kbps,...,서울특별시,맑음,IMR,2021-09-04,도시 소리 데이터,https://www.aihub.or.kr,1.0,2021,CC 0,https://www.aihub.or.kr
3,[2.0],[16.425],[교통소음],[이륜자동차],[이륜차주행음],[108],[2.이륜자동차_2263_1.wav],[노이즈],[이륜차주행음],1411kbps,...,광주광역시,흐림,IMR,2021-09-04,도시 소리 데이터,https://www.aihub.or.kr,1.0,2021,CC 0,https://www.aihub.or.kr
4,[7.44],[23.86],[교통소음],[이륜자동차],[이륜차주행음],[92],[2.이륜자동차_2392_1.wav],[정상],[이륜차주행음],1411kbps,...,서울특별시,맑음,IMR,2021-09-04,도시 소리 데이터,https://www.aihub.or.kr,1.0,2021,CC 0,https://www.aihub.or.kr


### 3) 병렬 처리 및 성능 최적화

In [32]:
df_flattened = df_flattened.cache()  # 캐싱하여 속도 향상
df_flattened = df_flattened.repartition(4)  # 4개의 파티션으로 병렬 처리

### 4) CSV 또는 Parquet으로 저장

In [33]:
# CSV로 저장
#df_flattened.write.mode("overwrite").option("header", "true").csv(f"{config.HDFS_BASE_PATH}/output/csv_data")

# Parquet으로 저장 (속도 빠름)
#df_flattened.write.mode("overwrite").parquet(f"{config.HDFS_BASE_PATH}/output/parquet_data")

## 3. 데이터 전처리 - WAV

In [11]:
!pip install librosa

Defaulting to user installation because normal site-packages is not writeable
Collecting librosa
  Downloading librosa-0.9.2-py3-none-any.whl (214 kB)
     |████████████████████████████████| 214 kB 13.8 MB/s            
Collecting numba>=0.45.1
  Downloading numba-0.53.1-cp36-cp36m-manylinux2014_x86_64.whl (3.4 MB)
     |████████████████████████████████| 3.4 MB 46.5 MB/s            
[?25hCollecting resampy>=0.2.2
  Downloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
     |████████████████████████████████| 3.1 MB 82.9 MB/s            
Collecting audioread>=2.1.9
  Using cached audioread-3.0.1-py3-none-any.whl (23 kB)
Collecting pooch>=1.0
  Downloading pooch-1.6.0-py3-none-any.whl (56 kB)
     |████████████████████████████████| 56 kB 5.8 MB/s             
[?25hCollecting soundfile>=0.10.2
  Using cached soundfile-0.13.1-py2.py3-none-any.whl (25 kB)
Collecting llvmlite<0.37,>=0.36.0rc1
  Downloading llvmlite-0.36.0-cp36-cp36m-manylinux2010_x86_64.whl (25.3 MB)
     |███████████████████

In [7]:
pip install hdfs

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [8]:
spark.conf.set("spark.sql.hive.filesourcePartitionFileCacheSize", 512000000)  # 500MB로 증가

In [12]:
import pyspark
from pyspark.sql import SparkSession
import io
from scipy.io import wavfile
import librosa
import numpy as np
import os
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType, StringType

# HDFS에서 모든 WAV 파일 읽기
hdfs_dir = f"{config.HDFS_BASE_PATH}/raw_data/1.Car/1.horn_of_car"
binary_df = spark.read.format("binaryFile").load(hdfs_dir)

# 🔹 UDF (User Defined Function) 정의: WAV → MFCC 변환
def extract_mfcc(binary_data):
    try:
        audio_bytes = io.BytesIO(binary_data)  # 바이너리 데이터를 메모리 파일로 변환
        sr, audio = wavfile.read(audio_bytes)  # scipy로 샘플링 레이트 확인
        audio_librosa, sr_librosa = librosa.load(audio_bytes, sr=None)  # librosa로 리샘플링
        mfcc = librosa.feature.mfcc(y=audio_librosa, sr=sr_librosa, n_mfcc=50)  # MFCC 추출
        mfcc_mean = np.mean(mfcc, axis=1).astype(float)  # 평균 계산
        return mfcc_mean.tolist()  # 리스트로 반환
    except Exception as e:
        return None  # 에러 발생 시 None 반환

# UDF 등록
mfcc_udf = udf(extract_mfcc, ArrayType(FloatType()))

# 🔹 파일 이름 추출 UDF 정의
def extract_filename(path):
    return os.path.basename(path)

filename_udf = udf(extract_filename, StringType())

# 🔹 변환 적용
df_mfcc = binary_df \
    .withColumn("file_name", filename_udf(binary_df["path"])) \
    .withColumn("mfcc_features", mfcc_udf(binary_df["content"]))

# 🔹 배열 데이터를 개별 컬럼으로 변환
mfcc_columns = [f"mfcc_{i+1}" for i in range(50)]
for i in range(50):
    df_mfcc = df_mfcc.withColumn(mfcc_columns[i], df_mfcc["mfcc_features"][i])

# 🔹 불필요한 컬럼 정리
df_mfcc = df_mfcc.select(["file_name"] + mfcc_columns)

# 🔹 결과 저장 (HDFS)
# output_path = f"{config.HDFS_BASE_PATH}/mfcc_features/"
# df_mfcc.write.csv(output_path, header=True, mode="overwrite")

# print(f"✅ MFCC 데이터가 HDFS에 저장됨: {output_path}")

In [13]:
df_mfcc.toPandas().head(5)

Unnamed: 0,file_name,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,...,mfcc_41,mfcc_42,mfcc_43,mfcc_44,mfcc_45,mfcc_46,mfcc_47,mfcc_48,mfcc_49,mfcc_50
0,1.car_horn_87719_1.wav,-303.535339,24.502613,19.504515,36.805042,14.415648,24.194254,-15.078179,9.494457,3.428701,...,0.599135,4.465346,11.633167,1.621047,-12.947955,-8.517405,11.782574,9.078656,-5.716046,-16.894989
1,1.car_horn_87688_1.wav,-275.247253,74.094482,-7.014953,35.502041,20.998775,17.419048,-4.690052,19.882788,7.529735,...,-4.985178,6.648241,6.169272,-0.182671,-2.993148,11.996006,17.761364,11.141748,-2.013043,-4.945284
2,1.car_horn_87964_1.wav,-233.662842,87.917618,45.434906,19.435001,9.098841,32.400368,5.817845,9.204194,14.611247,...,-7.923782,8.347244,-2.440833,-17.408949,-5.891588,22.778133,21.374588,-19.310514,-26.457247,4.794274
3,1.car_horn_88422_1.wav,-405.863525,138.927872,48.664036,-7.739197,-1.499425,22.140759,2.871114,3.680598,21.160671,...,-5.819475,9.372609,1.557052,-12.188097,-16.732809,7.821749,24.011944,-5.900558,-21.896723,-11.031067
4,1.car_horn_87987_1.wav,-318.16153,126.123177,43.729805,-11.429483,-5.223235,13.667998,0.934844,10.497743,2.401143,...,-5.953082,1.538178,-0.032047,-10.759938,-10.296665,14.938017,19.099138,-5.256958,-27.792421,-7.377015


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 37452)
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/ml_env_python3.6/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/home/ubuntu/anaconda3/envs/ml_env_python3.6/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/home/ubuntu/anaconda3/envs/ml_env_python3.6/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/home/ubuntu/anaconda3/envs/ml_env_python3.6/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/home/lab06/.local/lib/python3.6/site-packages/pyspark/accumulators.py", line 262, in handle
    poll(accum_updates)
  File "/home/lab06/.local/lib/python3.6/site-packages/pyspark/accumulators.py", line 235, i

In [6]:
import pyspark
from pyspark.sql import SparkSession
import io
from scipy.io import wavfile
import librosa
import numpy as np
import os
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType, StringType

# HDFS에서 모든 WAV 파일 읽기
hdfs_dir = f"{config.HDFS_BASE_PATH}/raw_data/1.Car/2.siren_of_car"
binary_df = spark.read.format("binaryFile").load(hdfs_dir)

# 🔹 UDF (User Defined Function) 정의: WAV → MFCC 변환
def extract_mfcc(binary_data):
    try:
        audio_bytes = io.BytesIO(binary_data)  # 바이너리 데이터를 메모리 파일로 변환
        sr, audio = wavfile.read(audio_bytes)  # scipy로 샘플링 레이트 확인
        audio_librosa, sr_librosa = librosa.load(audio_bytes, sr=None)  # librosa로 리샘플링
        mfcc = librosa.feature.mfcc(y=audio_librosa, sr=sr_librosa, n_mfcc=13)  # MFCC 추출
        mfcc_mean = np.mean(mfcc, axis=1).astype(float)  # 평균 계산
        return mfcc_mean.tolist()  # 리스트로 반환
    except Exception as e:
        return None  # 에러 발생 시 None 반환

# UDF 등록
mfcc_udf = udf(extract_mfcc, ArrayType(FloatType()))

# 🔹 파일 이름 추출 UDF 정의
def extract_filename(path):
    return os.path.basename(path)

filename_udf = udf(extract_filename, StringType())

# 🔹 변환 적용
df_mfcc_siren_car = binary_df \
    .withColumn("file_name", filename_udf(binary_df["path"])) \
    .withColumn("mfcc_features", mfcc_udf(binary_df["content"]))

# 🔹 배열 데이터를 개별 컬럼으로 변환
mfcc_columns = [f"mfcc_{i+1}" for i in range(13)]
for i in range(13):
    df_mfcc_siren_car = df_mfcc_siren_car.withColumn(mfcc_columns[i], df_mfcc_siren_car["mfcc_features"][i])

# 🔹 불필요한 컬럼 정리
df_mfcc_siren_car = df_mfcc_siren_car.select(["file_name"] + mfcc_columns)

# 🔹 결과 저장 (HDFS)
# output_path = f"{config.HDFS_BASE_PATH}/mfcc_features/"
# df_mfcc.write.csv(output_path, header=True, mode="overwrite")

# print(f"✅ MFCC 데이터가 HDFS에 저장됨: {output_path}")

In [7]:
display(df_mfcc_siren_car.toPandas().head(5))

                                                                                

Unnamed: 0,file_name,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13
0,1.car_siren_303_1.wav,-331.622162,194.139023,-19.950441,23.301527,-9.28169,4.838598,8.544706,-0.232365,18.394283,13.557921,13.304636,7.358498,4.700674
1,1.car_siren_493_1.wav,-287.294861,202.257141,-7.542938,14.94933,0.315074,13.062035,5.098648,3.266061,16.367357,14.25031,7.647503,3.553201,6.276692
2,1.car_siren_288_1.wav,-239.801895,153.763153,-23.571859,27.403528,-8.118979,12.162041,-7.386652,-1.804927,10.035619,18.910061,17.573339,10.941156,8.292389
3,1.car_siren_499_1.wav,-264.13681,225.186554,-12.084479,-22.841242,8.205718,-0.783455,4.59601,1.768316,0.112573,11.33933,11.931375,5.466545,-0.937018
4,1.car_siren_409_1.wav,-280.859741,202.961441,-18.432287,15.58564,4.136283,12.429645,5.971639,-0.230551,6.385309,5.937668,5.668032,1.751873,2.468761


In [8]:
df_mfcc_siren_car.show()



+--------------------+----------+---------+----------+----------+----------+-----------+----------+-----------+-----------+---------+---------+----------+----------+
|           file_name|    mfcc_1|   mfcc_2|    mfcc_3|    mfcc_4|    mfcc_5|     mfcc_6|    mfcc_7|     mfcc_8|     mfcc_9|  mfcc_10|  mfcc_11|   mfcc_12|   mfcc_13|
+--------------------+----------+---------+----------+----------+----------+-----------+----------+-----------+-----------+---------+---------+----------+----------+
|1.car_siren_303_1...|-331.62216|194.13902|-19.950441| 23.301527|  -9.28169|   4.838598|  8.544706| -0.2323654|  18.394283|13.557921|13.304636| 7.3584976| 4.7006736|
|1.car_siren_493_1...|-287.29486|202.25714| -7.542938|  14.94933|0.31507355|  13.062035| 5.0986476|   3.266061|  16.367357| 14.25031| 7.647503|  3.553201|  6.276692|
|1.car_siren_288_1...| -239.8019|153.76315| -23.57186| 27.403528|-8.1189785|  12.162041|-7.3866525| -1.8049265|  10.035619| 18.91006| 17.57334| 10.941156|  8.292389|
|1.c

                                                                                

In [14]:
# 캐시된 DataFrame 해제
spark.catalog.clearCache()
print("✅ Spark 캐시가 정리되었습니다.")

✅ Spark 캐시가 정리되었습니다.
