In [66]:
import random
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
import librosa

from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import json
from pandas import json_normalize

In [2]:
##### hyperparameter
CFG = {
    'SR':16000,
    'N_MFCC':128, # Melspectrogram 벡터를 추출할 개수
    'SEED':42
}


In [3]:
#### fixed random seed
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # seed 고정

In [None]:
#### data preprocessing
from pathlib import Path
import json
import pandas as pd

def get_filelist(subfolder, file_extension):
    data_path = Path.cwd()/subfolder
    
    return list(data_path.glob('**/*' + file_extension))

root_path = 'C:/Users/user/git/MiraeCity/SR/data/1.Training/label/06.지하철,버스/'

# 이 파일이 위치해있는 폴더의 하위폴더 'data'에 있는 확장자명이 '.json'인 모든 파일을 불러옵니다
files = get_filelist(root_path+'01.지하철플랫폼','json')

# 저장할 데이터 항목의 이름을 입력합니다. json 파일에 적힌 항목(key)과 같아야합니다.
column_names = ['dataSet', 'version', 'mediaUrl', 'date', 'typeInfo', 'conversationType', 'speakerNumber', 'speakers', 'dialogs', 'samplingRate', 'recStime', 'recLen', 'recDevice']
result = pd.DataFrame(columns=column_names)   

for json_file in files:
    df = pd.read_json(json_file)
    row_data = pd.json_normalize(data=df['row'])
    print(row_data.head(2)) #데이터가 잘 불러와지는지 확인하는 출력
    
    result = pd.concat([result,df])
    
# 현재 이 파일이 위치한 폴더의 하위 폴더 data 에 'result.csv'로 저장
result.to_csv(Path.cwd()/'data'/'01.지하철플랫폼.csv', index=None)

In [8]:
datas = json.load(open('C:/Users/user/git/MiraeCity/SR/data/1.Training/label/06.지하철,버스/01.지하철플랫폼/06_01_000817_210811_SD.json', 'r'))

keys = [key for key in datas]
print(keys)

['dataSet', 'version', 'mediaUrl', 'date', 'typeInfo', 'conversationType', 'speakerNumber', 'speakers', 'dialogs', 'samplingRate', 'recStime', 'recLen', 'recDevice']


In [28]:
### json to csv
rootdir = 'C:/Users/user/git/MiraeCity/SR/data/2.Validation/label/06.지하철,버스/04.버스안'  # Enter your directory here

file_list = [f for f in os.scandir(rootdir) if f.is_file() and f.name.endswith('.json')]

dataframes = []

for file in file_list:
    with open(file, 'r') as f:
        json_data = json.load(f)
        
        # Flatten 'typeInfo', 'speakers' and 'dialogs' separately
        typeInfo_df = json_normalize(json_data, record_path='typeInfo', meta=['dataSet', 'version', 'mediaUrl', 'date', 'conversationType', 'speakerNumber'], errors='ignore')
        speakers_df = json_normalize(json_data, record_path='speakers', meta=['dataSet', 'version', 'mediaUrl', 'date', 'conversationType', 'speakerNumber'], errors='ignore')
        dialogs_df = json_normalize(json_data, record_path='dialogs', meta=['dataSet', 'version', 'mediaUrl', 'date', 'conversationType', 'speakerNumber'], errors='ignore')
        
        # Concatenate all data into one DataFrame
        dataframes.append(pd.concat([typeInfo_df, speakers_df, dialogs_df], axis=1))

# Concatenate all data from different JSON files
big_frame = pd.concat(dataframes, ignore_index=True)

# Save the DataFrame to CSV
big_frame.to_csv('04.버스안.csv', index=False)

In [32]:
### csv concat
####### train, valid csv todo

# list all csv files in the directory
csv_dir = 'C:/Users/user/git/MiraeCity/SR/data/2.Validation/label/csv/'
csv_files = [f for f in os.listdir(csv_dir) if f.endswith('.csv')]

# read and concatenate all csv files
df_list = []
for csv_file in csv_files:
    df = pd.read_csv(os.path.join(csv_dir, csv_file))
    df_list.append(df)

final_df = pd.concat(df_list, ignore_index=True)

# write concatenated dataframe to a new csv file
final_df.to_csv('concatenated.csv', index=False)

In [59]:

#### data preprocessing
train_df = pd.read_csv('C:/Users/user/git/MiraeCity/SR/data/1.Training/label/csv/지하철,버스/02.지하철안.csv')
test_df = pd.read_csv('C:/Users/user/git/MiraeCity/SR/data/2.Validation/label/csv/지하철,버스/02.지하철안.csv')


  train_df = pd.read_csv('C:/Users/user/git/MiraeCity/SR/data/1.Training/label/csv/지하철,버스/02.지하철안.csv')


In [42]:
#### zip 파일 압축 해제
import zipfile

def unzip_file(zip_filepath, dest_path):
    with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
        zip_ref.extractall(dest_path)

zip_filepath = 'C:/Users/user/git/MiraeCity/SR/data/2.Validation/raw/VS_06.지하철,버스.zip'  # replace with your zip file path
dest_path = 'C:/Users/user/git/MiraeCity/SR/data/2.Validation/raw'  # replace with the path where you want to extract files

unzip_file(zip_filepath, dest_path)

In [75]:
##### mfcc feature extract function

rootdir = 'C:/Users/user/git/MiraeCity/SR/data/1.Training/raw/'

def get_mfcc_feature(df):
    features = []
    for path in tqdm(df['mediaUrl'].astype(str)):
        full_path = os.path.join(rootdir, path)
        try:
            y, sr = librosa.load(full_path, sr=CFG['SR'])
        except FileNotFoundError:
            print(f"File {full_path} not found.")
            continue
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])
        features.append({
            'mfcc_mean': np.mean(mfcc, axis=1),
            'mfcc_max': np.max(mfcc, axis=1),
            'mfcc_min': np.min(mfcc, axis=1),
        })

    mfcc_df = pd.DataFrame(features)
    mfcc_mean_df = pd.DataFrame(mfcc_df['mfcc_mean'].tolist(), columns=[f'mfcc_mean_{i}' for i in range(CFG['N_MFCC'])])
    mfcc_max_df = pd.DataFrame(mfcc_df['mfcc_max'].tolist(), columns=[f'mfcc_max_{i}' for i in range(CFG['N_MFCC'])])
    mfcc_min_df = pd.DataFrame(mfcc_df['mfcc_min'].tolist(), columns=[f'mfcc_min_{i}' for i in range(CFG['N_MFCC'])])

    return pd.concat([mfcc_mean_df, mfcc_max_df, mfcc_min_df], axis=1)

##### mel feature extract function
def get_feature_mel(df):
    features = []
    for path in tqdm(df['mediaUrl'].astype(str)):
        full_path = os.path.join(rootdir, path)
        try:
            y, sr = librosa.load(full_path, sr=CFG['SR'])
        except FileNotFoundError:
            print(f"File {full_path} not found.")
            continue
        n_fft = 2048
        win_length = 2048
        hop_length = 1024
        n_mels = 128

        D = np.abs(librosa.stft(data, n_fft=n_fft, win_length = win_length, hop_length=hop_length))
        mel = librosa.feature.melspectrogram(S=D, sr=sr, n_mels=n_mels, hop_length=hop_length, win_length=win_length)

        features.append({
            'mel_mean': mel.mean(axis=1),
            'mel_max': mel.min(axis=1),
            'mel_min': mel.max(axis=1),
        })
    mel_df = pd.DataFrame(features)
    mel_mean_df = pd.DataFrame(mel_df['mel_mean'].tolist(), columns=[f'mel_mean_{i}' for i in range(n_mels)])
    mel_max_df = pd.DataFrame(mel_df['mel_max'].tolist(), columns=[f'mel_max_{i}' for i in range(n_mels)])
    mel_min_df = pd.DataFrame(mel_df['mel_min'].tolist(), columns=[f'mel_min_{i}' for i in range(n_mels)])

    return pd.concat([mel_mean_df, mel_max_df, mel_min_df], axis=1)

In [76]:
train_mf = get_mfcc_feature(train_df)
test_mf = get_mfcc_feature(test_df)

train_mel = get_feature_mel(train_df)
test_mel = get_feature_mel(test_df)

train_x = pd.concat([train_mel, train_mf], axis=1)
test_x = pd.concat([test_mel, test_mf], axis=1)

train_y = train_df['label']

train_x['label'] = train_df['label']
train_data = TabularDataset(train_x)
test_data = TabularDataset(test_x)

  0%|          | 0/32770 [00:00<?, ?it/s]

File C:/Users/user/git/MiraeCity/SR/data/1.Training/raw/06.지하철,버스/02.지하철안/06_01_000154_210727_SD.wav not found.


  y, sr = librosa.load(full_path, sr=CFG['SR'])


TypeError: join() argument must be str, bytes, or os.PathLike object, not 'float'

In [None]:
print(train_x.head())

In [None]:
#### autogluon
label = 'label'
eval_metric = 'accuracy'
time_limit = 3600 * 24 # 24 hrs

predictor = TabularPredictor(
    label=label, eval_metric=eval_metric
).fit(train_data, presets='best_quality', time_limit=time_limit, ag_args_fit={'num_gpus': 2, 'num_cpus': 32})