In [1]:
import random
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
import librosa

from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import json
from pandas import json_normalize

In [2]:
##### hyperparameter
CFG = {
    'SR':16000,
    'N_MFCC':128, # Melspectrogram 벡터를 추출할 개수
    'SEED':42
}


In [3]:
#### fixed random seed
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # seed 고정

In [6]:
#### data preprocessing
from pathlib import Path
import json
import pandas as pd

def get_filelist(subfolder, file_extension):
    data_path = Path.cwd()/subfolder
    
    return list(data_path.glob('**/*' + file_extension))

root_path = 'C:/Users/user/git/MiraeCity/SR/data/1.Training/label/07.터미널/'

# 이 파일이 위치해있는 폴더의 하위폴더 'data'에 있는 확장자명이 '.json'인 모든 파일을 불러옵니다
files = get_filelist(root_path+ '*' ,'json')

# 저장할 데이터 항목의 이름을 입력합니다. json 파일에 적힌 항목(key)과 같아야합니다.
column_names = ['dataSet', 'version', 'mediaUrl', 'date', 'typeInfo', 'conversationType', 'speakerNumber', 'speakers', 'dialogs', 'samplingRate', 'recStime', 'recLen', 'recDevice']
result = pd.DataFrame(columns=column_names)   

for json_file in files:
    df = pd.read_json(json_file)
    row_data = pd.json_normalize(data=df['row'])
    print(row_data.head(2)) #데이터가 잘 불러와지는지 확인하는 출력
    
    result = pd.concat([result,df])
    
# 현재 이 파일이 위치한 폴더의 하위 폴더 data 에 'result.csv'로 저장
result.to_csv(Path.cwd()/'data'/'01.기차역대합실.csv', index=None)

In [8]:
datas = json.load(open('C:/Users/user/git/MiraeCity/SR/data/1.Training/label/06.지하철,버스/01.지하철플랫폼/06_01_000817_210811_SD.json', 'r'))

keys = [key for key in datas]
print(keys)

['dataSet', 'version', 'mediaUrl', 'date', 'typeInfo', 'conversationType', 'speakerNumber', 'speakers', 'dialogs', 'samplingRate', 'recStime', 'recLen', 'recDevice']


In [9]:
### json to csv
rootdir = 'C:/Users/user/git/MiraeCity/SR/data/01.데이터/1.Training/label/TL/07.터미널/01.기차역대합실'  # Enter your directory here

file_list = [f for f in os.scandir(rootdir) if f.is_file() and f.name.endswith('.json')]

dataframes = []

for file in file_list:
    with open(file, 'r') as f:
        json_data = json.load(f)
        
        # Flatten 'typeInfo', 'speakers' and 'dialogs' separately
        typeInfo_df = json_normalize(json_data, record_path='typeInfo', meta=['dataSet', 'version', 'mediaUrl', 'date', 'conversationType', 'speakerNumber'], errors='ignore')
        speakers_df = json_normalize(json_data, record_path='speakers', meta=['dataSet', 'version', 'mediaUrl', 'date', 'conversationType', 'speakerNumber'], errors='ignore')
        dialogs_df = json_normalize(json_data, record_path='dialogs', meta=['dataSet', 'version', 'mediaUrl', 'date', 'conversationType', 'speakerNumber'], errors='ignore')
        
        # Concatenate all data into one DataFrame
        dataframes.append(pd.concat([typeInfo_df, speakers_df, dialogs_df], axis=1))

# Concatenate all data from different JSON files
big_frame = pd.concat(dataframes, ignore_index=True)

# Save the DataFrame to CSV
big_frame.to_csv('01.기차역대합실.csv', index=False)

In [32]:
### csv concat
####### train, valid csv todo

# list all csv files in the directory
csv_dir = 'C:/Users/user/git/MiraeCity/SR/data/2.Validation/label/csv/'
csv_files = [f for f in os.listdir(csv_dir) if f.endswith('.csv')]

# read and concatenate all csv files
df_list = []
for csv_file in csv_files:
    df = pd.read_csv(os.path.join(csv_dir, csv_file))
    df_list.append(df)

final_df = pd.concat(df_list, ignore_index=True)

# write concatenated dataframe to a new csv file
final_df.to_csv('concatenated.csv', index=False)

In [4]:

#### data preprocessing
train_df = pd.read_csv('C:/Users/user/git/MiraeCity/SR/data/01.데이터/1.Training/label/01.기차역대합실.csv')
test_df = pd.read_csv('C:/Users/user/git/MiraeCity/SR/data/01.데이터/2.Validation/label/01.기차역대합실.csv')


In [None]:
sample_df = pd.read_csv('C:/Users/user/git/MiraeCity/SR/data/01.데이터/1.Training/label/01.기차역대합실.csv', nrows=10)
print(sample_df.dtypes)

In [16]:
#### zip 파일 압축 해제
import zipfile

def unzip_file(zip_filepath, dest_path):
    with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
        zip_ref.extractall(dest_path)

zip_filepath = 'C:/Users/user/git/MiraeCity/SR/data/1.Training/raw/TS2_06.지하철,버스_02.지하철안.zip'  # replace with your zip file path
dest_path = 'C:/Users/user/git/MiraeCity/SR/data/1.Training/raw'  # replace with the path where you want to extract files

unzip_file(zip_filepath, dest_path)

In [6]:
##### mfcc feature extract function
rootdir = 'C:/Users/user/git/MiraeCity/SR/data/01.데이터/2.Validation/raw/VS_07.터미널/'

def get_mfcc_feature(df):
    features = []
    for path in tqdm(df['mediaUrl'].astype(str)):
        full_path = os.path.join(rootdir, path)
        try:
            y, sr = librosa.load(full_path, sr=CFG['SR'])
        except FileNotFoundError:
            #print(f"File {full_path} not found.")
            continue
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])
        features.append({
            'mfcc_mean': np.mean(mfcc, axis=1),
            'mfcc_max': np.max(mfcc, axis=1),
            'mfcc_min': np.min(mfcc, axis=1),
        })
    if not features:  # If features list is empty
        print("No valid audio files found.")
        return pd.DataFrame()  # Return an empty DataFrame
    else:
        print("Found features")

    mfcc_df = pd.DataFrame(features)
    mfcc_mean_df = pd.DataFrame(mfcc_df['mfcc_mean'].tolist(), columns=[f'mfcc_mean_{i}' for i in range(CFG['N_MFCC'])])
    mfcc_max_df = pd.DataFrame(mfcc_df['mfcc_max'].tolist(), columns=[f'mfcc_max_{i}' for i in range(CFG['N_MFCC'])])
    mfcc_min_df = pd.DataFrame(mfcc_df['mfcc_min'].tolist(), columns=[f'mfcc_min_{i}' for i in range(CFG['N_MFCC'])])

    return pd.concat([mfcc_mean_df, mfcc_max_df, mfcc_min_df], axis=1)

##### mel feature extract function
def get_feature_mel(df):
    features = []
    for path in tqdm(df['mediaUrl'].astype(str)):
        full_path = os.path.join(rootdir, path)
        try:
            y, sr = librosa.load(full_path, sr=CFG['SR'])
        except FileNotFoundError:
            #print(f"File {full_path} not found.")
            continue
        n_fft = 2048
        win_length = 2048
        hop_length = 1024
        n_mels = 128

        D = np.abs(librosa.stft(y, n_fft=n_fft, win_length = win_length, hop_length=hop_length))
        mel = librosa.feature.melspectrogram(S=D, sr=sr, n_mels=n_mels, hop_length=hop_length, win_length=win_length)

        features.append({
            'mel_mean': mel.mean(axis=1),
            'mel_max': mel.min(axis=1),
            'mel_min': mel.max(axis=1),
        })
        
    if not features:  # If features list is empty
        print("No valid audio files found.")
        return pd.DataFrame()  # Return an empty DataFrame
    else:
        print("Found features")

    mel_df = pd.DataFrame(features)
    mel_mean_df = pd.DataFrame(mel_df['mel_mean'].tolist(), columns=[f'mel_mean_{i}' for i in range(n_mels)])
    mel_max_df = pd.DataFrame(mel_df['mel_max'].tolist(), columns=[f'mel_max_{i}' for i in range(n_mels)])
    mel_min_df = pd.DataFrame(mel_df['mel_min'].tolist(), columns=[f'mel_min_{i}' for i in range(n_mels)])

    return pd.concat([mel_mean_df, mel_max_df, mel_min_df], axis=1)

In [7]:
train_mf = get_mfcc_feature(train_df)
test_mf = get_mfcc_feature(test_df)

train_mel = get_feature_mel(train_df)
test_mel = get_feature_mel(test_df)

train_x = pd.concat([train_mel, train_mf], axis=1)
test_x = pd.concat([test_mel, test_mf], axis=1)

train_y = train_df['place']

#train_x['place'] = train_df['place']
#test_x['place'] = test_df['place']

train_x['place'] = train_df['place'].fillna(method='ffill')
test_x['place'] = test_df['place'].fillna(method='ffill')

# train_x.dropna(subset=['place'], inplace=True)
# test_x.dropna(subset=['place'], inplace=True)

train_data = TabularDataset(train_x)
test_data = TabularDataset(test_x)

# train_data = pd.concat([train_data, test_data], axis=1)

  0%|          | 0/40020 [00:00<?, ?it/s]

  y, sr = librosa.load(full_path, sr=CFG['SR'])
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


No valid audio files found.


  0%|          | 0/4575 [00:00<?, ?it/s]

  y, sr = librosa.load(full_path, sr=CFG['SR'])
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Found features


  0%|          | 0/40020 [00:00<?, ?it/s]

  y, sr = librosa.load(full_path, sr=CFG['SR'])
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


No valid audio files found.


  0%|          | 0/4575 [00:00<?, ?it/s]

Found features


In [9]:
print(test_data)
test_data.to_csv('test.csv', index=False)

     mel_mean_0  mel_mean_1  mel_mean_2  mel_mean_3  mel_mean_4  mel_mean_5  \
0      0.194321    0.362591    0.433129    0.264966    0.189134    0.352695   
1      0.213185    0.368740    0.416187    0.303982    0.319825    0.424623   
2      0.345119    0.476485    0.514477    0.342951    0.334910    0.389481   
3      0.046725    0.093747    0.111278    0.101546    0.106966    0.156729   
4      0.051330    0.097078    0.118053    0.122123    0.118884    0.135077   
..          ...         ...         ...         ...         ...         ...   
258    0.045257    0.145505    0.201457    0.141556    0.143485    0.231502   
259    0.051619    0.182746    0.300776    0.603530    0.278957    0.187884   
260    0.047302    0.143825    0.171655    0.123403    0.133860    0.129655   
261    0.079806    0.158248    0.200954    0.216048    0.189978    0.218465   
262    0.073091    0.131891    0.161752    0.141749    0.165858    0.210938   

     mel_mean_6  mel_mean_7  mel_mean_8  mel_mean_9

In [11]:
#### autogluon
label = 'place'
eval_metric = 'accuracy'
time_limit = 3600 * 1 # hrs

predictor = TabularPredictor(
    label=label, eval_metric=eval_metric
).fit(test_data, presets='best_quality', time_limit=time_limit, ag_args_fit={'num_gpus': 0, 'num_cpus': 12})

No path specified. Models will be saved in: "AutogluonModels\ag-20230604_024034\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=5, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels\ag-20230604_024034\"
AutoGluon Version:  0.7.0
Python Version:     3.9.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Train Data Rows:    263
Train Data Columns: 768
Label Column: place
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	4 unique label values:  ['KTX서울역 바로위지상', '서울역 KTX 지하대기실', '서울역KTX 지하대기실', '부산역']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Train Data Class Count: 4
Using Feature Generators to 

In [22]:
### leaderboard
predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.913978,1.083753,709.569122,0.0,0.147008,2,True,14
1,XGBoost_BAG_L1,0.897849,0.309414,276.264465,0.309414,276.264465,1,True,11
2,NeuralNetFastAI_BAG_L1,0.897849,0.774339,433.157649,0.774339,433.157649,1,True,10
3,LightGBMXT_BAG_L1,0.88172,0.289712,273.603543,0.289712,273.603543,1,True,3
4,NeuralNetTorch_BAG_L1,0.876344,6.9469,334.759939,6.9469,334.759939,1,True,12
5,LightGBM_BAG_L1,0.870968,0.291355,274.315669,0.291355,274.315669,1,True,4
6,CatBoost_BAG_L1,0.870968,0.743648,978.384512,0.743648,978.384512,1,True,7
7,ExtraTreesGini_BAG_L1,0.865591,0.051349,0.356773,0.051349,0.356773,1,True,8
8,LightGBMLarge_BAG_L1,0.865591,0.292059,309.976674,0.292059,309.976674,1,True,13
9,ExtraTreesEntr_BAG_L1,0.854839,0.049998,0.348104,0.049998,0.348104,1,True,9


In [37]:
#### inference 
model_to_use = predictor.get_model_best()
model_pred = predictor.predict(test_data, model=model_to_use)

In [38]:
#### result
result = pd.DataFrame()

result['place'] = model_pred
result.to_csv('result.csv', index=False)