In [1]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
'''
dir_2017: KETI-2017-SL-Annotation-v2_1.xlsx 파일 위치
dir_2018: KETI-2018-SL-Annotation-v1.xlsx 파일 위치
dir_video: video 폴더들이 있는 위치 (Video/0001~3000(영상)처럼 Video 내의 영상 하위폴더) ex) ../Video  
dir_csv: 저장하고자 하는 csv 파일 위치 (.csv 미포함)  ex) ../KETI/label
'''

In [2]:
LABEL_PATH = '/hdd1/dataset/KETI_SignLanguage/Annotation'
dir_2017 = os.path.join(LABEL_PATH, 'KETI-2017-SL-Annotation-v2_1.xlsx')
dir_2018 = os.path.join(LABEL_PATH, 'KETI-2018-SL-Annotation-v1.xlsx')
dir_video = '/hdd1/dataset/KETI_SignLanguage/Video'
dir_csv = './'

In [3]:
df1 = pd.read_excel(dir_2017)
df1.drop(columns="Unnamed: 7", inplace=True)
df1.dropna(inplace=True)

df2_sheet1 = pd.read_excel(dir_2018, sheet_name="KETI-2018-수어데이터-학습용-Annotation")
df2_sheet2 = pd.read_excel(dir_2018, sheet_name="KETI-2018-수어데이터-응답용-Anotation")
df2_sheet1.drop(columns="Unnamed: 5", inplace=True)
df2_sheet1.drop(columns="Unnamed: 8", inplace=True)
df2_sheet1.dropna(inplace=True)
df2_sheet2.dropna(inplace=True)

len(df1), len(df2_sheet1), len(df2_sheet2)

(10480, 33012, 387)

In [4]:
df1.isnull().values.any(), df2_sheet1.isnull().values.any(), df2_sheet2.isnull().values.any()

(False, False, False)

In [5]:
df1["파일명"] = df1["파일명"].str.split(".").str.get(0)
df = pd.concat([df1, df2_sheet1, df2_sheet2], ignore_index=True)
df

Unnamed: 0,번호,언어 제공자 ID,취득연도,방향,타입(단어/문장),파일명,한국어
0,419.0,1,2017.0,정면,단어,KETI_SL_0000000419,화재
1,838.0,1,2017.0,측면,단어,KETI_SL_0000000838,화재
2,1255.0,2,2017.0,정면,단어,KETI_SL_0000001255,화재
3,1674.0,2,2017.0,측면,단어,KETI_SL_0000001674,화재
4,2032.0,3,2017.0,정면,단어,KETI_SL_0000002032,화재
...,...,...,...,...,...,...,...
43874,383.0,1,2018.0,우측면,문장,KETI_SL_RES_0000000383.MOV,금방 119 아저씨들이 올거예요
43875,384.0,1,2018.0,우측면,문장,KETI_SL_RES_0000000384.MOV,금방 구급대원들이 도착할테니 잠시만 기다리세요
43876,385.0,1,2018.0,우측면,문장,KETI_SL_RES_0000000385.MOV,지금 사람들이 가고 있어요
43877,386.0,1,2018.0,우측면,문장,KETI_SL_RES_0000000386.MOV,최대한 빨리 출동하도록 하겠습니다


In [6]:
files = glob.glob(dir_video + '/**/*', recursive=True)

fold = pd.DataFrame(files, columns=['directory'])
fold['파일명'] = fold['directory'].str.split("/").str.get(-1)
fold = fold[~fold['파일명'].str.contains('~')]
fold['파일명'] = fold['파일명'].str.split(".").str.get(0)

In [7]:
fold

Unnamed: 0,directory,파일명
18,/hdd1/dataset/KETI_SignLanguage/Video/25565~28...,KETI_SL_0000026889
19,/hdd1/dataset/KETI_SignLanguage/Video/25565~28...,KETI_SL_0000027989
20,/hdd1/dataset/KETI_SignLanguage/Video/25565~28...,KETI_SL_0000027833
21,/hdd1/dataset/KETI_SignLanguage/Video/25565~28...,KETI_SL_0000027271
22,/hdd1/dataset/KETI_SignLanguage/Video/25565~28...,KETI_SL_0000027948
...,...,...
41933,/hdd1/dataset/KETI_SignLanguage/Video/23051~25...,KETI_SL_0000024898
41934,/hdd1/dataset/KETI_SignLanguage/Video/23051~25...,KETI_SL_0000024796
41935,/hdd1/dataset/KETI_SignLanguage/Video/23051~25...,KETI_SL_0000023722
41936,/hdd1/dataset/KETI_SignLanguage/Video/23051~25...,KETI_SL_0000023736


In [8]:
fdf = pd.merge(df, fold, how='right')
fdf.isnull().values.any()

True

In [9]:
fdf.dropna(inplace=True)
fdf.isnull().values.any()

False

In [10]:
# fdf = fdf[(fdf['타입(단어/문장)'] == '단어') & (fdf['방향'] == '정면')]
fdf = fdf[fdf['타입(단어/문장)'] == '단어']

In [11]:
fdf['파일명'] = fdf['파일명'].astype(str)  # 문자열 변환
fdf['한국어'] = fdf['한국어'].astype(str)

label_name = list(set(fdf['한국어'].to_list()))
label_name.sort()
len(label_name)

419

In [12]:
df_label = pd.DataFrame({'label': label_name})
df_label['label_index'] = df_label.index
df_label

Unnamed: 0,label,label_index
0,0,0
1,1,1
2,10,2
3,100,3
4,1000,4
...,...,...
414,화상,414
415,화약,415
416,화요일,416
417,화장실,417


In [13]:
fdf = pd.merge(fdf, df_label, how='left', left_on='한국어', right_on='label')
fdf

Unnamed: 0,번호,언어 제공자 ID,취득연도,방향,타입(단어/문장),파일명,한국어,directory,label,label_index
0,26889.0,24,2018.0,정면,단어,KETI_SL_0000026889,67,/hdd1/dataset/KETI_SignLanguage/Video/25565~28...,67,69
1,27989.0,24,2018.0,우측면,단어,KETI_SL_0000027989,위협,/hdd1/dataset/KETI_SignLanguage/Video/25565~28...,위협,326
2,27833.0,24,2018.0,우측면,단어,KETI_SL_0000027833,누수,/hdd1/dataset/KETI_SignLanguage/Video/25565~28...,누수,171
3,27271.0,24,2018.0,좌측면,단어,KETI_SL_0000027271,30,/hdd1/dataset/KETI_SignLanguage/Video/25565~28...,30,29
4,27948.0,24,2018.0,우측면,단어,KETI_SL_0000027948,약국,/hdd1/dataset/KETI_SignLanguage/Video/25565~28...,약국,284
...,...,...,...,...,...,...,...,...,...,...
33512,24898.0,22,2018.0,좌측면,단어,KETI_SL_0000024898,농약,/hdd1/dataset/KETI_SignLanguage/Video/23051~25...,농약,169
33513,24796.0,22,2018.0,좌측면,단어,KETI_SL_0000024796,69,/hdd1/dataset/KETI_SignLanguage/Video/23051~25...,69,71
33514,23722.0,21,2018.0,좌측면,단어,KETI_SL_0000023722,서울시,/hdd1/dataset/KETI_SignLanguage/Video/23051~25...,서울시,248
33515,23736.0,21,2018.0,좌측면,단어,KETI_SL_0000023736,송파구,/hdd1/dataset/KETI_SignLanguage/Video/23051~25...,송파구,262


In [14]:
signer_ids = np.unique(fdf['언어 제공자 ID'])
len(signer_ids)

30

In [15]:
signer_ids

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], dtype=object)

In [16]:
num_data_per_signer = [len(fdf[fdf['언어 제공자 ID']==i]) for i in range(1, 31)]
print(num_data_per_signer)

[838, 838, 838, 838, 838, 838, 838, 838, 838, 838, 1257, 1257, 1256, 1257, 1257, 1257, 1257, 1255, 1257, 1257, 1257, 1257, 1257, 1257, 1257, 1257, 1257, 1257, 1257, 1257]


In [None]:
419 * 30

In [None]:
# for i in range(1, 31):
#     df_signer = fdf[fdf['언어 제공자 ID']==i]
#     print(np.unique(df_signer['label_index'], return_counts=True))
#     print(len(np.unique(df_signer['label_index'])))

In [17]:
fdf_train = fdf.loc[fdf['언어 제공자 ID'] > 10, ['파일명', 'label_index']]
fdf_val = fdf.loc[fdf['언어 제공자 ID'] <= 10, ['파일명', 'label_index']]
len(fdf_train), len(fdf_val)

(25137, 8380)

In [18]:
fdf_train.to_csv('./label_KETI_train_all.csv', mode='w', index=None, header=False)
fdf_val.to_csv('./label_KETI_val_all.csv', mode='w', index=None, header=False)