In [297]:
import re
import uuid

import numpy as np

from src.service.ParseService import ParseService


class SimpleParser(ParseService):
    def __init__(self):
        self.title = ['진동속도(cm/s)', '진동레벨[dB(V)]', '소음[dB(A)]']
        self.other_simple_version = False

    def extract_columns(self, table_list):
        columns = ['구분', '진동속도(cm/s)', '진동레벨[dB(V)]', '소음[dB(A)]', '최저치', '최고치', '최저치', '최고치', '최저치', '최고치', '허용기준', '비고']
        
        if '현장관리기준' in table_list:
            self.other_simple_version = True
            table_list = [i for i in table_list if '현장관리기준' not in i]
            
        return [i for i in table_list if i not in columns and  not re.match(r'[^\w\s-]', i)]

    def conversion_error_value(self, non_columns_list):
        conversion_error_list = []
        
        for item in non_columns_list:
            if self.other_simple_version and re.match(r'n/*?t\(.*?\)', item, re.IGNORECASE):
                conversion_error_list.extend([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
            elif not self.other_simple_version and re.match(r'n/*?t', item, re.IGNORECASE) or item == '-':
                conversion_error_list.append(np.nan)
            else:
                conversion_error_list.append(item)

        return conversion_error_list

    def delete_other_value(self, conversion_error_list):
        filtered_list = []
        skip_count = 0

        for index, item in enumerate(conversion_error_list):
            try:
                if skip_count > 0:
                    skip_count -= 1
                    continue
                elif re.match(r'\d+\.\d+cm/s', item, re.IGNORECASE) and re.match(r'\d+\.\d+cm/s', conversion_error_list[index + 1], re.IGNORECASE):
                    skip_count += 2
                elif re.match(r'\d+\.\d+cm/s', item, re.IGNORECASE) and not re.match(r'\d+\.\d+cm/s', conversion_error_list[index + 1], re.IGNORECASE):
                    skip_count += 1
                else:
                    filtered_list.append(item)
            except TypeError as e:
                filtered_list.append(item)

        return filtered_list

    def classification_by_date(self, filtered_list):
        section_list = []
        current_date_section = []

        for item in filtered_list:
            try:
                if re.match(r'\d+월\d+일', item) and current_date_section:
                    section_list.append(current_date_section)
                    current_date_section = []
                current_date_section.append(item)
            except TypeError as e:
                current_date_section.append(item)

        if current_date_section:
            section_list.append(current_date_section)

        return section_list

    def extract_location(self, classification_list):
        location = []
        data_count = 6

        for items in classification_list:
            for item in items:
                try:
                    if len(location) == 0 and not re.match(r'\d+\.\d+', item) and item is not np.nan and not re.match(r'\d+월\d+일', item):
                        location.append(item)
                    if re.match(r'\d+\.\d+', item) or item is np.nan:
                        data_count -= 1
                    elif data_count == 0 and not re.match(r'\d+월\d+일', item):
                        print(f'장소 아이템 : {item}')
                        location.append(item)
                        data_count = 6
                except TypeError as e:
                    data_count -= 1
        
        print(list(set(location)))

        return list(set(location))

    def get_dict(self, classification_list, location_list):
        result_dict = {}
        value_list = []

        date_key, location_key = None, None

        for items in classification_list:
            for index, item in enumerate(items):
                if isinstance(item, float):
                    item = str(item)

                if re.match(r'\d+월\d+일', item):
                    date_key = item
                elif item not in location_list:
                    value_list.append(item)
                elif item in location_list:
                    location_key = item
                    unique_key = str(uuid.uuid4())
                    result_dict[location_key] = {} if location_key not in result_dict else result_dict[location_key]

                    result_dict[location_key][unique_key] = {'일시': date_key}
                    idx1, idx2 = index + 1, index + 2
                    for j in range(len(self.title)):
                        result_dict[location_key][unique_key][f'{self.title[j]} 최저치'] = float(items[idx1])
                        result_dict[location_key][unique_key][f'{self.title[j]} 최고치'] = float(items[idx2])
                        
                        idx1 += 2
                        idx2 += 2
                    idx1, idx2 = 0, 0
                        
        return result_dict




In [298]:
import clipboard

test_case = SimpleParser()

In [299]:
table_list = list(filter(lambda x: x, clipboard.paste().replace("\r\n", " ").split(" ")))

In [300]:
non_columns_list = test_case.extract_columns(table_list)
print(non_columns_list)

['2월1일', 'S-oil1', 'NT(TL0.03cm/s이하로감지안됨)', '0.1cm/s80dB(A)', '0.1cm/s80dB(A)', 'OK', 'S-oil2', 'NT(TL0.03cm/s이하로감지안됨)', '0.1cm/s80dB(A)', '0.1cm/s80dB(A)', 'OK', 'S-oil3', 'NT(TL0.03cm/s이하로감지안됨)', '0.1cm/s80dB(A)', '0.1cm/s80dB(A)', 'OK', 'S-oil4', 'NT(TL0.03cm/s이하로감지안됨)', '0.1cm/s80dB(A)', '0.1cm/s80dB(A)', 'OK', 'S-oil5', 'NT(TL0.03cm/s이하로감지안됨)', '0.1cm/s80dB(A)', '0.1cm/s80dB(A)', 'OK', '오성기업', '0.0221', '0.0323', '-', '-', '60.0', '61.2', '0.1cm/s80dB(A)', '0.2cm/s80dB(A)', 'OK', '울산종합비즈니스센터', '0.073', '0.156', '62.93', '69.85', '69.44', '78.92', '0.1cm/s80dB(A)', '0.2cm/s80dB(A)', 'OK', '고도화학', '0.0867', '0.1143', '-', '-', '74.8', '75.5', '0.1cm/s80dB(A)', '0.2cm/s80dB(A)', 'OK', '가옥(거남3길34-5)', 'NT(TL0.01cm/s이하로감지안됨)', '0.1cm/s75dB(A)', '0.2cm/s75dB(A)', 'OK', '우사(거남3길34-30)', 'NT(TL0.01cm/s이하로감지안됨)', '0.05cm/s60dB(A)', '0.1cm/s60dB(A)', 'OK', '축사(학남리644-2)', 'NT(TL0.01cm/s이하로감지안됨)', '0.05cm/s60dB(A)', '0.1cm/s60dB(A)', 'OK', '옹벽', 'NT(TL0.03cm/s이하로감지안됨)', '-', '-', 'OK', '태성산업

In [301]:
conversion_error_list = test_case.conversion_error_value(non_columns_list)
print(conversion_error_list)
# 문제없음

['2월1일', 'S-oil1', nan, nan, nan, nan, nan, nan, '0.1cm/s80dB(A)', '0.1cm/s80dB(A)', 'OK', 'S-oil2', nan, nan, nan, nan, nan, nan, '0.1cm/s80dB(A)', '0.1cm/s80dB(A)', 'OK', 'S-oil3', nan, nan, nan, nan, nan, nan, '0.1cm/s80dB(A)', '0.1cm/s80dB(A)', 'OK', 'S-oil4', nan, nan, nan, nan, nan, nan, '0.1cm/s80dB(A)', '0.1cm/s80dB(A)', 'OK', 'S-oil5', nan, nan, nan, nan, nan, nan, '0.1cm/s80dB(A)', '0.1cm/s80dB(A)', 'OK', '오성기업', '0.0221', '0.0323', nan, nan, '60.0', '61.2', '0.1cm/s80dB(A)', '0.2cm/s80dB(A)', 'OK', '울산종합비즈니스센터', '0.073', '0.156', '62.93', '69.85', '69.44', '78.92', '0.1cm/s80dB(A)', '0.2cm/s80dB(A)', 'OK', '고도화학', '0.0867', '0.1143', nan, nan, '74.8', '75.5', '0.1cm/s80dB(A)', '0.2cm/s80dB(A)', 'OK', '가옥(거남3길34-5)', nan, nan, nan, nan, nan, nan, '0.1cm/s75dB(A)', '0.2cm/s75dB(A)', 'OK', '우사(거남3길34-30)', nan, nan, nan, nan, nan, nan, '0.05cm/s60dB(A)', '0.1cm/s60dB(A)', 'OK', '축사(학남리644-2)', nan, nan, nan, nan, nan, nan, '0.05cm/s60dB(A)', '0.1cm/s60dB(A)', 'OK', '옹벽', nan, n

In [302]:
filtered_list = test_case.delete_other_value(conversion_error_list)
print(filtered_list)

['2월1일', 'S-oil1', nan, nan, nan, nan, nan, nan, 'S-oil2', nan, nan, nan, nan, nan, nan, 'S-oil3', nan, nan, nan, nan, nan, nan, 'S-oil4', nan, nan, nan, nan, nan, nan, 'S-oil5', nan, nan, nan, nan, nan, nan, '오성기업', '0.0221', '0.0323', nan, nan, '60.0', '61.2', '울산종합비즈니스센터', '0.073', '0.156', '62.93', '69.85', '69.44', '78.92', '고도화학', '0.0867', '0.1143', nan, nan, '74.8', '75.5', '가옥(거남3길34-5)', nan, nan, nan, nan, nan, nan, '우사(거남3길34-30)', nan, nan, nan, nan, nan, nan, '축사(학남리644-2)', nan, nan, nan, nan, nan, nan, '옹벽', nan, nan, nan, nan, nan, nan, nan, nan, 'OK', '태성산업', '0.096', '0.161', '63.81', '70.92', '70.76', '78.21', '부승화학', '0.0497', '0.0591', nan, nan, '63.2', '65.6', '2월2일', 'S-oil1', nan, nan, nan, nan, nan, nan, 'S-oil2', nan, nan, nan, nan, nan, nan, 'S-oil3', nan, nan, nan, nan, nan, nan, 'S-oil4', nan, nan, nan, nan, nan, nan, 'S-oil5', nan, nan, nan, nan, nan, nan, '오성기업', nan, nan, nan, nan, nan, nan, '울산종합비즈니스센터', '0.118', '0.162', '66.27', '70.58', '71.28', '75

In [303]:
classification_list = test_case.classification_by_date(filtered_list)
print(classification_list)

# 여기까지 문제 없음

[['2월1일', 'S-oil1', nan, nan, nan, nan, nan, nan, 'S-oil2', nan, nan, nan, nan, nan, nan, 'S-oil3', nan, nan, nan, nan, nan, nan, 'S-oil4', nan, nan, nan, nan, nan, nan, 'S-oil5', nan, nan, nan, nan, nan, nan, '오성기업', '0.0221', '0.0323', nan, nan, '60.0', '61.2', '울산종합비즈니스센터', '0.073', '0.156', '62.93', '69.85', '69.44', '78.92', '고도화학', '0.0867', '0.1143', nan, nan, '74.8', '75.5', '가옥(거남3길34-5)', nan, nan, nan, nan, nan, nan, '우사(거남3길34-30)', nan, nan, nan, nan, nan, nan, '축사(학남리644-2)', nan, nan, nan, nan, nan, nan, '옹벽', nan, nan, nan, nan, nan, nan, nan, nan, 'OK', '태성산업', '0.096', '0.161', '63.81', '70.92', '70.76', '78.21', '부승화학', '0.0497', '0.0591', nan, nan, '63.2', '65.6'], ['2월2일', 'S-oil1', nan, nan, nan, nan, nan, nan, 'S-oil2', nan, nan, nan, nan, nan, nan, 'S-oil3', nan, nan, nan, nan, nan, nan, 'S-oil4', nan, nan, nan, nan, nan, nan, 'S-oil5', nan, nan, nan, nan, nan, nan, '오성기업', nan, nan, nan, nan, nan, nan, '울산종합비즈니스센터', '0.118', '0.162', '66.27', '70.58', '71.28', 

In [304]:
location_list = test_case.extract_location(classification_list)
print(location_list)

장소 아이템 : S-oil2
장소 아이템 : S-oil3
장소 아이템 : S-oil4
장소 아이템 : S-oil5
장소 아이템 : 오성기업
장소 아이템 : 울산종합비즈니스센터
장소 아이템 : 고도화학
장소 아이템 : 가옥(거남3길34-5)
장소 아이템 : 우사(거남3길34-30)
장소 아이템 : 축사(학남리644-2)
장소 아이템 : 옹벽
['축사(학남리644-2)', 'S-oil3', 'S-oil2', '가옥(거남3길34-5)', '옹벽', 'S-oil1', '오성기업', 'S-oil5', '우사(거남3길34-30)', '고도화학', '울산종합비즈니스센터', 'S-oil4']
['축사(학남리644-2)', 'S-oil3', 'S-oil2', '가옥(거남3길34-5)', '옹벽', 'S-oil1', '오성기업', 'S-oil5', '우사(거남3길34-30)', '고도화학', '울산종합비즈니스센터', 'S-oil4']


In [271]:
result_dict = test_case.get_dict(classification_list, location_list)
print(result_dict)

{'S-oil1': {'3e0ee70c-eb30-4890-8b75-29fcfb7e6962': {'일시': '2월1일', '진동속도(cm/s) 최저치': nan, '진동속도(cm/s) 최고치': nan, '진동레벨[dB(V)] 최저치': nan, '진동레벨[dB(V)] 최고치': nan, '소음[dB(A)] 최저치': nan, '소음[dB(A)] 최고치': nan}, '0d19568f-90a5-4291-b3d5-19c5ff8e8cae': {'일시': '2월2일', '진동속도(cm/s) 최저치': nan, '진동속도(cm/s) 최고치': nan, '진동레벨[dB(V)] 최저치': nan, '진동레벨[dB(V)] 최고치': nan, '소음[dB(A)] 최저치': nan, '소음[dB(A)] 최고치': nan}, '5f6fe7aa-ba42-4a79-bb0f-4be2ef6f819f': {'일시': '2월3일', '진동속도(cm/s) 최저치': nan, '진동속도(cm/s) 최고치': nan, '진동레벨[dB(V)] 최저치': nan, '진동레벨[dB(V)] 최고치': nan, '소음[dB(A)] 최저치': nan, '소음[dB(A)] 최고치': nan}, 'b0c465d3-de33-4167-a0f3-794af934fb1b': {'일시': '2월6일', '진동속도(cm/s) 최저치': nan, '진동속도(cm/s) 최고치': nan, '진동레벨[dB(V)] 최저치': nan, '진동레벨[dB(V)] 최고치': nan, '소음[dB(A)] 최저치': nan, '소음[dB(A)] 최고치': nan}, '267827b8-5c5e-40a2-8162-2ddf8f42af0f': {'일시': '2월7일', '진동속도(cm/s) 최저치': nan, '진동속도(cm/s) 최고치': nan, '진동레벨[dB(V)] 최저치': nan, '진동레벨[dB(V)] 최고치': nan, '소음[dB(A)] 최저치': nan, '소음[dB(A)] 최고치': nan}, '427c3e9c-f342-4e5

In [272]:
import pandas as pd

for i in result_dict:
    df = pd.DataFrame.from_dict(result_dict[i]).transpose()
    print(df)

                                         일시 진동속도(cm/s) 최저치 진동속도(cm/s) 최고치  \
3e0ee70c-eb30-4890-8b75-29fcfb7e6962   2월1일            NaN            NaN   
0d19568f-90a5-4291-b3d5-19c5ff8e8cae   2월2일            NaN            NaN   
5f6fe7aa-ba42-4a79-bb0f-4be2ef6f819f   2월3일            NaN            NaN   
b0c465d3-de33-4167-a0f3-794af934fb1b   2월6일            NaN            NaN   
267827b8-5c5e-40a2-8162-2ddf8f42af0f   2월7일            NaN            NaN   
427c3e9c-f342-4e57-bc22-417ef1f77ad4   2월8일            NaN          0.044   
946d58a0-f3b1-4801-86da-c5c782eb3739  2월13일          0.035          0.038   
67e6ae89-d3ef-47c4-b3af-aa1b86ebc6a8  2월14일          0.042          0.058   
f34730d0-7544-4ec0-9d35-e53484bf621f  2월15일            NaN            NaN   
e15014a7-beeb-4a7d-9575-160461af522f  2월16일            NaN            NaN   
a222cfe8-0ffc-4de1-910f-931b21e2b831  2월17일            NaN          0.098   
5b3388cf-2b14-47e0-9608-f6eff9b55492  2월20일            NaN           0.04   