# LSTM을 수행하기위해서 기존 데이터 형식 맞추기

In [37]:
# 환경변수 불러오기
import pandas as pd
from dotenv import load_dotenv
import os

load_dotenv()

True

In [38]:
# 경로 설정
file_path = os.getenv('FILE_PATH')
save_path = os.getenv('SAVE_PATH')

## 1. 종침후_TN_9시간_비교

In [39]:
try:
    tn = pd.read_csv(file_path+'lstm_data/sewageMeasurement/종침후_TN_9시간_비교.csv', encoding='utf-8')
except UnicodeDecodeError:
    try:
        tn = pd.read_csv(file_path+'lstm_data/sewageMeasurement/종침후_TN_9시간_비교.csv', encoding='cp949')
    except Exception as e:
        print(f"Error: {e}")

In [40]:
tn

Unnamed: 0,_time_gateway,Sensor_S4_TN_9h,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,9/1/22 0:00,8.00,,,
1,9/1/22 0:10,7.97,,,
2,9/1/22 0:20,8.25,,,
3,9/1/22 0:30,8.01,,,
4,9/1/22 0:40,7.95,,,
...,...,...,...,...,...
51915,8/31/23 23:10,8.05,,,
51916,8/31/23 23:20,8.07,,,
51917,8/31/23 23:30,8.06,,,
51918,8/31/23 23:40,8.08,,,


In [42]:
# 필요한 열 제외 제거
tn = tn.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [43]:
tn

Unnamed: 0,_time_gateway,Sensor_S4_TN_9h
0,9/1/22 0:00,8.00
1,9/1/22 0:10,7.97
2,9/1/22 0:20,8.25
3,9/1/22 0:30,8.01
4,9/1/22 0:40,7.95
...,...,...
51915,8/31/23 23:10,8.05
51916,8/31/23 23:20,8.07
51917,8/31/23 23:30,8.06
51918,8/31/23 23:40,8.08


## 종침후_TN_9시간_비교.csv의 형식

_time_gateway Sensor_S4_TN_9h     
9/1/22 0:00, 8.00  
9/1/22 0:10, 7.97  
9/1/22 0:20, 8.25  

- datetime 인덱스를 사용하기위해 수정 필요

### _Time_gateway 변환

- YYYY/MM/DD HH:MM 형식

In [44]:
# 날짜 형식 변환 함수
def convert_date_format(date_str):
    return pd.to_datetime(date_str, format='%m/%d/%y %H:%M').strftime('%Y/%m/%d %H:%M')

In [45]:
# 날짜 형식 변환 적용
tn['_time_gateway'] = tn['_time_gateway'].apply(convert_date_format)

In [60]:
tn

Unnamed: 0,_time_gateway,Sensor_S4_TN_9h
0,2022/09/01 00:00,8.00
1,2022/09/01 00:10,7.97
2,2022/09/01 00:20,8.25
3,2022/09/01 00:30,8.01
4,2022/09/01 00:40,7.95
...,...,...
51915,2023/08/31 23:10,8.05
51916,2023/08/31 23:20,8.07
51917,2023/08/31 23:30,8.06
51918,2023/08/31 23:40,8.08


### 학습 및 예측을 위한 데이터셋 분리

In [63]:
# tn 학습 데이터 생성
tn_train = tn.iloc[0:29888]

In [64]:
tn_train

Unnamed: 0,_time_gateway,Sensor_S4_TN_9h
0,2022/09/01 00:00,8.00
1,2022/09/01 00:10,7.97
2,2022/09/01 00:20,8.25
3,2022/09/01 00:30,8.01
4,2022/09/01 00:40,7.95
...,...,...
29883,2023/03/31 23:10,14.42
29884,2023/03/31 23:20,14.44
29885,2023/03/31 23:30,14.36
29886,2023/03/31 23:40,14.25


In [65]:
# tn 예측 데이터 생성
tn_predict = tn.iloc[29888:]

In [66]:
tn_predict

Unnamed: 0,_time_gateway,Sensor_S4_TN_9h
29888,2023/04/01 00:00,14.15
29889,2023/04/01 00:10,14.06
29890,2023/04/01 00:20,14.01
29891,2023/04/01 00:30,14.01
29892,2023/04/01 00:40,13.96
...,...,...
51915,2023/08/31 23:10,8.05
51916,2023/08/31 23:20,8.07
51917,2023/08/31 23:30,8.06
51918,2023/08/31 23:40,8.08


In [67]:
# 분석을 위한 데이터셋 저장
tn_train.to_csv(save_path+"LSTM_pipeline/Input/tn_train.csv",index=False)
tn_predict.to_csv(save_path+"LSTM_pipeline/Input/tn_predict.csv",index=False)

## 2. 종침후_TOC_9시간_비교

In [28]:
try:
    toc = pd.read_csv(file_path+'lstm_data/sewageMeasurement/종침후_TOC_9시간_비교.csv', encoding='utf-8')
except UnicodeDecodeError:
    try:
        toc = pd.read_csv(file_path+'lstm_data/sewageMeasurement/종침후_TOC_9시간_비교.csv', encoding='cp949')
    except Exception as e:
        print(f"Error: {e}")

In [29]:
toc

Unnamed: 0,_time_gateway,Sensor_S4_TOC_9h,_time_gateway.1,pred_Sensor_S4_TOC_9h_xgboost,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,9/1/22 0:00,7.0,,,,,,,,,,,,,
1,9/1/22 0:10,7.0,,,,,,,,,,,,,
2,9/1/22 0:20,7.1,,,,,,,,,,,,,
3,9/1/22 0:30,7.0,,,,,,,,,,,,,
4,9/1/22 0:40,6.9,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51915,8/31/23 23:10,5.9,8/31/23 23:10,5.482946,,,,,,,,,,,
51916,8/31/23 23:20,5.8,8/31/23 23:20,5.476313,,,,,,,,,,,
51917,8/31/23 23:30,5.9,8/31/23 23:30,5.465485,,,,,,,,,,,
51918,8/31/23 23:40,5.9,8/31/23 23:40,5.455047,,,,,,,,,,,


In [30]:
# 필요한 열 제외 제거
toc = toc[['_time_gateway', 'Sensor_S4_TOC_9h']]

In [31]:
toc

Unnamed: 0,_time_gateway,Sensor_S4_TOC_9h
0,9/1/22 0:00,7.0
1,9/1/22 0:10,7.0
2,9/1/22 0:20,7.1
3,9/1/22 0:30,7.0
4,9/1/22 0:40,6.9
...,...,...
51915,8/31/23 23:10,5.9
51916,8/31/23 23:20,5.8
51917,8/31/23 23:30,5.9
51918,8/31/23 23:40,5.9


## 종침후_TOC_9시간_비교.csv의 형식

_time_gateway Sensor_S4_TN_9h     
9/1/22 0:00, 7.0  
9/1/22 0:10, 7.0  
9/1/22 0:20, 7.1  

- datetime 인덱스를 사용하기위해 수정 필요

### _Time_gateway 변환

- YYYY/MM/DD HH:MM 형식

In [32]:
# 날짜 형식 변환 함수
def convert_date_format(date_str):
    return pd.to_datetime(date_str, format='%m/%d/%y %H:%M').strftime('%Y/%m/%d %H:%M')

In [33]:
# 날짜 형식 변환 적용
toc['_time_gateway'] = toc['_time_gateway'].apply(convert_date_format)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toc['_time_gateway'] = toc['_time_gateway'].apply(convert_date_format)


In [70]:
toc

Unnamed: 0,_time_gateway,Sensor_S4_TOC_9h
0,2022/09/01 00:00,7.0
1,2022/09/01 00:10,7.0
2,2022/09/01 00:20,7.1
3,2022/09/01 00:30,7.0
4,2022/09/01 00:40,6.9
...,...,...
51915,2023/08/31 23:10,5.9
51916,2023/08/31 23:20,5.8
51917,2023/08/31 23:30,5.9
51918,2023/08/31 23:40,5.9


### 학습 및 예측을 위한 데이터셋 분리

In [71]:
# toc 학습 데이터 생성
toc_train = toc.iloc[0:29888]

In [72]:
toc_train

Unnamed: 0,_time_gateway,Sensor_S4_TOC_9h
0,2022/09/01 00:00,7.0
1,2022/09/01 00:10,7.0
2,2022/09/01 00:20,7.1
3,2022/09/01 00:30,7.0
4,2022/09/01 00:40,6.9
...,...,...
29883,2023/03/31 23:10,9.2
29884,2023/03/31 23:20,9.2
29885,2023/03/31 23:30,9.2
29886,2023/03/31 23:40,9.1


In [73]:
# tn 예측 데이터 생성
toc_predict = toc.iloc[29888:]

In [74]:
toc_predict

Unnamed: 0,_time_gateway,Sensor_S4_TOC_9h
29888,2023/04/01 00:00,9.1
29889,2023/04/01 00:10,9.1
29890,2023/04/01 00:20,9.1
29891,2023/04/01 00:30,9.1
29892,2023/04/01 00:40,9.0
...,...,...
51915,2023/08/31 23:10,5.9
51916,2023/08/31 23:20,5.8
51917,2023/08/31 23:30,5.9
51918,2023/08/31 23:40,5.9


In [75]:
# 분석을 위한 데이터셋 저장
toc_train.to_csv(save_path+"LSTM_pipeline/Input/toc_train.csv",index=False)
toc_predict.to_csv(save_path+"LSTM_pipeline/Input/toc_predict.csv",index=False)