In [64]:
import random
import pandas as pd
import numpy as np
import os
import glob
import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

def limit_range(df):
    '''
    환경 변수 별 제한 범위를 넘어서는 값을 결측치 처리
    '''
    df.loc[(df['내부온도관측치'] < 4) | (df['내부온도관측치'] > 40), '내부온도관측치'] = np.nan
    df.loc[(df['내부습도관측치'] < 0) | (df['내부습도관측치'] > 100), '내부습도관측치'] = np.nan
    df.loc[(df['co2관측치'] < 0) | (df['co2관측치'] > 1200), 'co2관측치'] = np.nan
    df.loc[(df['ec관측치'] < 0) | (df['ec관측치'] > 8), 'ec관측치'] = np.nan
    df.loc[(df['시간당분무량'] < 0) | (df['시간당분무량'] > 3000), '시간당분무량'] = np.nan
    df.loc[(df['일간누적분무량'] < 0) | (df['일간누적분무량'] > 72000), '일간누적분무량'] = np.nan
    df.loc[(df['시간당백색광량'] < 0) | (df['시간당백색광량'] > 120000), '시간당백색광량'] = np.nan
    df.loc[(df['일간누적백색광량'] < 0) | (df['일간누적백색광량'] > 2880000), '일간누적백색광량'] = np.nan
    df.loc[(df['시간당적색광량'] < 0) | (df['시간당적색광량'] > 120000), '시간당적색광량'] = np.nan
    df.loc[(df['일간누적적색광량'] < 0) | (df['일간누적적색광량'] > 2880000), '일간누적적색광량'] = np.nan
    df.loc[(df['시간당청색광량'] < 0) | (df['시간당청색광량'] > 120000), '시간당청색광량'] = np.nan
    df.loc[(df['일간누적청색광량'] < 0) | (df['일간누적청색광량'] > 2880000), '일간누적청색광량'] = np.nan
    df.loc[(df['시간당총광량'] < 0) | (df['시간당총광량'] > 120000), '시간당총광량'] = np.nan
    df.loc[(df['일간누적총광량'] < 0) | (df['일간누적총광량'] > 2880000), '일간누적총광량'] = np.nan
    return df

def time_value(df):
    ''' 
    ex) 00:59:59 => 01:00:00으로 변환 후 시간단위만 추출
    '''
    df['obs_time'] = pd.to_datetime(df["obs_time"]) + datetime.timedelta(seconds=1)
    df['obs_time'] = df['obs_time'].dt.hour
    return df


def col_cumsum(df, col, cum_col):
    '''
    시간값에 이상치가 있어서 누적값을 새로 생성
    '''
    import itertools
    df[cum_col] = 0
    for i in range(28):
        result = itertools.accumulate(df[col][i*24:(i+1)*24])
        cumsum = [value for value in result]
        df[cum_col][i*24:(i+1)*24] = cumsum
        
    return df

def make_col_data(input_path):
    df_new = pd.DataFrame()
    all_input_list = sorted(glob.glob(input_path))
    for path in all_input_list:
        df = pd.read_csv(path)
        df = time_value(df)
        df = limit_range(df)
        df = df.fillna(method='ffill')
        df = df.iloc[:, 1:]
        df = col_cumsum(df, "시간당분무량", "일간누적분무량")
        df = col_cumsum(df, "시간당백색광량", "일간누적백색광량")
        df = col_cumsum(df, "시간당적색광량", "일간누적적색광량")
        df = col_cumsum(df, "시간당청색광량", "일간누적청색광량")
        df = col_cumsum(df, "시간당총광량", "일간누적총광량")
        col_list = df.columns[1:]
        for i in range(0,28):    
            day = df.iloc[24*i:24*i+24]
            time_list = day['obs_time'].unique()
            for col in col_list:
                for time in time_list:
                    value = day[day['obs_time']==time][col].iloc[0]
                    df[col+str(time)+'시'] = value           
            nx = df.iloc[:1, 15:]
            df_new = pd.concat([df_new, nx]).reset_index(drop=True)

    return df_new

input_path = './data/test_input/*.csv'
test = make_col_data(input_path)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cum_col][i*24:(i+1)*24] = cumsum
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'시'] = value
  df[col+str(time)+'