In [252]:
import random
import pandas as pd
import numpy as np
import os
from glob import glob
import datetime
import matplotlib.pyplot as plt

In [253]:
def make_dataset(all_input_list, all_target_list):
    '''
    Train, Test데이터를 하나의 데이터 프레임으로 변경
    '''
    df_all = pd.DataFrame()
    length = len(all_input_list)
    for idx in range(length):
        X = pd.read_csv(all_input_list[idx])
        y = pd.read_csv(all_target_list[idx])
        y['DAT'] = y['DAT']-1
        df_concat = pd.merge(X, y, on='DAT', how='left')
        df_concat['Case'] = idx+1
        df_all = pd.concat([df_all, df_concat])
    return df_all


def time_value(df):
    ''' 
    ex) 00:59:59 => 01:00:00으로 변환 후 시간단위만 추출
    '''
    df['obs_time'] = pd.to_datetime(df["obs_time"]) + datetime.timedelta(seconds=1)
    df['obs_time'] = df['obs_time'].dt.hour
    return df

def limit_range(df):
    '''
    환경 변수 별 제한 범위를 넘어서는 값을 결측치 처리
    '''
    df.loc[(df['내부온도관측치'] < 4) | (df['내부온도관측치'] > 40), '내부온도관측치'] = np.nan
    df.loc[(df['내부습도관측치'] < 0) | (df['내부습도관측치'] > 100), '내부습도관측치'] = np.nan
    df.loc[(df['co2관측치'] < 0) | (df['co2관측치'] > 1200), 'co2관측치'] = np.nan
    df.loc[(df['ec관측치'] < 0) | (df['ec관측치'] > 8), 'ec관측치'] = np.nan
    df.loc[(df['시간당분무량'] < 0) | (df['시간당분무량'] > 3000), '시간당분무량'] = np.nan
    df.loc[(df['일간누적분무량'] < 0) | (df['일간누적분무량'] > 72000), '일간누적분무량'] = np.nan
    df.loc[(df['시간당백색광량'] < 0) | (df['시간당백색광량'] > 120000), '시간당백색광량'] = np.nan
    df.loc[(df['일간누적백색광량'] < 0) | (df['일간누적백색광량'] > 2880000), '일간누적백색광량'] = np.nan
    df.loc[(df['시간당적색광량'] < 0) | (df['시간당적색광량'] > 120000), '시간당적색광량'] = np.nan
    df.loc[(df['일간누적적색광량'] < 0) | (df['일간누적적색광량'] > 2880000), '일간누적적색광량'] = np.nan
    df.loc[(df['시간당청색광량'] < 0) | (df['시간당청색광량'] > 120000), '시간당청색광량'] = np.nan
    df.loc[(df['일간누적청색광량'] < 0) | (df['일간누적청색광량'] > 2880000), '일간누적청색광량'] = np.nan
    df.loc[(df['시간당총광량'] < 0) | (df['시간당총광량'] > 120000), '시간당총광량'] = np.nan
    df.loc[(df['일간누적총광량'] < 0) | (df['일간누적총광량'] > 2880000), '일간누적총광량'] = np.nan
    return df

def col_cumsum(df, col, cum_col):
    '''
    시간값에 이상치가 있어서 누적값을 새로 생성
    '''
    import itertools
    df[cum_col] = 0
    for i in range(784):
        result = itertools.accumulate(df[col][i*24:(i+1)*24])
        cumsum = [value for value in result]
        df[cum_col][i*24:(i+1)*24] = cumsum
        
    return df

def apply_cumsum(df):
    df = col_cumsum(df, "시간당분무량", "일간누적분무량")
    df = col_cumsum(df, "시간당백색광량", "일간누적백색광량")
    df = col_cumsum(df, "시간당적색광량", "일간누적적색광량")
    df = col_cumsum(df, "시간당청색광량", "일간누적청색광량")
    df = col_cumsum(df, "시간당총광량", "일간누적총광량")
    
    return df

In [254]:
train_input_list = sorted(glob('./data/train_input/*.csv'))
train_target_list = sorted(glob('./data/train_target/*.csv'))

test_input_list = sorted(glob('./data/test_input/*.csv'))
test_target_list = sorted(glob('./data/test_target/*.csv'))

In [255]:
train = make_dataset(train_input_list, train_target_list)
test = make_dataset(test_input_list, test_target_list)

train = time_value(train)
test = time_value(test)

train = limit_range(train)
test = limit_range(test)

train = train.fillna(method='ffill')
test = test.fillna(method='ffill')

In [256]:
test['predicted_weight_g'] = 0

In [257]:
train = col_cumsum(train, "시간당분무량", "일간누적분무량")
train = col_cumsum(train, "시간당백색광량", "일간누적백색광량")
train = col_cumsum(train, "시간당적색광량", "일간누적적색광량")
train = col_cumsum(train, "시간당청색광량", "일간누적청색광량")
train = col_cumsum(train, "시간당총광량", "일간누적총광량")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cum_col][i*24:(i+1)*24] = cumsum


In [258]:
df_a = pd.DataFrame()

for i, v in enumerate(train["Case"].unique()):
    train_old = train[train['Case']==v]
    train_old = train_old.groupby(['DAT']).median().reset_index()
    df_a = pd.concat([df_a, train_old])

In [259]:
df_b = pd.DataFrame()

for i, v in enumerate(test["Case"].unique()):
    test_old = test[test['Case']==v]
    test_old = test_old.groupby(['DAT']).median().reset_index()
    df_b = pd.concat([df_b, test_old])

In [260]:
df_a = df_a.drop(['obs_time', 'Case'], axis=1)
df_b = df_b.drop(['obs_time', 'Case'], axis=1)

In [262]:
train = df_a.copy()
test = df_b.copy()