In [1]:
import pandas as pd
import glob
from google.colab import drive
import pandas as pd
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import glob
import math
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/sklearn deprecation and performance warnings.
warnings.filterwarnings('ignore')

In [39]:
def preprocessing(input_path):
    """Clean every CSV matched by ``input_path`` and return the stacked result.

    For each file, in sorted path order, the function:
      1. rewrites ``obs_time`` as ``row index % 24``;
      2. takes the absolute value of the whole frame and caps the sensor
         columns at fixed upper limits;
      3. raises temperature / humidity readings that fall below fixed floors;
      4. rebuilds ``시간당총광량`` as blue + white + red light;
      5. recomputes every column whose name contains ``누적`` as the running
         sum, restarted at each ``obs_time == 0`` row, of the column
         immediately to its left;
      6. adds five ``...합`` columns holding the running sum over the whole file.

    WARNING: every cleaned frame is written back over its source file
    (``to_csv(datapath)``), so the raw inputs are replaced and a second call
    re-processes files that were already cleaned.

    Args:
        input_path: glob pattern selecting the input CSV files.

    Returns:
        One DataFrame with all cleaned frames concatenated in file order (each
        file keeps its own row index), or an empty DataFrame when nothing
        matches the pattern.
    """
    # Upper limits applied after abs(); values above a limit are set to it.
    upper_limits = {
        '내부온도관측치': 40,
        '내부습도관측치': 100,
        'co2관측치': 1200,
        'ec관측치': 8,
        '시간당분무량': 1499.42,
        '시간당백색광량': 18255.19,
        '시간당적색광량': 9763.32,
        '시간당청색광량': 9242.35,
        '시간당총광량': 120000,
    }
    # Floors for readings that were observed dropping to ~0 (outlier handling).
    lower_limits = {
        '내부온도관측치': 6.921053,
        '내부습도관측치': 9.639473,
    }
    # New whole-file running-sum column -> hourly column it accumulates.
    running_totals = {
        '일간누적분무량합': '시간당분무량',
        '일간누적백색광량합': '시간당백색광량',
        '일간누적적색광량합': '시간당적색광량',
        '일간누적청색광량합': '시간당청색광량',
        '일간누적총광량합': '시간당총광량',
    }

    cleaned_frames = []
    for datapath in sorted(glob.glob(input_path)):
        data = pd.read_csv(datapath)

        # Normalise obs_time to the hour of day derived from the row index.
        data['obs_time'] = data.index % 24

        df = abs(data)
        for col, cap in upper_limits.items():
            df.loc[df[col] > cap, col] = cap
        for col, floor in lower_limits.items():
            df.loc[df[col] < floor, col] = floor

        # The capped colour channels may no longer add up to the stored total,
        # so the total is rebuilt from them.
        df['시간당총광량'] = df['시간당청색광량'] + df['시간당백색광량'] + df['시간당적색광량']

        # Rebuild the per-day cumulative columns from their (cleaned) hourly
        # neighbour; a new group starts at every obs_time == 0 row.
        day_id = (df['obs_time'] == 0).cumsum()
        columns = list(df.columns)
        for pos, col in enumerate(columns):
            if '누적' in col:
                df[col] = df.groupby(day_id)[columns[pos - 1]].cumsum()

        # Running totals over the whole file (not restarted per day).
        for total_col, hourly_col in running_totals.items():
            df[total_col] = df[hourly_col].cumsum().values

        df.to_csv(datapath, index=False)
        cleaned_frames.append(df)

    print('finish!!')
    # Concatenate once instead of growing a frame inside the loop.
    if not cleaned_frames:
        return pd.DataFrame()
    return pd.concat(cleaned_frames)

In [42]:
# Clean the training inputs and keep the stacked frame.
# NOTE: preprocessing() overwrites the CSV files it reads, so re-running this
# cell re-processes files that were already cleaned.
traininput = preprocessing('drive/MyDrive/dacon/robust/상추/train_input/*.csv')

finish!!


In [43]:
# Same cleaning for the test inputs (these files are also overwritten in place).
testinput = preprocessing('drive/MyDrive/dacon/robust/상추/test_input/*.csv')

finish!!


In [44]:
# Show the stacked, cleaned training frame.
traininput

Unnamed: 0,DAT,obs_time,내부온도관측치,내부습도관측치,co2관측치,ec관측치,시간당분무량,일간누적분무량,시간당백색광량,일간누적백색광량,...,일간누적적색광량,시간당청색광량,일간누적청색광량,시간당총광량,일간누적총광량,일간누적분무량합,일간누적백색광량합,일간누적적색광량합,일간누적청색광량합,일간누적총광량합
0,0,0,25.300000,81.835000,536.016667,1.407439,0.0,0.00,0.0000,0.000,...,0.0000,0.0,0.0000,0.0000,0.0000,0.00,0.000000e+00,0.0000,0.0000,0.000000e+00
1,0,1,25.680357,81.264286,528.696429,1.409003,126.0,126.00,0.0000,0.000,...,0.0000,0.0,0.0000,0.0000,0.0000,126.00,0.000000e+00,0.0000,0.0000,0.000000e+00
2,0,2,25.273333,81.471666,532.833333,1.406913,0.0,126.00,0.0000,0.000,...,0.0000,0.0,0.0000,0.0000,0.0000,126.00,0.000000e+00,0.0000,0.0000,0.000000e+00
3,0,3,25.355000,81.398334,545.566667,1.406689,126.0,252.00,0.0000,0.000,...,0.0000,0.0,0.0000,0.0000,0.0000,252.00,0.000000e+00,0.0000,0.0000,0.000000e+00
4,0,4,25.391667,81.483333,558.583333,1.411070,0.0,252.00,0.0000,0.000,...,0.0000,0.0,0.0000,0.0000,0.0000,252.00,0.000000e+00,0.0000,0.0000,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,27,19,26.030000,58.736667,448.500000,1.195415,0.0,2543.12,12.3764,146722.222,...,22253.7504,0.0,10462.6535,12.3764,179438.6259,363575.67,4.893663e+06,202335.7056,95125.7125,5.191124e+06
668,27,20,27.341666,58.373334,449.183333,1.190780,126.0,2669.12,0.0000,146722.222,...,22253.7504,0.0,10462.6535,0.0000,179438.6259,363701.67,4.893663e+06,202335.7056,95125.7125,5.191124e+06
669,27,21,27.785000,58.711667,441.933333,1.185593,0.0,2669.12,0.0000,146722.222,...,22253.7504,0.0,10462.6535,0.0000,179438.6259,363701.67,4.893663e+06,202335.7056,95125.7125,5.191124e+06
670,27,22,28.480000,58.121667,437.600000,1.179664,0.0,2669.12,0.0000,146722.222,...,22253.7504,0.0,10462.6535,0.0000,179438.6259,363701.67,4.893663e+06,202335.7056,95125.7125,5.191124e+06


In [45]:
colname = traininput.columns

# Import only the scaler that is used instead of star-importing the module.
from sklearn.preprocessing import RobustScaler

# The first two columns (DAT and obs_time in the frame shown above) are left
# unscaled; every remaining column is robust-scaled.
# The scaler is fitted once on the training frame and reused for the test
# frame, so test rows are scaled with training statistics.
scaler = RobustScaler()
scaler.fit(traininput.iloc[:,2:])
trainsd = scaler.transform(traininput.iloc[:,2:])
testsd = scaler.transform(testinput.iloc[:,2:])

trainsd = pd.DataFrame(trainsd,columns=colname[2:]).reset_index(drop=True)
testsd = pd.DataFrame(testsd,columns=colname[2:]).reset_index(drop=True)

# Re-attach the two unscaled columns by row position.
train=pd.merge(traininput.iloc[:,:2].reset_index(drop=True),trainsd,left_index=True, right_index=True)
test=pd.merge(testinput.iloc[:,:2].reset_index(drop=True),testsd,left_index=True, right_index=True)

In [47]:
def preprocessing2(input_path, data, rows_per_file=24*28):
    """Split ``data`` into consecutive row chunks and overwrite each matched CSV.

    The files matched by ``input_path`` are taken in sorted order; the k-th
    file is overwritten with rows ``[k * rows_per_file, (k + 1) * rows_per_file)``
    of ``data`` (selected by position), written without the index.

    Args:
        input_path: glob pattern selecting the CSV files to overwrite.
        data: DataFrame holding the rows of all files stacked in the same
            sorted file order.
        rows_per_file: number of rows belonging to each file. Defaults to
            ``24 * 28``, the value that was previously hard-coded.

    Returns:
        None. Prints a completion message.
    """
    for file_no, csv_path in enumerate(sorted(glob.glob(input_path))):
        chunk = data.iloc[rows_per_file * file_no : rows_per_file * (file_no + 1)]
        chunk.to_csv(csv_path, index=False)
    print("완료~!")

In [48]:
# Write the scaled training frame back over the per-case training input CSVs.
preprocessing2('drive/MyDrive/dacon/robust/상추/train_input/*.csv',train)

완료~!


In [49]:
# Write the scaled test frame back over the per-case test input CSVs.
preprocessing2('drive/MyDrive/dacon/robust/상추/test_input/*.csv',test)

완료~!


In [50]:
def make_dataset(input_path, target_path):
    """Build one feature row per (case file, day) and attach that day's target.

    Input and target files are paired in sorted path order. For every input
    file the first column is dropped, and for each of 28 consecutive 24-row
    days the following features are generated from the remaining columns
    (``obs_time`` itself excluded):

    * cumulative columns (the names listed in ``cumulative_cols``): the value at
      each hour from the 12th distinct hour of the day onward, named
      ``<column><hour>``;
    * every other column: the sum over each window of four consecutive hours,
      named ``<column><first hour>~<last hour>``.

    Each output row consists of the file's first-row values for the input
    columns at position 15 and later, then the generated features, then the
    columns of the target row for that day (matched by position).

    Compared with the previous implementation this builds each row directly
    instead of adding scalar columns to the input frame one at a time, and
    concatenates all rows once. Output is the same for complete 24-hour days;
    a day with missing hours now yields NaN for the missing features instead
    of silently keeping the previous day's values.

    Args:
        input_path: glob pattern for the input CSV files.
        target_path: glob pattern for the target CSV files.

    Returns:
        DataFrame with one row per (file, day) and a fresh 0..n-1 index, or an
        empty DataFrame when no rows were produced.
    """
    cumulative_cols = {
        '일간누적백색광량', '일간누적청색광량', '일간누적적색광량', '일간누적분무량', '일간누적총광량',
        '일간누적분무량합', '일간누적백색광량합', '일간누적적색광량합', '일간누적청색광량합', '일간누적총광량합',
    }
    hours_per_day = 24
    n_days = 28
    first_kept = 15  # positions before this are dropped from every output row

    frames = []
    input_files = sorted(glob.glob(input_path))
    target_files = sorted(glob.glob(target_path))
    for input_file, target_file in zip(input_files, target_files):
        x = pd.read_csv(input_file).iloc[:, 1:]
        y = pd.read_csv(target_file)
        if len(x) == 0:
            continue
        feature_cols = list(x.columns[1:])
        # First-row values of the input columns; only those at position
        # >= first_kept survive the slice below.
        first_row = [(col, x[col].iloc[0]) for col in x.columns]

        for day_no in range(n_days):
            day = x.iloc[hours_per_day * day_no : hours_per_day * (day_no + 1)]
            # One row per distinct hour, in order of first appearance.
            hourly = day.drop_duplicates(subset='obs_time')
            hours = hourly['obs_time'].to_numpy()

            generated = []
            for col in feature_cols:
                values = hourly[col].to_numpy()
                if col in cumulative_cols:
                    generated.extend(
                        (col + str(hours[t]), values[t])
                        for t in range(11, len(hours))
                    )
                else:
                    # Sliding 4-hour sums, added left to right.
                    window_sums = values[:-3] + values[1:-2] + values[2:-1] + values[3:]
                    generated.extend(
                        (col + str(hours[t]) + str("~") + str(hours[t + 3]), total)
                        for t, total in enumerate(window_sums)
                    )

            nx = pd.DataFrame([dict((first_row + generated)[first_kept:])])
            ny = y.iloc[day_no:day_no + 1].reset_index(drop=True)
            frames.append(pd.merge(nx, ny, left_index=True, right_index=True))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames).reset_index(drop=True)

In [51]:
# Glob patterns for the per-case input and target CSV files.
train_input_path, train_target_path = 'drive/MyDrive/dacon/robust/상추/train_input/*.csv', 'drive/MyDrive/dacon/robust/상추/train_target/*.csv'
test_input_path, test_target_path = 'drive/MyDrive/dacon/robust/상추/test_input/*.csv', 'drive/MyDrive/dacon/robust/상추/test_target/*.csv'

# Build the per-day feature tables; note that `train` / `test` are rebound here
# and no longer hold the scaled hourly frames from the scaling cell.
train = make_dataset(train_input_path, train_target_path)
test = make_dataset(test_input_path, test_target_path)

In [52]:
# Show the per-day training feature table.
train

Unnamed: 0,일간누적분무량합,일간누적백색광량합,일간누적적색광량합,일간누적청색광량합,일간누적총광량합,내부온도관측치0~3,내부온도관측치1~4,내부온도관측치2~5,내부온도관측치3~6,내부온도관측치4~7,...,일간누적총광량합16,일간누적총광량합17,일간누적총광량합18,일간누적총광량합19,일간누적총광량합20,일간누적총광량합21,일간누적총광량합22,일간누적총광량합23,DAT,predicted_weight_g
0,-0.472784,-0.535042,-0.273312,-0.075162,-1.011549,-0.860367,-0.838201,-0.912040,-0.630730,-0.242620,...,-0.938941,-0.932320,-0.925699,-0.919079,-0.918813,-0.918813,-0.918813,-0.918813,1,0.167719
1,-0.472784,-0.535042,-0.273312,-0.075162,-1.011549,-0.560757,-0.651437,-0.790327,-0.422368,0.130982,...,-0.846098,-0.839478,-0.832857,-0.826236,-0.825852,-0.825852,-0.825852,-0.825852,2,0.181787
2,-0.472784,-0.535042,-0.273312,-0.075162,-1.011549,-0.261507,-0.367502,-0.539647,-0.174509,0.375214,...,-0.753307,-0.746686,-0.740065,-0.733445,-0.733108,-0.733108,-0.733108,-0.733108,3,0.265921
3,-0.472784,-0.535042,-0.273312,-0.075162,-1.011549,-0.825056,-0.864552,-0.994660,-0.614609,-0.068514,...,-0.660393,-0.653772,-0.647152,-0.640531,-0.640146,-0.640146,-0.640146,-0.640146,4,0.423650
4,-0.472784,-0.535042,-0.273312,-0.075162,-1.011549,-0.846498,-0.848513,-0.861259,-0.621864,-0.291385,...,-0.567563,-0.560942,-0.554322,-0.547701,-0.547364,-0.547364,-0.547364,-0.547364,5,0.475272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,-0.472784,-0.535042,-0.273312,-0.075162,-1.011549,4.105442,4.245290,4.357481,4.446146,4.658942,...,0.520231,0.525621,0.528466,0.528470,0.528470,0.528470,0.528470,0.528470,24,64.875499
780,-0.472784,-0.535042,-0.273312,-0.075162,-1.011549,-2.488526,-2.493765,-2.531385,-2.552746,-2.304081,...,0.582084,0.587474,0.590252,0.590254,0.590254,0.590254,0.590254,0.590254,25,74.002614
781,-0.472784,-0.535042,-0.273312,-0.075162,-1.011549,-2.116467,-2.120094,-2.162620,-2.227909,-2.077179,...,0.643788,0.649178,0.652022,0.652025,0.652025,0.652025,0.652025,0.652025,26,76.342275
782,-0.472784,-0.535042,-0.273312,-0.075162,-1.011549,-2.198891,-2.219043,-2.234760,-2.260957,-2.103375,...,0.705568,0.710957,0.713772,0.713776,0.713776,0.713776,0.713776,0.713776,27,82.621245


In [21]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [53]:
# Persist the per-day feature tables to Drive (index not written).
train.to_csv('drive/MyDrive/dacon/robust/상추/train_robust_time4.csv',index=False)
test.to_csv('drive/MyDrive/dacon/robust/상추/test_robust_time4.csv',index=False)

In [29]:
# Reload the feature tables written by the previous cell.
# This cell used to read 'drive/MyDrive/dacon/data_re/상추/*_log_time4.csv', a
# different feature set; run top to bottom, that silently replaced the
# robust-scaled tables that the following cells are meant to use.
train = pd.read_csv('drive/MyDrive/dacon/robust/상추/train_robust_time4.csv')
test = pd.read_csv('drive/MyDrive/dacon/robust/상추/test_robust_time4.csv')

In [54]:
# Pairwise correlation between all columns of the training table, rounded to 2 decimals.
corr_df = train.corr()
corr_df = corr_df.apply(lambda x: round(x ,2))
# Flatten to a Series keyed by (column, row) so the target's correlations can be selected.
s = corr_df.unstack()
# Correlation of every column with the target, as a one-column frame (column label 0);
# NaN correlations are treated as 0.
성장 = pd.DataFrame(s['predicted_weight_g']).sort_values(by=0).fillna(0)
# Keep only the magnitude of the correlation.
성장[0] = abs(성장[0])
# Save the ranking (strongest first) for inspection.
성장.sort_values(by=[0],ascending=False).reset_index().to_csv('target값이랑corr.csv',index=False,encoding='utf-8-sig')
# Columns whose absolute correlation with the target is below 0.1
# (the threshold is 0.1, i.e. 10%, despite the variable name).
under_1per = 성장[성장[0]<0.1].index
under_1per

Index(['내부온도관측치6~9', '내부온도관측치1~4', '일간누적총광량11', '시간당총광량17~20', '시간당백색광량19~22',
       '내부온도관측치19~22', '시간당백색광량20~23', '시간당총광량5~8', '내부온도관측치2~5', '내부온도관측치5~8',
       '내부온도관측치20~23', '내부온도관측치3~6', '내부온도관측치4~7', '일간누적청색광량합', '시간당백색광량0~3',
       '일간누적총광량합', '일간누적적색광량합', '시간당적색광량14~17', '시간당적색광량16~19', '시간당적색광량15~18',
       '시간당적색광량13~16', '시간당적색광량17~20', '시간당청색광량11~14', '시간당청색광량10~13',
       '시간당청색광량12~15', '시간당청색광량9~12', '시간당적색광량12~15', '시간당총광량18~21',
       '시간당적색광량18~21', '일간누적백색광량합', '시간당백색광량1~4', '시간당백색광량2~5'],
      dtype='object')

In [32]:
# Training table without the weakly correlated columns; `train` itself is left
# untouched because drop() returns a new frame.
drop_column = train.drop(columns=under_1per)

In [33]:
# Test table without the weakly correlated columns; `test` itself is left
# untouched because drop() returns a new frame.
drop_columntt = test.drop(columns=under_1per)

In [55]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import catboost

X = train.drop(['predicted_weight_g'],axis=1)
y = train['predicted_weight_g']
CAT_model= {}

# Each fold holds out one consecutive block of 28 rows and trains on all other
# rows; the fitted model of every fold is kept in CAT_model[fold].
rows_per_fold = 28
n_folds = 27
# NOTE(review): the row count used to be hard-coded as 784 (= 28 blocks of 28),
# so with 27 folds the last block is never held out - confirm this is intended.
n_rows = len(X)
for f in range(n_folds):
    start, stop = f*rows_per_fold, f*rows_per_fold+rows_per_fold
    valid_idx = list(range(start, min(stop, n_rows)))
    train_idx = list(range(0, min(start, n_rows))) + list(range(stop, n_rows))
    print(f'===================================={f+1}============================================')
    print(min(valid_idx),max(valid_idx))
    x_train, x_val, y_train, y_val = X.iloc[train_idx], X.iloc[valid_idx], y.iloc[train_idx], y.iloc[valid_idx]

    CAT = catboost.CatBoostRegressor(verbose=0,random_state=404)
    CAT.fit(x_train, y_train)

    y_pred = CAT.predict(x_val)
    # Square root of the mean squared error, i.e. RMSE (it was labelled "MAE").
    rmse = mean_squared_error(y_val, y_pred)**0.5
    print(f"{f + 1} Fold RMSE = {rmse}")
    CAT_model[f] = CAT
    print(f'================================================================================\n\n')

0 27
1 Fold MAE = 17.6185574724968


28 55
2 Fold MAE = 11.613291437017974


56 83
3 Fold MAE = 15.51592550166431


84 111
4 Fold MAE = 24.26385559720995


112 139
5 Fold MAE = 43.06895599755031


140 167
6 Fold MAE = 14.509213051523169


168 195
7 Fold MAE = 3.5541293483505987


196 223
8 Fold MAE = 6.375822982430165


224 251
9 Fold MAE = 22.51546352940914


252 279
10 Fold MAE = 6.432118009690551


280 307
11 Fold MAE = 7.520918045293789


308 335
12 Fold MAE = 20.664224306309123


336 363
13 Fold MAE = 27.67583689406079


364 391
14 Fold MAE = 13.206430464057066


392 419
15 Fold MAE = 28.536866928339904


420 447
16 Fold MAE = 38.78697662393502


448 475
17 Fold MAE = 11.918956771588098


476 503
18 Fold MAE = 4.877550492611513


504 531
19 Fold MAE = 13.353021824880976


532 559
20 Fold MAE = 15.359492768324916


560 587
21 Fold MAE = 23.260671736139


588 615
22 Fold MAE = 16.883582870731562


616 643
23 Fold MAE = 13.478207203965871


644 671
24 Fold MAE = 16.856204663464286




In [60]:
# Fit a single CatBoost regressor on the whole training table (no hold-out).
X = train.drop(['predicted_weight_g'],axis=1)
y = train['predicted_weight_g']
model = catboost.CatBoostRegressor(verbose=0,random_state=404)
model.fit(X,y)

<catboost.core.CatBoostRegressor at 0x7fe62dea9e80>

In [62]:
# Predict the test table with the single full-data model.
# NOTE: `pc` is reassigned by the fold-averaging cell that follows in file order.
testX = test.drop(['predicted_weight_g'],axis=1)
pc = model.predict(testX)
pc

array([ 0.68585632,  0.78919152,  1.37357154,  2.35753945,  4.43631397,
        4.43795273,  4.63160304,  4.67788615,  5.67040596,  6.76150272,
        9.74328641, 11.28050824, 15.0547486 , 17.30205291, 18.13745518,
       26.36054128, 25.03666392, 24.21133531, 24.22007824, 30.4966252 ,
       43.10359644, 49.53432786, 56.69946628, 65.54107242, 71.8313112 ,
       78.30768326, 79.37077332, 83.09306798,  0.52394948,  0.53461189,
        1.85163626,  2.16781213,  2.29146693,  3.264366  ,  3.79388036,
        4.58824049,  4.45951551,  6.44652722,  7.37642951,  9.1845485 ,
       11.64590324, 11.45858738, 15.9097275 , 18.81109638, 20.67510996,
       25.15750085, 32.32877085, 41.65469624, 46.73295   , 50.96743611,
       56.15685917, 64.27312473, 66.53023422, 70.89209199, 70.00062973,
       71.76613409,  0.97692145,  2.57665555,  2.79024303,  2.6561638 ,
        3.15588082,  4.28291444,  4.36752642,  5.43420036,  5.5990414 ,
        6.64957025,  8.64483048,  7.33912559,  7.51355295,  8.99

In [56]:
testX = test.drop(['predicted_weight_g'],axis=1)
# Average the predictions of all cross-validation models.
# An explicit float accumulator sized from the data replaces `[0]*140`, which
# only worked because `list += ndarray` is dispatched to numpy's element-wise
# addition; the model count is taken from CAT_model instead of a literal 27.
n_models = len(CAT_model)
pc = np.zeros(len(testX))
for fold in range(n_models):
    pc += CAT_model[fold].predict(testX)/n_models
pc

array([ 0.81339017,  0.84633128,  1.48525789,  2.72182865,  4.2131988 ,
        4.24680264,  4.75756747,  5.17478354,  6.13869574,  7.20616489,
        9.43375518, 11.72006083, 16.83570171, 19.02709066, 16.97647501,
       24.72539918, 25.77003916, 23.71397509, 25.47405204, 29.99180425,
       41.28277839, 47.06939159, 58.70535448, 64.35479668, 71.03987986,
       79.26363898, 83.18776674, 89.97261147,  1.02849848,  0.94723016,
        1.95224517,  2.01039498,  2.44887978,  3.38481424,  4.04340177,
        4.69974508,  5.15052309,  6.01670628,  7.59380728,  9.84891809,
       10.91009899, 12.99821015, 15.89224206, 17.74658303, 21.47784973,
       26.73570195, 32.9607171 , 40.23878624, 46.6199799 , 50.89567123,
       57.67742272, 63.04624344, 66.02979152, 68.09742299, 70.53416777,
       72.16012933,  1.66396081,  1.42907683,  1.78531663,  2.27393483,
        3.81124527,  5.49496794,  5.84880586,  6.45717634,  6.08833343,
        7.27588024,  7.63785648,  7.90086673,  7.99559872,  9.83

In [63]:
# Write the fold-averaged predictions (`pc`, computed in the cell above) into
# the test target files, one consecutive slice per file in sorted path order.
# The predictions used to be pasted here as a 140-value literal copied from an
# earlier output, which silently went stale whenever the data or models changed.
# NOTE: the target files are overwritten in place.
all_target_list = sorted(glob.glob('drive/MyDrive/dacon/robust/상추/test_target/*.csv'))
offset = 0
for test_path in all_target_list:
    submit_df = pd.read_csv(test_path)
    n_rows = len(submit_df)
    # pandas raises if the slice length does not match the file's row count.
    submit_df['predicted_weight_g'] = pc[offset:offset+n_rows]
    submit_df.to_csv(test_path, index=False)
    offset += n_rows