In [1]:
import pandas as pd
import glob
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import glob
import math
warnings.filterwarnings('ignore')

In [2]:
def preprocessing(input_path):
    all_input_list = sorted(glob.glob(input_path))
    train = pd.DataFrame()
    for datapath in all_input_list:
        data = pd.read_csv(datapath) 
        
        #  obs_time 단위 통일
        data['obs_time'] = data.index % 24 
        
        # 전처리 1 : 데이콘에서 제공된 제한범위로 1차 전처리
        # 분무량,백색광량,적색광량,청색광량 95% 값으로 제한
        df = abs(data)
        df.loc[(df['내부온도관측치'] > 40), '내부온도관측치'] = 40
        df.loc[(df['내부습도관측치'] > 100), '내부습도관측치'] = 100
        df.loc[(df['co2관측치'] > 1200), 'co2관측치'] = 1200
        df.loc[(df['ec관측치'] > 8), 'ec관측치'] = 8
        df.loc[(df['시간당분무량'] > 1499.42), '시간당분무량'] = 1499.42
        df.loc[(df['시간당백색광량'] > 18255.19), '시간당백색광량'] = 18255.19
        df.loc[(df['시간당적색광량'] > 9763.32), '시간당적색광량'] = 9763.32
        df.loc[(df['시간당청색광량'] > 9242.35), '시간당청색광량'] = 9242.35
        df.loc[(df['시간당총광량'] > 120000), '시간당총광량'] = 120000
        
        # 전처리 2 : 이상치처리
        # 해당경우는 14시에서 15시로 넘어갈때 내부온도 및 내부습도가 0으로 관찰되었음
        # 같은 케이스가 반복적으로 보여서 15시 값을 14시 값으로 바꿔줌
        df.loc[(df['내부온도관측치'] < 6.921053), '내부온도관측치'] = 6.921053
        df.loc[(df['내부습도관측치'] < 9.639473), '내부습도관측치'] = 9.639473
        
        # 전처리3 : 시간당총광량
        # 백색/적색/청색 값의 합이 총광량이기때문에 전처리 1에 영향을 받았을 가능성이 있어서 칼럼을 다시 만들어줌
        df['시간당총광량'] = df['시간당청색광량']+df['시간당백색광량']+df['시간당적색광량']
        
        # 전처리4 : 전처리 과정으로 인해 누적값 새로 만들어줌
        col_list = df.columns
        for i in range(0,len(col_list)):
            col = col_list[i]    
            if '누적' in col : 
                df[col] = df.groupby((df.obs_time == 0).cumsum()).agg(col_list[i-1]).cumsum()    
                
        df.to_csv(datapath,index=False)
        train = pd.concat([train,df])
    print('finish!!')
    return train

In [3]:
train = preprocessing('drive/MyDrive/dacon/data_re/상추/train_input/*.csv')

finish!!


In [4]:
test = preprocessing('drive/MyDrive/dacon/data_re/상추/test_input/*.csv')

finish!!


In [7]:
colname = train.columns
for col in colname[2:]:
    train[col]=np.log1p(train[col].astype(int)) # x+1
    test[col]=np.log1p(test[col].astype(int)) # x+1

In [10]:
def preprocessing2(input_path, data):
    all_input_list = sorted(glob.glob(input_path))
    for i in range(0,len(all_input_list)):
        df = data.iloc[ 24*28*i : 24*28*(i+1)]
        df.to_csv(all_input_list[i],index=False)
    print("완료~!")

In [12]:
preprocessing2('drive/MyDrive/dacon/data_re/상추/train_input/*.csv',train)

완료~!


In [13]:
preprocessing2('drive/MyDrive/dacon/data_re/상추/test_input/*.csv',test)

완료~!


In [18]:
def make_dataset(input_path, target_path):
    train = pd.DataFrame()
    all_input_list = sorted(glob.glob(input_path))
    all_target_list = sorted(glob.glob(target_path))
    for x, y in zip(all_input_list,all_target_list):
        x = pd.read_csv(x)
        y = pd.read_csv(y)
        x = x.iloc[:,1:]
        #x = x.drop(['시간당총광량','일간누적총광량'],axis=1)
        col_list = x.columns[1:]
        for i in range(0,28) :
            day = x.iloc[24*i:24*i+24]

            time_list = day['obs_time'].unique()
            for col in col_list :
                if col in ['일간누적백색광량','일간누적청색광량','일간누적적색광량','일간누적분무량','일간누적총광량']:
                    for t in range(11,len(time_list)):
                        time = time_list[t]
                        value1 = day[day['obs_time']==time][col].iloc[0]
                        x[col+str(time)] = value1
                else :
                    for t in range(0,len(time_list)-3) :
                        time = time_list[t]
                        ntime = time_list[t+1]
                        nntime = time_list[t+2]
                        nnntime = time_list[t+3]
                        value1 = day[day['obs_time']==time][col].iloc[0]
                        value2 = day[day['obs_time']==ntime][col].iloc[0]
                        value3 = day[day['obs_time']==nntime][col].iloc[0]
                        value4 = day[day['obs_time']==nnntime][col].iloc[0]
                        x[col+str(time)+str("+")+str(nnntime)] = (value1 + value2 + value3 + value4)/4

            nx = x.iloc[:1,15:]
            ny = y.iloc[i:i+1].reset_index(drop=True)
            xy = pd.merge(nx,ny,left_index=True, right_index=True)
            train = pd.concat([train,xy]).reset_index(drop=True)

    return train

In [19]:
train_input_path, train_target_path = 'drive/MyDrive/dacon/data_re/상추/train_input/*.csv', 'drive/MyDrive/dacon/data_re/상추/train_target/*.csv'
test_input_path, test_target_path = 'drive/MyDrive/dacon/data_re/상추/test_input/*.csv', 'drive/MyDrive/dacon/data_re/상추/test_target/*.csv'

train = make_dataset(train_input_path, train_target_path)
test = make_dataset(test_input_path, test_target_path)

In [20]:
train

Unnamed: 0,내부온도관측치2+5,내부온도관측치3+6,내부온도관측치4+7,내부온도관측치5+8,내부온도관측치6+9,내부온도관측치7+10,내부온도관측치8+11,내부온도관측치9+12,내부온도관측치10+13,내부온도관측치11+14,...,일간누적청색광량16,일간누적청색광량17,일간누적청색광량18,일간누적청색광량19,일간누적청색광량20,일간누적청색광량21,일간누적청색광량22,일간누적청색광량23,DAT,predicted_weight_g
0,3.258097,3.267532,3.276967,3.295494,3.314021,3.323113,3.332205,3.332205,3.332205,3.340977,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0.167719
1,3.258097,3.276624,3.295151,3.322450,3.349750,3.358523,3.367296,3.367296,3.367296,3.367296,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,0.181787
2,3.258097,3.276624,3.303923,3.331223,3.349750,3.349750,3.331885,3.314021,3.304929,3.295837,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3,0.265921
3,3.258097,3.267532,3.286059,3.295494,3.304929,3.304929,3.295837,3.295837,3.295837,3.295837,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,4,0.423650
4,3.258097,3.267532,3.276967,3.286402,3.295837,3.304929,3.314021,3.323113,3.332205,3.332205,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5,0.475272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,3.433987,3.441924,3.449862,3.465492,3.465492,3.432109,3.389634,3.339466,3.304929,3.295837,...,8.446556,8.627661,8.705994,8.705994,8.705994,8.705994,8.705994,8.705994,24,64.875499
780,3.178054,3.178054,3.188259,3.208270,3.228281,3.257726,3.286059,3.304586,3.323113,3.332205,...,9.120197,9.216521,9.259511,9.259511,9.259511,9.259511,9.259511,9.259511,25,74.002614
781,3.208670,3.198465,3.198465,3.208270,3.228281,3.248291,3.258097,3.258097,3.258097,3.258097,...,9.118115,9.214631,9.258750,9.258750,9.258750,9.258750,9.258750,9.258750,26,76.342275
782,3.188259,3.178054,3.188259,3.208270,3.228281,3.248291,3.258097,3.258097,3.258097,3.258097,...,9.117128,9.213735,9.256651,9.256651,9.256651,9.256651,9.256651,9.256651,27,82.621245


In [21]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [22]:
train.to_csv('drive/MyDrive/dacon/data_re/상추/train_log_time4.csv',index=False)
test.to_csv('drive/MyDrive/dacon/data_re/상추/test_log_time4.csv',index=False)

In [29]:
train = pd.read_csv('drive/MyDrive/dacon/data_re/상추/train_log_time4.csv')
test = pd.read_csv('drive/MyDrive/dacon/data_re/상추/test_log_time4.csv')

In [31]:
corr_df = train.corr()
corr_df = corr_df.apply(lambda x: round(x ,2))
s = corr_df.unstack()
성장 = pd.DataFrame(s['predicted_weight_g']).sort_values(by=0).fillna(0)
성장[0] = abs(성장[0])
성장.sort_values(by=[0],ascending=False).reset_index().to_csv('target값이랑corr.csv',index=False,encoding='utf-8-sig')
under_1per = 성장[성장[0]<0.1].index
under_1per

Index(['내부온도관측치6+9', '시간당백색광량20+23', '내부온도관측치19+22', '내부온도관측치2+5',
       '내부온도관측치20+23', '내부온도관측치5+8', '시간당적색광량14+17', '시간당적색광량15+18',
       '시간당적색광량13+16', '시간당백색광량0+3', '시간당적색광량16+19', '내부온도관측치3+6',
       '내부온도관측치4+7', '시간당적색광량12+15', '시간당적색광량17+20', '시간당적색광량11+14',
       '시간당청색광량10+13', '시간당청색광량11+14', '시간당청색광량9+12', '시간당적색광량18+21',
       '일간누적적색광량16', '시간당청색광량12+15', '일간누적적색광량22', '일간누적적색광량23', '일간누적적색광량17',
       '일간누적적색광량21', '일간누적적색광량20', '시간당청색광량8+11', '일간누적적색광량18', '시간당적색광량10+13',
       '일간누적적색광량19', '일간누적적색광량15', '일간누적적색광량13', '일간누적적색광량14', '시간당적색광량9+12',
       '시간당청색광량7+10', '시간당백색광량1+4', '시간당백색광량2+5'],
      dtype='object')

In [32]:
drop_column = train.copy()
drop_column = drop_column.drop(under_1per,axis=1)

In [33]:
drop_columntt = test.copy()
drop_columntt = drop_columntt.drop(under_1per,axis=1)

In [35]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import catboost

X = train.drop(['predicted_weight_g'],axis=1)
y = train['predicted_weight_g']
CAT_model= {}
for f in range(0,27):
      train_idx = [i for i in range(0,784)]
      valid_idx = train_idx[f*28:f*28+28]
      for idx in valid_idx:
        train_idx.remove(idx)
      print(f'===================================={f+1}============================================')
      print(min(valid_idx),max(valid_idx))
      x_train, x_val, y_train, y_val = X.iloc[train_idx], X.iloc[valid_idx], y.iloc[train_idx], y.iloc[valid_idx]
      
      CAT = catboost.CatBoostRegressor(verbose=0,random_state=404)
      CAT.fit(x_train, y_train)
      
      y_pred = CAT.predict(x_val)
      mae = mean_squared_error(y_val, y_pred)**0.5
      print(f"{f + 1} Fold MAE = {mae}")
      CAT_model[f] = CAT
      print(f'================================================================================\n\n')

0 27
1 Fold MAE = 16.846762394239516


28 55
2 Fold MAE = 9.58440843540524


56 83
3 Fold MAE = 25.415663315268453


84 111
4 Fold MAE = 26.383651895349686


112 139
5 Fold MAE = 31.663650898799602


140 167
6 Fold MAE = 11.379664165608595


168 195
7 Fold MAE = 6.619430845811996


196 223
8 Fold MAE = 7.6688594370390755


224 251
9 Fold MAE = 11.76798240059569


252 279
10 Fold MAE = 9.88909867541086


280 307
11 Fold MAE = 21.179084775364174


308 335
12 Fold MAE = 13.360224275378977


336 363
13 Fold MAE = 38.7549847198574


364 391
14 Fold MAE = 14.827642415611555


392 419
15 Fold MAE = 15.413131334083557


420 447
16 Fold MAE = 38.58785876441744


448 475
17 Fold MAE = 15.581527432781764


476 503
18 Fold MAE = 12.279275677885146


504 531
19 Fold MAE = 14.923885782448053


532 559
20 Fold MAE = 9.92994427951591


560 587
21 Fold MAE = 20.5361534182735


588 615
22 Fold MAE = 18.85988480524488


616 643
23 Fold MAE = 12.250138963495022


644 671
24 Fold MAE = 12.309474099480484



In [36]:
testX = test.drop(['predicted_weight_g'],axis=1)
pc=[0]*140
for fold in range(27):
    pc += CAT_model[fold].predict(testX)/27
pc

array([1.12980323e+00, 1.58420693e+00, 1.85550121e+00, 2.55513170e+00,
       3.46950907e+00, 3.96358471e+00, 4.19434031e+00, 4.62246025e+00,
       5.82976133e+00, 6.07814331e+00, 8.38474451e+00, 1.13359190e+01,
       1.71588419e+01, 1.84063752e+01, 1.49465456e+01, 2.62653823e+01,
       2.87647186e+01, 2.37599939e+01, 2.53379195e+01, 3.52719697e+01,
       5.36349927e+01, 6.36082550e+01, 7.22595413e+01, 7.00348473e+01,
       7.36423528e+01, 7.65499145e+01, 7.77733810e+01, 8.58069655e+01,
       7.68689742e-01, 5.91840333e-01, 9.46412717e-01, 1.33191365e+00,
       1.44483490e+00, 2.59448876e+00, 2.92190392e+00, 3.54387622e+00,
       4.01660745e+00, 4.89076516e+00, 6.35890014e+00, 8.82643385e+00,
       1.03020791e+01, 1.21238661e+01, 1.40421960e+01, 1.69911595e+01,
       1.95410679e+01, 2.49493764e+01, 2.96483794e+01, 3.50039599e+01,
       4.07357256e+01, 4.49843301e+01, 5.55626384e+01, 5.93809966e+01,
       6.28445287e+01, 6.27213222e+01, 7.04714872e+01, 7.10450911e+01,
      

In [37]:
i=0
all_target_list = sorted(glob.glob('drive/MyDrive/dacon/data_re/상추/test_target/*.csv'))
for test_path in all_target_list:
    submit_df = pd.read_csv(test_path)
    submit_df['predicted_weight_g'] = pc[i*28:i*28+28]
    submit_df.to_csv(test_path, index=False)
    i+=1