# <b>월간 데이콘 1 반도체 박막 두께 분석</b>

##### 작성자 : 허은정
##### 마지막 수정 : 20.02.01 00:00 

In [0]:
// Keep-alive for the Colab session: press F12, open the developer tools,
// and paste this snippet into the browser console.

// Clicks the toolbar "connect" button; does nothing if the button is not on the page.
function ClickConnect(){
  console.log("Working");
  var button = document.querySelector("colab-toolbar-button#connect");
  if (button) {
    button.click();
  }
}
// Repeat every 60,000 ms (one minute).
setInterval(ClickConnect, 60000);

### <b>라이브러리 및 데이터</b>

#### 구글 드라이브에 마운트

In [0]:
# Mount Google Drive into the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

#### 라이브러리 설치

In [0]:
import os                                           # working-directory setup
os.chdir("/content/gdrive/My Drive/semicon")
import warnings                                      # silence warning messages
warnings.filterwarnings('ignore')
import pandas as pd                                  # data manipulation and analysis
import numpy as np                                   # array / matrix operations
import random                                        # random number generation

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU
from keras.layers import ELU
from keras.layers import LSTM
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error      # MAE metric

import matplotlib.pyplot as plt                      # plotting
import seaborn as sns                                # plotting

#### 데이터 불러오기

In [0]:
# Load the raw training and test tables from the current working directory.
train_raw, test_raw = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

### <b>데이터 전처리</b>

In [0]:
# 결측치 없음

#### train 데이터를 train, valid 데이터로 분리

In [0]:
# Shuffle the training table once with a fixed seed, then take the first
# 10,000 shuffled rows as the validation set and the remainder as the
# training set. The same seed yields the same permutation, so shuffling once
# gives the same split as shuffling separately for each half.
shuffled = train_raw.sample(frac=1, random_state=2020)
valid = shuffled.iloc[:10000, :].reset_index(drop=True)
train = shuffled.iloc[10000:, :].reset_index(drop=True)

# The test table is used as loaded.
test = test_raw

#### train, valid, test 데이터를 독립변수 x, 종속변수 y로 분리

In [0]:
# Split each table into features (x) and targets (y).
# train/valid: the first four columns are the targets, the rest are the 226
# features. test: everything after its first column is a feature.
# Feature columns are relabelled 0..225 in every frame.
feature_labels = list(range(226))

train_x, train_y = train.iloc[:, 4:], train.iloc[:, :4]
train_x.columns = feature_labels

valid_x, valid_y = valid.iloc[:, 4:], valid.iloc[:, :4]
valid_x.columns = feature_labels

test_x = test.iloc[:, 1:]
test_x.columns = feature_labels

### <b>탐색적 자료분석 (변수 간 차이 계산)</b>

#### 그래프 파악

In [0]:
# Per-layer_1 mean of every other column, computed over the full raw table.
# `train_raw` is read directly instead of rebinding `train`: rebinding would
# leave `train` out of sync with the train_x / train_y frames built from the
# train/valid split, and later cells size their arrays from it.
train_raw.groupby(['layer_1']).mean()

In [0]:
# Plot the mean feature curve for six layer_1 groups (every 5th group),
# shaded from black to red as the group index increases.
# Reads `train_raw` directly so the plot does not depend on what `train`
# is currently bound to.
plt.figure(figsize=(20, 10))
plt.grid()
n = 6
# The grouped mean is the same for every curve, so compute it once.
layer1_means = train_raw.groupby(['layer_1']).mean()
for i in range(n):
    # Columns 3: of the grouped means skip the three remaining layer columns.
    plt.plot(train_raw.keys()[4:], layer1_means.iloc[i*5, 3:], color=(i/n, 0, 0))
# Show a tick on every 5th feature position.
plt.xticks([x*5 for x in range(46)])

#### 1차미분

In [0]:
# First-order difference along the feature axis: column k of *_x1 is
# feature k+1 minus feature k (225 columns, labelled 0..224).
def _shift_left(frame):
    """Drop the first column and relabel the remaining ones 0..k-1."""
    shifted = frame.iloc[:, 1:]
    shifted.columns = list(range(shifted.shape[1]))
    return shifted


train_x0, valid_x0, test_x0 = map(_shift_left, (train_x, valid_x, test_x))

train_x1 = train_x0 - train_x.iloc[:, :225]
valid_x1 = valid_x0 - valid_x.iloc[:, :225]
test_x1 = test_x0 - test_x.iloc[:, :225]

#### 2차미분

In [0]:
# Second-order difference: the same adjacent-column subtraction applied to
# the first differences (224 columns, labelled 0..223).
def _shift_left(frame):
    """Drop the first column and relabel the remaining ones 0..k-1."""
    shifted = frame.iloc[:, 1:]
    shifted.columns = list(range(shifted.shape[1]))
    return shifted


train_x10, valid_x10, test_x10 = map(_shift_left, (train_x1, valid_x1, test_x1))

train_x2 = train_x10 - train_x1.iloc[:, :224]
valid_x2 = valid_x10 - valid_x1.iloc[:, :224]
test_x2 = test_x10 - test_x1.iloc[:, :224]

#### 미분값 변수 추가

In [0]:
# Final feature matrices: second differences, first differences and raw
# features placed side by side (224 + 225 + 226 = 675 columns).
train_X, valid_X, test_X = (
    pd.concat(parts, axis=1)
    for parts in (
        [train_x2, train_x1, train_x],
        [valid_x2, valid_x1, valid_x],
        [test_x2, test_x1, test_x],
    )
)

### <b>모델 구축</b>

In [0]:
# Number of fit/predict rounds whose predictions are averaged.
iteration = 1

# Prediction accumulators, one column per target. They are sized from the
# feature frames that are actually predicted on rather than from
# `train` / `valid` / `test`, which other cells may rebind to tables of a
# different length.
train_pred = np.zeros([len(train_x), 4])
valid_pred = np.zeros([len(valid_x), 4])
test_pred = np.zeros([len(test_x), 4])

# Fully connected network on the 226 raw features: seven ELU layers
# (226, then 6 x 768 units) followed by a 4-unit linear output, trained on MAE.
atv = ELU(alpha=1.0)
model = Sequential()
model.add(Dense(units=226, activation=atv, input_dim=226))
model.add(Dense(units=768, activation=atv))
model.add(Dense(units=768, activation=atv))
model.add(Dense(units=768, activation=atv))
model.add(Dense(units=768, activation=atv))
model.add(Dense(units=768, activation=atv))
model.add(Dense(units=768, activation=atv))
model.add(Dense(units=4, activation='linear'))
model.compile(loss='mae', optimizer='adam', metrics=['mae'])

for _ in range(iteration):
    # The model is built once above, so each round continues training the
    # same weights rather than starting from a fresh network.
    model.fit(train_x, train_y, epochs=2000, batch_size=25000, validation_split=0.05)

    train_pred += model.predict(train_x)
    valid_pred += model.predict(valid_x)
    test_pred += model.predict(test_x)

# Average the accumulated predictions first, then limit them to [10, 300].
# Clipping the running sums inside the loop is only correct for a single
# round; with iteration == 1 this gives the same result as before.
train_pred = np.clip(train_pred / iteration, 10, 300)
valid_pred = np.clip(valid_pred / iteration, 10, 300)
test_pred = np.clip(test_pred / iteration, 10, 300)

### <b>모델 학습 및 검증</b>

In [0]:
# Mean absolute error of the validation predictions against valid_y.
mean_absolute_error(valid_y, valid_pred)

In [0]:
# Scatter of true vs. predicted validation values; low alpha so dense regions stand out.
plt.scatter(valid_y, valid_pred, alpha=0.05)

In [0]:
# Mean absolute error of the training predictions against train_y.
mean_absolute_error(train_y, train_pred)

In [0]:
# Scatter of true vs. predicted training values; low alpha so dense regions stand out.
plt.scatter(train_y, train_pred, alpha=0.05)

### <b>모델별 결과</b>

###### 1. 기본변수, epoch=200, batch_size=10000

In [0]:
# Experiment 1: raw features only, 200 epochs, batch size 10,000.
# ReLU network: 226 -> 6 x 128 -> 4 linear outputs, trained on MAE.
model = Sequential([
    Dense(units=226, activation='relu', input_dim=226),
    *[Dense(units=128, activation='relu') for _ in range(6)],
    Dense(units=4, activation='linear'),
])

model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(train_x, train_y, epochs=200, batch_size=10000, validation_split = 0.05)
valid_pred = model.predict(valid_x)

# Recorded MAE:
# train 15.961750167919915
# valid 16.385424884825238

###### 2. 기본변수, epoch=1000, batch_size=10000

In [0]:
# Experiment 2: same network as experiment 1, trained for 1,000 epochs.
model = Sequential()
model.add(Dense(units=226, activation='relu', input_dim=226))
for _ in range(6):
    model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=4, activation='linear'))

model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(train_x, train_y, epochs=1000, batch_size=10000, validation_split = 0.05)
valid_pred = model.predict(valid_x)

# Recorded MAE:
# train 8.126535143337069
# valid 8.946578103011547

###### 3. 기본변수, lightGBM, ensemble=4

In [0]:
# Experiment 3: raw features, one LightGBM regressor per target column.
import lightgbm as lgb  # required below; the notebook's import cell does not load it

# Number of target columns; the loop below trains one model for each.
iteration = 4

# Sized from the feature frames that are predicted on, so the arrays match
# even if `train` / `valid` / `test` are bound to tables of another length.
train_pred = np.zeros([len(train_x), 4])
valid_pred = np.zeros([len(valid_x), 4])
test_pred = np.zeros([len(test_x), 4])

# The parameters are identical for every target, so build them once.
params = {'learning_rate': 0.25,
          'max_bin': 511,
          'boosting': 'gbdt',
          'objective': 'regression',
          'metric': 'mae',
          'is_training_metric': True,
          'sparse_threshold': 1.0,
          'seed': 2020,
          # Was misspelled 'divice', which LightGBM does not recognise.
          # NOTE(review): 'gpu' needs a GPU-enabled LightGBM build.
          'device': 'gpu',
          }

for i in range(iteration):
    # NOTE(review): the early-stopping set is 1,000 rows drawn from the
    # training rows themselves (unseeded), so it does not measure
    # generalisation.
    sample_index = random.sample(range(len(train_x)), 1000)
    train_ds = lgb.Dataset(train_x, label=train_y.iloc[:, i])
    valid_ds = lgb.Dataset(train_x.iloc[sample_index, :], train_y.iloc[sample_index, i])
    model = lgb.train(params, train_ds, 5000, valid_ds, verbose_eval=100, early_stopping_rounds=1000)

    train_pred[:, i] += model.predict(train_x)
    valid_pred[:, i] += model.predict(valid_x)
    test_pred[:, i] += model.predict(test_x)

    # Negative predictions are floored at zero.
    train_pred[train_pred[:, i] < 0, i] = 0
    valid_pred[valid_pred[:, i] < 0, i] = 0
    test_pred[test_pred[:, i] < 0, i] = 0

# Recorded MAE:
# train 12.758567823740458
# valid 24.27478328767353

###### 4. 기본변수, epoch=1500, batch_size=10000

In [0]:
# Experiment 4: raw features only, 1,500 epochs, batch size 10,000.
# ReLU network: 226 -> 6 x 128 -> 4 linear outputs, trained on MAE.
model = Sequential()
model.add(Dense(units=226, activation='relu', input_dim=226))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=4, activation='linear'))

model.compile(loss='mae', optimizer='adam', metrics=['mae'])

model.fit(train_x, train_y, epochs=1500, batch_size=10000, validation_split = 0.05)

# Assign rather than accumulate: `+=` would add this model's output onto
# whatever an earlier cell left in the *_pred arrays.
train_pred = model.predict(train_x)
valid_pred = model.predict(valid_x)
test_pred = model.predict(test_x)

# Negative predictions are floored at zero.
train_pred[train_pred < 0] = 0
valid_pred[valid_pred < 0] = 0
test_pred[test_pred < 0] = 0

# Recorded MAE:
# train 5.254762536292218
# valid 5.694623216835558

###### 5. 기본변수, knn

In [0]:
# knn으로 train_y 데이터의 layer 변수 추가

# train 2.932624136510533
# valid 11.573848717975617

###### 6. 1차미분+기본변수, epoch=1500, batch_size=10000, layer 좀 늘림

In [0]:
# Experiment 6: first differences + raw features, 1,500 epochs,
# batch size 10,000, with a deeper network.

# Build the feature matrices this experiment is named after. Fitting on the
# bare 226-column train_x does not match the network's declared input width.
train_x_d1 = pd.concat([train_x1, train_x], axis=1)
valid_x_d1 = pd.concat([valid_x1, valid_x], axis=1)

model = Sequential()
# The input width is taken from the data so it always matches the features.
model.add(Dense(units=452, activation='relu', input_dim=train_x_d1.shape[1]))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=4, activation='linear'))

model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(train_x_d1, train_y, epochs=1500, batch_size=10000, validation_split = 0.05)
valid_pred = model.predict(valid_x_d1)

# Recorded MAE:
# train 1.7777946670879572
# valid 2.1964727961288393

###### 7. 1차미분, epoch=200, batch_size=10000

In [0]:
# Experiment 7: first-difference features only, 200 epochs, batch size 10,000.
# The first differences are train_x1 / valid_x1. The *_x2 frames hold second
# differences, and neither frame has the 226 columns previously declared as
# the input width, so the width is taken from the data.

model = Sequential()
model.add(Dense(units=226, activation='relu', input_dim=train_x1.shape[1]))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=4, activation='linear'))

model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(train_x1, train_y, epochs=200, batch_size=10000, validation_split = 0.05)
valid_pred = model.predict(valid_x1)

# Recorded MAE:
# train 12.853896638178593
# valid 14.482646765730864

###### 8. 1차미분, epoch=200, batch_size=5000, 1000, 500

In [0]:
# Experiment 8: first-difference features only, 200 epochs, comparing
# batch sizes 5000, 1000 and 500 (smaller batches take longer).
# The first differences are train_x1 / valid_x1. The *_x2 frames hold second
# differences, and neither frame has the 226 columns previously declared as
# the input width, so the width is taken from the data.

model = Sequential()
model.add(Dense(units=226, activation='relu', input_dim=train_x1.shape[1]))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=4, activation='linear'))

model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(train_x1, train_y, epochs=200, batch_size=1000, validation_split = 0.05)
valid_pred = model.predict(valid_x1)

# Recorded timings and MAE per batch size:

# batch_size=5000, 3-4 s

# train 6.594973571259298
# valid 7.687990546501744

# batch_size=1000, 6 s

# train 2.6649903466623432
# valid 3.104061243339777

# batch_size=500, 9 s

# train 2.4533957332863774
# valid 2.8087869629114866

###### 9. 2차미분+1차미분+기본변수, epoch=1500, batch_size=10000

In [0]:
# Experiment 9: first and second differences added to the raw features
# (675 inputs). Network definition only: 675 -> 6 x 518 -> 256 -> 128 -> 64
# -> 32 -> 4 linear outputs.

model = Sequential()
model.add(Dense(units=675, activation='relu', input_dim=675))
for width in [518] * 6 + [256, 128, 64, 32]:
    model.add(Dense(units=width, activation='relu'))
model.add(Dense(units=4, activation='linear'))

# Recorded MAE:
# train 1.186030168549741
# valid 1.6393125162190199

###### 10. 3차미분+2차미분+1차미분+기본변수, epoch=1500, batch_size=10000

In [0]:
# Experiment 10: first, second and third differences added (898 inputs).
# Overfitting appeared, so differences are only added up to second order.

model = Sequential()
model.add(Dense(units=898, activation='relu', input_dim=898))
for width in [518] * 6 + [256, 128, 64, 32]:
    model.add(Dense(units=width, activation='relu'))
model.add(Dense(units=4, activation='linear'))

# Recorded MAE:
# train 2.8556745621275708
# valid 4.078369204955921

###### 11. 2차미분+1차미분+기본변수, 109500개 샘플링, epoch=500, batch_size=50000, layer 조정 실험

In [0]:
# Experiment 11: layer-size study on the 675-column feature set.
# For quick turnaround: 109,500 training rows, epoch=500, batch_size=50000.
# The numbers after each variant are the train MAE at epochs
# 100, 200, 300, 400 and 500.

def _relu_mlp(hidden_widths):
    """Build a 675-input ReLU network with the given hidden widths and a 4-unit linear output."""
    net = Sequential()
    net.add(Dense(units=675, activation='relu', input_dim=675))
    for width in hidden_widths:
        net.add(Dense(units=width, activation='relu'))
    net.add(Dense(units=4, activation='linear'))
    return net


model = _relu_mlp([518] * 3)

# baseline       58.9 50.8 40.4 33.5 29.9

model = _relu_mlp([768] * 3)

# more units     58.3 49.3 35.4 29.7 24.8  (no speed difference)

model = _relu_mlp([768] * 9)

# more layers    61.8 61.5 60.6 49.5 32.4  (no speed difference)

model = _relu_mlp([1250] * 6)

# more units     61.4 51.2 35.9 19.3 12.5  (slightly slower, heavy overfitting)
# Since the sample is small, some overfitting may be acceptable here.

###### 12. 2차미분+1차미분+기본변수, 109500개 샘플링, epoch=500, batch_size=50000, activation 조정 실험

In [0]:
# Experiment 12: activation study on the 675-column feature set.
# For quick turnaround: 109,500 training rows, epoch=500, batch_size=50000.
# The numbers after each variant are the train MAE at epochs
# 100, 200, 300, 400 and 500.

def _compiled_mlp(activation):
    """Build and compile a 675 -> 6 x 768 -> 4 network that uses `activation` in every hidden layer."""
    net = Sequential()
    net.add(Dense(units=675, activation=activation, input_dim=675))
    for _ in range(6):
        net.add(Dense(units=768, activation=activation))
    net.add(Dense(units=4, activation='linear'))
    net.compile(loss='mae', optimizer='adam', metrics=['mae'])
    return net


atv = LeakyReLU(alpha=0.3)
model = _compiled_mlp(atv)

# LeakyReLU(alpha=0.3) 59.4 48.9 28.4 20.0 15.8  (no speed difference, slight overfitting)

atv = LeakyReLU(alpha=0.7)
model = _compiled_mlp(atv)

# LeakyReLU(alpha=0.7) 61.2 54.5 49.7 45.4 40.5

atv = ELU(alpha=1.0)
model = _compiled_mlp(atv)

# ELU(alpha=1.0) 60.0 32.8 13.1 7.5 5.5  (overfitting negligible, good performance)

###### 13. 2차미분+1차미분+기본변수, epoch=1500, batch_size=25000, activation=ELU

In [0]:
# Experiment 13: 675-column feature set, ELU activation, 1,500 epochs,
# batch size 25,000. About 12-13 s per epoch, roughly 5 hours in total.

atv = ELU(alpha=1.0)
model = Sequential([
    Dense(units=675, activation=atv, input_dim=675),
    *[Dense(units=768, activation=atv) for _ in range(6)],
    Dense(units=4, activation='linear'),
])
model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(train_X, train_y, epochs=1500, batch_size=25000, validation_split = 0.05)

# Recorded MAE:
# train 0.6384669344828162
# valid 1.0066060829472543

###### 14. 결론: epoch=1000, batch_size=25000, activation=ELU로 모델 여러 개 ensemble, median으로 최종 결과물 만들기

In [0]:
# 한 모델에 약 3.3시간

###### 15. 결론2: epoch=2000, batch_size=50000, activation=ELU로 모델 여러 개 ensemble, median으로 최종 결과물 만들기

### <b>테스트 데이터에 모델 적용</b>

In [0]:
# Load the submission template (its first CSV column becomes the index)
# and add the test predictions to it element-wise.
sample_sub = pd.read_csv('sample_submission.csv', index_col=0)
submission = sample_sub + test_pred

In [0]:
# Display the submission frame for a visual check.
submission

In [0]:
# Write the submission to CSV in the working directory.
submission.to_csv('05_elu.csv')

### <b>여러 모델 한 번에 만들기</b>

In [0]:
# Train one model per output file on a fresh 700,000-row sample of the raw
# training table (no separate validation set) and write a submission for each.

def _adjacent_difference(frame):
    """Return column k+1 minus column k of `frame`, with columns relabelled 0..k-1."""
    shifted = frame.iloc[:, 1:]
    shifted.columns = list(range(shifted.shape[1]))
    return shifted - frame.iloc[:, :-1]


def _stack_differences(features):
    """Return second differences, first differences and raw features side by side."""
    first = _adjacent_difference(features)
    second = _adjacent_difference(first)
    return pd.concat([second, first, features], axis=1)


file_name = ['m12.csv']

# The test features and the submission template do not depend on the
# training sample, so prepare them once outside the loop.
test = test_raw
test_x = test.iloc[:, 1:]
test_x.columns = list(range(226))
test_X = _stack_differences(test_x)
sample_sub = pd.read_csv('sample_submission.csv', index_col=0)

for i, out_name in enumerate(file_name):

    # Sampling: the seed varies with the position in file_name.
    # NOTE(review): the +11 offset presumably tracks the model number
    # (m12 -> 11) - confirm before adding more file names.
    train = train_raw.sample(n=700000, random_state=2020+11+i).reset_index(drop=True)

    # Split into features and targets.
    train_x = train.iloc[:, 4:]
    train_x.columns = list(range(226))
    train_y = train.iloc[:, :4]

    # Add first and second differences (675 columns in total).
    train_X = _stack_differences(train_x)

    # Model: 675 -> 6 x 768 ELU -> 4 linear outputs, trained on MAE.
    atv = ELU(alpha=1.0)
    model = Sequential()
    model.add(Dense(units=675, activation=atv, input_dim=675))
    model.add(Dense(units=768, activation=atv))
    model.add(Dense(units=768, activation=atv))
    model.add(Dense(units=768, activation=atv))
    model.add(Dense(units=768, activation=atv))
    model.add(Dense(units=768, activation=atv))
    model.add(Dense(units=768, activation=atv))
    model.add(Dense(units=4, activation='linear'))
    model.compile(loss='mae', optimizer='adam', metrics=['mae'])

    model.fit(train_X, train_y, epochs=2000, batch_size=25000, validation_split = 0.05)

    # Predict on the test set and limit the predictions to [10, 300].
    test_pred = np.clip(model.predict(test_X), 10, 300)

    # Save: template plus predictions, written under this model's file name.
    submission = sample_sub + test_pred
    submission.to_csv(out_name)

### <b>앙상블</b>

In [0]:
# Ensemble the per-model submissions with a trimmed mean: for every row and
# target column, sort the models' predictions, drop the lowest and highest
# quarter, and average the rest (with 32 files: keep sorted positions 8..23).
file_name = ['m01.csv','m02.csv','m03.csv','m04.csv','m05.csv','m06.csv','m07.csv','m08.csv',
             'm09.csv','m10.csv','m11.csv','m12.csv','m13.csv','m14.csv','m15.csv','m16.csv',
             'm17.csv','m18.csv','m19.csv','m20.csv','m21.csv','m22.csv','m23.csv','m24.csv',
             'm25.csv','m26.csv','m27.csv','m28.csv','m29.csv','m30.csv','m31.csv','m32.csv']

# One frame per model; the first CSV column is the row index.
m = [pd.read_csv(name, index_col=0) for name in file_name]

# Number of predictions dropped from each end of the sorted row.
trim = len(m) // 4

per_layer = []
for i in range(4):
    # Column i of every model, side by side and aligned on the row index.
    stacked = pd.concat([frame.iloc[:, i] for frame in m], axis=1)
    # Sort each row in one vectorized call. This covers every row, whatever
    # the table length, instead of assigning rows one at a time.
    ordered = pd.DataFrame(np.sort(stacked.to_numpy(), axis=1), index=stacked.index)
    per_layer.append(ordered.iloc[:, trim:len(m) - trim].mean(axis=1))

result = pd.concat(per_layer, axis=1)
result.columns = ['layer_1', 'layer_2', 'layer_3', 'layer_4']

# Add the ensembled predictions to the submission template and save.
sample_sub = pd.read_csv('sample_submission.csv', index_col=0)
submission = sample_sub + result
submission.to_csv('ensemble0132_trim_16.csv')