# 0. 라이브러리 및 파일 로드 

Google Colab에서 실행 확인 (2020-06-29 기준)

https://colab.research.google.com/drive/1C97Sj_9S5S1kAMeIC62RlCo5k_ISXXJA?usp=sharing


파이썬 버전 - Python 3.6.9

기타 라이브러리 버전 - requirements.txt 참고

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, log_loss, accuracy_score, roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold

from imblearn.combine import SMOTEENN
from lightgbm.sklearn import LGBMClassifier
from tqdm.notebook import tqdm
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action='ignore') 

# Directory prefix (relative to the working directory) for all input CSV files.
path = './data/'

In [None]:
# Load the two plant training tables and the weather table.
# For each file the first column is parsed as datetimes and used as the index.
_csv_opts = {'parse_dates': [0], 'index_col': [0]}
plant1 = pd.read_csv(path + 'plant1_train.csv', **_csv_opts)
plant2 = pd.read_csv(path + 'plant2_train.csv', **_csv_opts)
forecast = pd.read_csv(path + 'weather4.csv', **_csv_opts)

In [None]:
# Dew point
def dewpoint(temp, humid):
    """Return the dew point for a temperature and a relative humidity.

    Evaluates ``243.12 * g / (17.62 - g)`` where
    ``g = 17.62 * temp / (243.12 + temp) + ln(humid / 100)``.

    Args:
        temp: Temperature; a scalar or anything supporting NumPy arithmetic.
            The constants look like the Magnus approximation for degrees
            Celsius -- TODO confirm the unit of the upstream data.
        humid: Relative humidity in percent (divided by 100 before the log).

    Returns:
        The dew point, with the broadcast shape of the inputs.
    """
    coef_b = 17.62
    coef_c = 243.12
    # The same term appears in the numerator and denominator; compute it once.
    gamma = (coef_b * temp / (coef_c + temp)) + np.log(humid / 100.0)
    return (coef_c * gamma) / (coef_b - gamma)

In [None]:
# Drop the file-specific prefix from every column name of the plant tables.
plant1.rename(columns=lambda name: name.replace('plant1_train.', ''), inplace=True)
plant2.rename(columns=lambda name: name.replace('plant2_train.', ''), inplace=True)

In [None]:
# Put the weather columns next to each plant's columns, aligned on the index.
data = pd.concat((plant1, forecast), axis='columns')
data2 = pd.concat((plant2, forecast), axis='columns')

# 1. 1차 데이터 전처리

In [None]:
# Plant 1
# Interpolate the columns in the label slice 'temp_25' .. 'wind_46' and write
# the filled values back into `data`, then drop rows that still contain NaN.
inp = data.loc[:, 'temp_25':'wind_46']
data.update(inp.interpolate())
tempTrain = data.dropna()

# Hourly frame for 2017-02 .. 2019-03; the timestamp becomes a regular column.
# NOTE(review): train and test are built from the same window, so `test` is
# not a held-out set -- confirm this is intended.
_hourly = tempTrain.loc['2017-02':'2019-03'].resample('1h').asfreq().dropna()
train = _hourly.reset_index()
test = _hourly.reset_index()

In [None]:
# Plant 2
# Interpolate the columns in the label slice 'temp_25' .. 'wind_46' and write
# the filled values back into `data2`, then drop rows that still contain NaN.
inp = data2.loc[:, 'temp_25':'wind_46']
data2.update(inp.interpolate())
tempTrain2 = data2.dropna()

# Hourly frame for 2017-02 .. 2019-03; the timestamp becomes a regular column.
# NOTE(review): train2 and test2 are built from the same window, so `test2`
# is not a held-out set -- confirm this is intended.
_hourly2 = tempTrain2.loc['2017-02':'2019-03'].resample('1h').asfreq().dropna()
train2 = _hourly2.reset_index()
test2 = _hourly2.reset_index()

In [None]:
def _build_first_stage_features(frame):
    """Drop the condensation labels and append forecast moving averages."""
    features = frame.drop(['cond_loc1', 'cond_loc2', 'cond_loc3'], axis=1)

    # Moving averages of the weather-forecast columns (names containing 25/46).
    # Select those columns *before* rolling so the datetime 'index' column is
    # never aggregated; pandas >= 2.0 raises on non-numeric rolling input.
    forecast_cols = features.filter(regex='(25|46)')
    ma6 = forecast_cols.rolling(6).mean().add_prefix('MA6_')
    ma24 = forecast_cols.rolling(24).mean().add_prefix('MA24_')

    # dropna() removes the warm-up rows where the 24-row window is incomplete.
    return pd.concat([features, ma6, ma24], axis=1).dropna().set_index('index')


def _attach_future_targets(features, target_cols):
    """Split features from targets taken 24 and 48 rows ahead."""
    # NOTE(review): shift() moves by row count, not by elapsed time; this is
    # a true 24 h / 48 h lead only if the rows are contiguous hourly samples.
    y24 = features.shift(-24).dropna()[target_cols].add_prefix('y25_')
    y48 = features.shift(-48).dropna()[target_cols].add_prefix('y46_')

    merged = pd.concat([features, y24, y48], axis=1).dropna()
    n_targets = 2 * len(target_cols)  # the y25_* block followed by the y46_* block
    return merged.iloc[:, :-n_targets], merged.iloc[:, -n_targets:]


def get_train_test(train, test):
    """Build first-stage feature/target frames for a train and a test table.

    Both inputs get the same treatment: the 'cond_loc*' columns are dropped,
    6-row and 24-row moving averages of the forecast columns are appended,
    the 'index' column becomes the index, and the plant-environment columns
    shifted 24 and 48 rows ahead are attached as 'y25_*' / 'y46_*' targets.

    Args:
        train: DataFrame with an 'index' timestamp column, the three
            'cond_loc*' columns, the plant-environment columns and the
            forecast columns.
        test: DataFrame with the same layout.

    Returns:
        Tuple ``(train_X, train_Y, test_X, test_Y)``.
    """
    plant_env_col = ['tem_in_loc1', 'hum_in_loc1', 'tem_coil_loc1', 'tem_in_loc2', 'hum_in_loc2', 'tem_coil_loc2', 'tem_in_loc3', 'hum_in_loc3', 'tem_coil_loc3', 'tem_out_loc1', 'hum_out_loc1']

    train_X, train_Y = _attach_future_targets(_build_first_stage_features(train), plant_env_col)
    test_X, test_Y = _attach_future_targets(_build_first_stage_features(test), plant_env_col)

    return train_X, train_Y, test_X, test_Y


In [None]:
def linear_train_model(x_data, y_data, k=5):
    """Fit one Ridge regressor per K-fold split and report the mean MAE.

    Args:
        x_data: Feature frame; rows are selected positionally via ``.iloc``.
        y_data: Target Series/frame aligned row-for-row with ``x_data``.
        k: Number of shuffled folds (fixed seed 777).

    Returns:
        Tuple ``(models, val)``: the fitted model of every fold and the
        validation mean absolute error averaged over the folds.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=777)
    models = []
    fold_errors = []

    for fit_rows, holdout_rows in splitter.split(x_data):
        # Alternative tried by the author:
        # ElasticNet(alpha=0.005, l1_ratio=0.7, tol=0.0001, max_iter=100000, random_state=777)
        ridge = Ridge(alpha=0.5, random_state=777)
        ridge.fit(x_data.iloc[fit_rows], y_data.iloc[fit_rows])

        holdout_pred = ridge.predict(x_data.iloc[holdout_rows])
        fold_errors.append(mean_absolute_error(y_data.iloc[holdout_rows], holdout_pred))
        models.append(ridge)

    # Accumulate each fold's contribution to the average.
    val = 0
    for fold_error in fold_errors:
        val += fold_error / k

    return models, val

# 2. 1차 모델 훈련 및 예측 진행

In [None]:
# Plant 1 data
linear_train_X, linear_train_Y, linear_test_X, linear_test_Y = get_train_test(train, test)

# Plant 1 first-stage models: one scaler and one K-fold Ridge ensemble per
# target column of linear_train_Y (keys look like 'y25_tem_in_loc1').

linear_models1 = {}
linear_vals1 = {}
linear_scaler1 = {}

# The outdoor columns are the same for every target; select them once.
out_col = linear_train_X.filter(regex='out_loc1').columns.to_list()

for time_label in ['25', '46']:
    _y = linear_train_Y.filter(regex=f'y{time_label}_')
    # Forecast columns (and their moving averages) for this horizon.
    time_col = linear_train_X.filter(regex=f'_{time_label}$').columns.to_list()

    for label in _y.columns:
        print('train column : ', label)

        if 'out_loc1' in label:
            tcol = time_col + out_col
        else:
            # The last character of the label is the location number.
            in_col = linear_train_X.filter(regex=f'(in|coil)_loc{label[-1]}').columns.to_list()
            tcol = time_col + in_col

        raw = linear_train_X.loc[:, tcol]
        scaler = StandardScaler().fit(raw)
        # Build a fresh frame rather than writing floats back into a selection.
        x = pd.DataFrame(scaler.transform(raw), index=raw.index, columns=raw.columns)
        linear_scaler1[label] = scaler
        linear_models1[label], linear_vals1[label] = linear_train_model(x, _y[label])
        


In [None]:
# Average the fold models of every plant 1 target over linear_test_X and
# collect the predictions, one column per target.
_pred_by_target = {}
out_col = linear_train_X.filter(regex='out_loc1').columns.to_list()

for col in linear_models1:
    # col looks like 'y25_tem_in_loc1': col[1:3] is the horizon and col[-1]
    # the location.  The pattern is anchored with '_' exactly as in training
    # so the scaler receives the same columns in the same order.
    time_col = linear_train_X.filter(regex=f'_{col[1:3]}$').columns.to_list()

    if 'out_loc1' in col:
        tcol = time_col + out_col
    else:
        in_col = linear_train_X.filter(regex=f'(in|coil)_loc{col[-1]}').columns.to_list()
        tcol = time_col + in_col

    raw = linear_test_X[tcol]
    scaler = linear_scaler1[col]
    # Build a new frame instead of assigning into a selection of linear_test_X.
    x = pd.DataFrame(scaler.transform(raw), index=raw.index, columns=raw.columns)

    preds = [model.predict(x) for model in linear_models1[col]]
    _pred_by_target[col] = np.mean(preds, axis=0)

plant1_pred = pd.DataFrame(_pred_by_target, index=linear_test_X.index)

In [None]:
# Plant 2 data
linear_train_X, linear_train_Y, linear_test_X, linear_test_Y = get_train_test(train2, test2)

# Plant 2 first-stage models: one scaler and one K-fold Ridge ensemble per
# target column of linear_train_Y (keys look like 'y25_tem_in_loc1').

linear_models2 = {}
linear_vals2 = {}
linear_scaler2 = {}

# The outdoor columns are the same for every target; select them once.
out_col = linear_train_X.filter(regex='out_loc1').columns.to_list()

for time_label in ['25', '46']:
    _y = linear_train_Y.filter(regex=f'y{time_label}_')
    # Forecast columns (and their moving averages) for this horizon.
    time_col = linear_train_X.filter(regex=f'_{time_label}$').columns.to_list()

    for label in _y.columns:
        print('train column : ', label)

        if 'out_loc1' in label:
            tcol = time_col + out_col
        else:
            # The last character of the label is the location number.
            in_col = linear_train_X.filter(regex=f'(in|coil)_loc{label[-1]}').columns.to_list()
            tcol = time_col + in_col

        raw = linear_train_X.loc[:, tcol]
        scaler = StandardScaler().fit(raw)
        # Build a fresh frame rather than writing floats back into a selection.
        x = pd.DataFrame(scaler.transform(raw), index=raw.index, columns=raw.columns)
        linear_scaler2[label] = scaler
        linear_models2[label], linear_vals2[label] = linear_train_model(x, _y[label])


In [None]:
# Average the fold models of every plant 2 target and collect the
# predictions, one column per target.
# NOTE(review): this cell predicts on linear_train_X while the plant 1 cell
# uses linear_test_X.  The two frames are built from the same window in this
# notebook, so the choice is kept as written -- confirm which one is intended.
_pred_by_target = {}
out_col = linear_train_X.filter(regex='out_loc1').columns.to_list()

for col in linear_models2:
    # col looks like 'y25_tem_in_loc1': col[1:3] is the horizon and col[-1]
    # the location.  The pattern is anchored with '_' exactly as in training
    # so the scaler receives the same columns in the same order.
    time_col = linear_train_X.filter(regex=f'_{col[1:3]}$').columns.to_list()

    if 'out_loc1' in col:
        tcol = time_col + out_col
    else:
        in_col = linear_train_X.filter(regex=f'(in|coil)_loc{col[-1]}').columns.to_list()
        tcol = time_col + in_col

    raw = linear_train_X[tcol]
    scaler = linear_scaler2[col]
    # Build a new frame instead of assigning into a selection of linear_train_X.
    x = pd.DataFrame(scaler.transform(raw), index=raw.index, columns=raw.columns)

    preds = [model.predict(x) for model in linear_models2[col]]
    _pred_by_target[col] = np.mean(preds, axis=0)

plant2_pred = pd.DataFrame(_pred_by_target, index=linear_train_X.index)

# 3. 2차 모델 데이터 전처리

In [None]:
# Index the plant 1 table by timestamp, then take the condensation labels
# 24 and 48 rows ahead as the second-stage targets.
train = train.set_index('index')
_cond_now = train[['cond_loc1', 'cond_loc2', 'cond_loc3']]
cond24 = _cond_now.shift(-24).add_prefix('y25_')
cond48 = _cond_now.shift(-48).add_prefix('y46_')

# First-stage predictions next to the future labels; incomplete rows dropped.
plant1_second = pd.concat([plant1_pred, cond24, cond48], axis=1).dropna()

In [None]:
# Indoor temperature, indoor humidity and coil temperature columns.  The
# three lists are paired by position below.
tem_col = plant1_second.filter(regex='tem_in_').columns
hum_col = plant1_second.filter(regex='hum_in_').columns
coil_col = plant1_second.filter(regex='coil_').columns

for i, tem_name in enumerate(tem_col):
    horizon, place = tem_name[:3], tem_name[-7:]  # e.g. 'y25' and 'in_loc1'

    # Dew point from the predicted indoor temperature and humidity.
    dew_col = f'{horizon}_dewpoint_{place}'
    plant1_second[dew_col] = dewpoint(plant1_second[tem_name], plant1_second[hum_col[i]])

    # Coil temperature minus the dew point.
    plant1_second[f'{horizon}_dewdiff_{place}'] = plant1_second[coil_col[i]] - plant1_second[dew_col]

# Calendar features taken from the timestamp index.
_stamp = plant1_second.index
plant1_second['month'] = _stamp.month
plant1_second['day'] = _stamp.day
plant1_second['hour'] = _stamp.hour


In [None]:
# Index the plant 2 table by timestamp, then take the condensation labels
# 24 and 48 rows ahead as the second-stage targets.
train2 = train2.set_index('index')
_cond_now2 = train2[['cond_loc1', 'cond_loc2', 'cond_loc3']]
cond24 = _cond_now2.shift(-24).add_prefix('y25_')
cond48 = _cond_now2.shift(-48).add_prefix('y46_')

# First-stage predictions next to the future labels; incomplete rows dropped.
plant2_second = pd.concat([plant2_pred, cond24, cond48], axis=1).dropna()

In [None]:
# Indoor temperature, indoor humidity and coil temperature columns.  The
# three lists are paired by position below.
tem_col = plant2_second.filter(regex='tem_in_').columns
hum_col = plant2_second.filter(regex='hum_in_').columns
coil_col = plant2_second.filter(regex='coil_').columns

for i, tem_name in enumerate(tem_col):
    horizon, place = tem_name[:3], tem_name[-7:]  # e.g. 'y25' and 'in_loc1'

    # Dew point from the predicted indoor temperature and humidity.
    dew_col = f'{horizon}_dewpoint_{place}'
    plant2_second[dew_col] = dewpoint(plant2_second[tem_name], plant2_second[hum_col[i]])

    # Coil temperature minus the dew point.
    plant2_second[f'{horizon}_dewdiff_{place}'] = plant2_second[coil_col[i]] - plant2_second[dew_col]

# Calendar features taken from the timestamp index.
_stamp2 = plant2_second.index
plant2_second['month'] = _stamp2.month
plant2_second['day'] = _stamp2.day
plant2_second['hour'] = _stamp2.hour


In [None]:
# Separate the second-stage features from the six condensation targets.
_cond_targets = ['y25_cond_loc1', 'y25_cond_loc2', 'y25_cond_loc3', 'y46_cond_loc1', 'y46_cond_loc2', 'y46_cond_loc3']

plant1_second_X = plant1_second.drop(columns=_cond_targets)
plant1_second_Y = plant1_second.filter(regex='cond')

plant2_second_X = plant2_second.drop(columns=_cond_targets)
plant2_second_Y = plant2_second.filter(regex='cond')

# 4. 2차 모델 훈련

In [None]:
def train_second_model(x_data, y_data, k=5):
    """Fit one LightGBM binary classifier per stratified fold.

    Args:
        x_data: 2-D feature matrix (NumPy array or DataFrame).
        y_data: 1-D binary labels aligned row-for-row with ``x_data``.
        k: Number of shuffled stratified folds (fixed seed 777).

    Returns:
        Tuple ``(models, (loss, score))``: the fitted classifier of every
        fold, the validation log-loss averaged over the folds, and the
        validation ROC AUC averaged over the folds.
    """
    # The folds are selected with positional index arrays, which only works
    # on arrays; convert so a DataFrame/Series input is handled as well.
    x_data = np.asarray(x_data)
    y_data = np.asarray(y_data)

    # Identical for every fold, so build the settings once.
    params = {
        'boosting_type' : 'gbdt',
        'n_estimators'  : 8000,
        'learning_rate' : 0.01,
        'objective'     :'binary',
        'early_stopping_round': 15,
        'reg_alpha' : 0.,           # L1 regularization term on weights.
        'reg_lambda': 0.3,           # L2 regularization term on weights.
        'feature_fraction' : 0.60,
    }

    models = []
    sk_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=777)
    loss = 0
    score = 0
    for train_idx, val_idx in sk_fold.split(x_data, y_data):
        x_train, y_train = x_data[train_idx], y_data[train_idx]
        x_val, y_val = x_data[val_idx], y_data[val_idx]

        model = LGBMClassifier(**params)
        # NOTE(review): `verbose` as a fit() argument depends on the installed
        # LightGBM version -- confirm against requirements.txt.
        model.fit(x_train, y_train, eval_set=(x_val, y_val), verbose=100)
        pred = model.predict_proba(x_val)[:, 1]  # probability of class 1

        loss += log_loss(y_val, pred)/k
        score += roc_auc_score(y_val, pred)/k

        models.append(model)

    return (models, (loss, score))

In [None]:
# Train the plant 1 condensation classifiers: one fold ensemble per
# (horizon, location) pair, keyed like 'y25_loc1'.
second_models1 = {}
second_eval1 = {}
date_col = ['month', 'day', 'hour']

for time_label in ['y25', 'y46']:
    X_time = plant1_second_X.filter(regex=f'{time_label}')
    y_time = plant1_second_Y.filter(regex=f'{time_label}_cond')
    # The outdoor columns do not depend on the location.
    out_col = X_time.filter(regex='out_loc1').columns.to_list()

    for loc_label in ['loc1', 'loc2', 'loc3']:
        model_key = f'{time_label}_{loc_label}'
        print(f'train : {model_key}')

        in_col = X_time.filter(regex=f'(in|coil)_{loc_label}').columns.to_list()
        tcol = in_col + out_col + date_col
        labels = y_time.filter(regex=f'{loc_label}').values.reshape(-1,)

        # fit_resample is the supported name of the legacy fit_sample alias.
        # Plain arrays go in so plain arrays come out, which is what
        # train_second_model indexes positionally.
        # NOTE(review): resampling happens before the fold split, so the
        # validation folds are drawn from resampled data.
        x, y = SMOTEENN(random_state=777).fit_resample(plant1_second_X[tcol].values, labels)

        second_models1[model_key], second_eval1[model_key] = train_second_model(x, y)

In [None]:
# Train the plant 2 condensation classifiers: one fold ensemble per
# (horizon, location) pair, keyed like 'y25_loc1'.
second_models2 = {}
second_eval2 = {}
date_col = ['month', 'day', 'hour']

for time_label in ['y25', 'y46']:
    X_time = plant2_second_X.filter(regex=f'{time_label}')
    y_time = plant2_second_Y.filter(regex=f'{time_label}_cond')
    # The outdoor columns do not depend on the location.
    out_col = X_time.filter(regex='out_loc1').columns.to_list()

    for loc_label in ['loc1', 'loc2', 'loc3']:
        model_key = f'{time_label}_{loc_label}'
        print(f'train : {model_key}')

        in_col = X_time.filter(regex=f'(in|coil)_{loc_label}').columns.to_list()
        tcol = in_col + out_col + date_col
        labels = y_time.filter(regex=f'{loc_label}').values.reshape(-1,)

        # fit_resample is the supported name of the legacy fit_sample alias.
        # Plain arrays go in so plain arrays come out, which is what
        # train_second_model indexes positionally.
        # NOTE(review): resampling happens before the fold split, so the
        # validation folds are drawn from resampled data.
        x, y = SMOTEENN(random_state=777).fit_resample(plant2_second_X[tcol].values, labels)

        second_models2[model_key], second_eval2[model_key] = train_second_model(x, y)

In [None]:
# 5fold validation 결과
v = 0
for x in second_eval1:
    v += second_eval1[x][1]/len(second_eval1)

for x in second_eval2:
    v += second_eval2[x][1]/len(second_eval2)

print(v/2)

# 5. Test Set에 대한 예측 진행

In [None]:
# Per-plant lookup tables for inference, keyed by the plant id as a string.
forecast_model = {'1': linear_models1, '2': linear_models2}  # first-stage Ridge ensembles
scalers = {'1': linear_scaler1, '2': linear_scaler2}  # matching StandardScalers
classifiers = {'1': second_models1, '2': second_models2}  # second-stage LightGBM ensembles

In [None]:
# Load the evaluation rows (first column parsed as the datetime index) and
# strip the 'plant_test.' prefix from the column names.
test = pd.read_csv(path + 'test.csv', parse_dates=[0], index_col=[0])
test.rename(columns=lambda name: name.replace('plant_test.', ''), inplace=True)

In [None]:
# 날씨 예보 데이터 
forecast_ref = forecast.asfreq('30min')
forecast_ref = forecast_ref.interpolate()

ma6 = forecast_ref.rolling(6).mean().add_prefix('MA6_')
ma24= forecast_ref.rolling(24).mean().add_prefix('MA24_')
forecast_ref = pd.concat([forecast_ref, ma6, ma24], axis=1).dropna()

In [None]:
# Display the loaded test frame for a visual check.
test

In [None]:
# Two-stage inference, one test row at a time:
#   1. the Ridge ensembles predict the plant environment at both horizons
#      from the current sensor readings plus the forecast features;
#   2. dew-point and calendar features are derived from those predictions and
#      the LightGBM ensembles turn them into condensation probabilities.
_result_columns = ['plant', 'loc', '24h_cond_proba', '48h_cond_proba']
_sensor_in_col = ['tem_in', 'hum_in', 'tem_coil']
_sensor_out_col = ['tem_out_loc1', 'hum_out_loc1']
date_col = ['month', 'day', 'hour']
_result_rows = []  # collected here and turned into a frame once, after the loop

for idx in tqdm(range(len(test))):
    data = test.iloc[idx:idx+1, :7]
    # '%H:%M:%S' is the portable spelling of '%T'.
    tidx = data.index[0].strftime('%Y-%m-%d %H:%M:%S')
    fc = forecast_ref.loc[tidx:tidx]
    current = pd.concat([data.iloc[:, 2:], fc], axis=1)
    # Positional access: the Series is indexed by timestamp, not by integer.
    plant = str(data['plant'].iloc[0])
    loc = f"loc{data['loc'].iloc[0]}"

    # Stage 1: predicted environment for each horizon
    second_df = pd.DataFrame(index=data.index)
    for time in ['25', '46']:
        time_col = current.filter(regex=f'_{time}').columns.to_list()
        for col in _sensor_in_col + _sensor_out_col:
            if 'out_' in col:
                model_key = f"y{time}_{col}"
                tcol = time_col + _sensor_out_col
            else:
                model_key = f"y{time}_{col}_{loc}"
                tcol = time_col + _sensor_in_col
            # Hand over plain arrays: the test columns are named 'tem_in'
            # etc. while the scalers and models were fitted on
            # 'tem_in_loc1'-style names, and scikit-learn versions that
            # validate feature names reject the mismatch.  The column order
            # is the same as at fit time.
            x = scalers[plant][model_key].transform(current[tcol].values)
            preds = [model.predict(x) for model in forecast_model[plant][model_key]]
            second_df[f"{time}_{col}"] = np.mean(preds, axis=0)

    # Dew-point features, paired by position
    tem_col = second_df.filter(regex='tem_in').columns
    hum_col = second_df.filter(regex='hum_in').columns
    coil_col = second_df.filter(regex='coil').columns

    for i, tem_name in enumerate(tem_col):
        dew_col = f'{tem_name[:3]}dewpoint{tem_name[-7:]}'
        second_df[dew_col] = dewpoint(second_df[tem_name], second_df[hum_col[i]])

        second_df[f'{tem_name[:3]}dewdiff{tem_name[-7:]}'] = second_df[coil_col[i]] - second_df[dew_col]

    second_df['month'] = second_df.index.month
    second_df['day'] = second_df.index.day
    second_df['hour'] = second_df.index.hour

    # Stage 2: mean class-1 probability over the fold classifiers
    p = {}
    for time_label in ['25', '46']:
        X_time = second_df.filter(regex=f'{time_label}')
        in_col = X_time.filter(regex='(in|coil)').columns.to_list()
        out_col = X_time.filter(regex='out_loc1').columns.to_list()
        tcol = in_col + out_col + date_col

        fold_models = classifiers[plant][f'y{time_label}_{loc}']
        features = second_df[tcol].values
        # Divide by the actual ensemble size instead of a hard-coded 5.
        p[time_label] = sum(m.predict_proba(features)[0, 1] / len(fold_models) for m in fold_models)

    _result_rows.append([plant, loc[-1], p['25'], p['46']])

result = pd.DataFrame(_result_rows, columns=_result_columns)

In [None]:
# Load the submission template that the prediction columns are written into.
sample = pd.read_csv(path + 'sample.csv')

In [None]:
# Write the predicted probabilities and their 0/1 decisions (1 when the
# probability is above 0.5) into the submission frame.
_proba_24h = result['24h_cond_proba']
_proba_48h = result['48h_cond_proba']

sample['X24H_COND_LOC'] = np.where(_proba_24h > 0.5, 1, 0)
sample['X24H_COND_LOC_PROB'] = _proba_24h
sample['X48H_COND_LOC'] = np.where(_proba_48h > 0.5, 1, 0)
sample['X48H_COND_LOC_PROB'] = _proba_48h

In [None]:
# Make sure both probability columns are stored as float64.
for _prob_col in ('X24H_COND_LOC_PROB', 'X48H_COND_LOC_PROB'):
    sample[_prob_col] = sample[_prob_col].astype(np.float64)

In [None]:
# Express the probabilities as percentages rounded to two decimals.
sample['X24H_COND_LOC_PROB'] = (sample['X24H_COND_LOC_PROB'] * 100).round(2)
sample['X48H_COND_LOC_PROB'] = (sample['X48H_COND_LOC_PROB'] * 100).round(2)

In [None]:
# Count how many rows received each 24h decision value.
sample['X24H_COND_LOC'].value_counts()

In [None]:
# Count how many rows received each 48h decision value.
sample['X48H_COND_LOC'].value_counts()

In [None]:
# Save the final result (uncomment the next line to write the submission CSV)
# sample.to_csv(path+'203752.csv', index=False)