데이터 로드

In [1]:
import multiprocessing 
from multiprocessing import Pool 
from functools import partial 
from data_loader_v2 import data_loader_v2
import os 
import pandas as pd
import numpy as np
import joblib

train_folder = 'train/'
train_label_path = 'train_label.csv'

In [None]:
train_list = os.listdir(train_folder)
train_label = pd.read_csv(train_label_path, index_col=0)

In [None]:
def data_loader_all(func, files, folder='', train_label=None, event_time=15, nrows=75):   
    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)  
    if __name__ == '__main__':
        pool = Pool(processes=multiprocessing.cpu_count()) 
        df_list = list(pool.imap(func_fixed, files)) 
        pool.close() 
        pool.join() 
    combined_df = pd.concat(df_list)
    return combined_df

In [None]:
train = data_loader_all(data_loader_v2, train_list, folder=train_folder, train_label=train_label, event_time=15, nrows=75)

In [None]:
train=train.loc[:,train.std()!=0]

In [None]:
X_train = train.drop(['label'], axis=1)
y_train = train['label']

In [None]:
np.savez('train_ver3.npz', X=X_train, y=y_train)

load

In [4]:
train = np.load('train_fin.npz')
test = np.load('test_fin.npz')

In [5]:
X_train = train['X']
y_train = train['y']
X_test = test['X']

In [6]:
X_train.shape,y_train.shape,X_test.shape

((49620, 3428), (49620,), (32310, 3428))

스태킹

In [7]:
import lightgbm as lgbm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
# 사용 모델 세팅
lgbm_1_clf  = lgbm.LGBMClassifier(learning_rate = 0.06,
                                  num_leaves = 400,
                                  num_class= 198,
                                  max_depth= -1,
                                  min_child_weight = 3,
                                  colsample_bytree = 0.5,
                                  feature_fraction= 0.0,
                                  bagging_fraction= 0.0)

lgbm_2_clf  = lgbm.LGBMClassifier(learning_rate= 0.0267,
                                  num_leaves= 100, 
                                  num_class= 198,
                                  max_depth= -1,
                                  min_child_weight= 100.0,
                                  colsample_bytree= 0.5,
                                  feature_fraction= 0.0,
                                  bagging_fraction= 0.0)

lgbm_3_clf  = lgbm.LGBMClassifier(learning_rate= 0.002233,
                                  num_leaves= 75,
                                  num_class=198,
                                  max_depth= 7,
                                  min_child_weight= 0,
                                  colsample_bytree= 0.5,
                                  feature_fraction= 0.4,
                                  bagging_fraction= 0.4)

lgbm_4_clf  = lgbm.LGBMClassifier(learning_rate= 0.05,
                                  num_leaves= 10,
                                  num_class= 198,
                                  n_estimators = 100,
                                  max_depth= 2,
                                  min_child_weight= 0,
                                  colsample_bytree= 0.5,
                                  feature_fraction= 0.3,
                                  bagging_fraction= 0.3)

# 최종 Stacking 모델
lr_final = LogisticRegression(C=10)

In [9]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds ):
    # 지정된 n_folds값으로 KFold 생성.
    kf = KFold(n_splits=n_folds, shuffle=False, random_state=0)
    #추후에 메타 모델이 사용할 학습 데이터 반환을 위한 넘파이 배열 초기화 
    train_fold_pred = np.zeros((X_train_n.shape[0] ,1 ))
    test_pred = np.zeros((X_test_n.shape[0],n_folds))
    print(model.__class__.__name__ , ' model 시작 ')
    
    for folder_counter , (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        #입력된 학습 데이터에서 기반 모델이 학습/예측할 폴드 데이터 셋 추출 
        print('\t 폴드 세트: ',folder_counter,' 시작 ')
        X_tr = X_train_n[train_index] 
        y_tr = y_train_n[train_index] 
        X_te = X_train_n[valid_index]  
        
        #폴드 세트 내부에서 다시 만들어진 학습 데이터로 기반 모델의 학습 수행.
        model.fit(X_tr , y_tr)      
        #폴드 세트 내부에서 다시 만들어진 검증 데이터로 기반 모델 예측 후 데이터 저장.
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1,1)
        #입력된 원본 테스트 데이터를 폴드 세트내 학습된 기반 모델에서 예측 후 데이터 저장. 
        test_pred[:, folder_counter] = model.predict(X_test_n)
            
    # 폴드 세트 내에서 원본 테스트 데이터를 예측한 데이터를 평균하여 테스트 데이터로 생성 
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)    
    
    #train_fold_pred는 최종 메타 모델이 사용하는 학습 데이터, test_pred_mean은 테스트 데이터
    return train_fold_pred , test_pred_mean

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds ):
    kf = KFold(n_splits=n_folds, shuffle=False, random_state=0)
    train_fold_pred = np.zeros((X_train_n.shape[0] ,1 ))
    test_pred = np.zeros((X_test_n.shape[0],n_folds))
    print(model.__class__.__name__ , ' model 시작 ')
    
    for folder_counter , (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        print('\t 폴드 세트: ',folder_counter,' 시작 ')
        #폴드 데이터셋 추출
        X_tr = X_train_n[train_index] 
        y_tr = y_train_n[train_index] 
        X_te = X_train_n[valid_index]  
        
        model.fit(X_tr , y_tr)      
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1,1)
        test_pred[:, folder_counter] = model.predict(X_test_n)
            
    # 평균내어 사용할 데이터셋 설정
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)    
    
    return train_fold_pred , test_pred_mean

In [10]:
lgbm_1_train, lgbm_1_test = get_stacking_base_datasets(lgbm_1_clf, X_train, y_train, X_test, 5)
lgbm_2_train, lgbm_2_test = get_stacking_base_datasets(lgbm_2_clf, X_train, y_train, X_test, 5)
lgbm_3_train, lgbm_3_test = get_stacking_base_datasets(lgbm_3_clf, X_train, y_train, X_test, 5)    

LGBMClassifier  model 시작 
	 폴드 세트:  0  시작 
	 폴드 세트:  1  시작 
	 폴드 세트:  2  시작 
	 폴드 세트:  3  시작 
	 폴드 세트:  4  시작 
LGBMClassifier  model 시작 
	 폴드 세트:  0  시작 
	 폴드 세트:  1  시작 
	 폴드 세트:  2  시작 
	 폴드 세트:  3  시작 
	 폴드 세트:  4  시작 
LGBMClassifier  model 시작 
	 폴드 세트:  0  시작 
	 폴드 세트:  1  시작 
	 폴드 세트:  2  시작 
	 폴드 세트:  3  시작 
	 폴드 세트:  4  시작 


In [11]:
Stack_final_X_train = np.concatenate((lgbm_1_train, lgbm_2_train, lgbm_3_train), axis=1)
Stack_final_X_test = np.concatenate((lgbm_1_test, lgbm_2_test, lgbm_3_test), axis=1)
print('원본 학습 피처 데이터 Shape:',X_train.shape, '원본 테스트 피처 Shape:',X_test.shape)
print('스태킹 학습 피처 데이터 Shape:', Stack_final_X_train.shape,
      '스태킹 테스트 피처 데이터 Shape:',Stack_final_X_test.shape)

원본 학습 피처 데이터 Shape: (49620, 3428) 원본 테스트 피처 Shape: (32310, 3428)
스태킹 학습 피처 데이터 Shape: (49620, 3) 스태킹 테스트 피처 데이터 Shape: (32310, 3)


In [12]:
lr_final.fit(Stack_final_X_train, y_train)
stack_final = lr_final.predict(Stack_final_X_test)



In [33]:
lr_final.fit(Stack_final_X_train, y_train)
stack_final_2 = lr_final.predict_proba(Stack_final_X_test)



In [None]:
lgbm_4_train, lgbm_4_test = get_stacking_base_datasets(lgbm_4_clf, X_train, y_train, X_test, 5)

LGBMClassifier  model 시작 
	 폴드 세트:  0  시작 


In [None]:
Stack_final_X_train_2 = np.concatenate((lgbm_1_train, lgbm_2_train, lgbm_3_train, lgbm_4_train), axis=1)
Stack_final_X_test_2 = np.concatenate((lgbm_1_test, lgbm_2_test, lgbm_3_test, lgbm_4_test), axis=1)
print('원본 학습 피처 데이터 Shape:',X_train.shape, '원본 테스트 피처 Shape:',X_test.shape)
print('스태킹 학습 피처 데이터 Shape:', Stack_final_X_train.shape_2,
      '스태킹 테스트 피처 데이터 Shape:',Stack_final_X_test.shape_2)

In [None]:
lr_final.fit(Stack_final_X_train, y_train)
stack_final = lr_final.predict(Stack_final_X_test)

In [28]:
submission = pd.DataFrame(data=stack_final)
submission.index = test_1.index
submission.index.name = 'id'
submission = submission.sort_index()
submission = submission.groupby('id').mean()
submission.to_csv('submission.csv', index=True) #제출 파일 만들기

In [34]:
submission = pd.DataFrame(data=stack_final_2)
submission.index = test_1.index
submission.index.name = 'id'
submission = submission.sort_index()
submission = submission.groupby('id').mean()
submission.to_csv('submission_2.csv', index=True) #제출 파일 만들기

In [18]:
test_folder = 'test/'
train_folder = 'train/'
test_list = os.listdir(test_folder)
train_list = os.listdir(train_folder)
train_label_path = 'train_label.csv'
train_label = pd.read_csv(train_label_path, index_col=0)

In [19]:
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=15, nrows=75):   
    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)  
    if __name__ == '__main__':
        pool = Pool(processes=multiprocessing.cpu_count()) 
        df_list = list(pool.imap(func_fixed, files)) 
        pool.close() 
        pool.join() 
    combined_df = pd.concat(df_list)
    return combined_df

In [20]:
train = data_loader_all_v2(data_loader_v2, train_list, folder=train_folder, train_label=train_label, event_time=15, nrows=75)

In [25]:
test = data_loader_all_v2(data_loader_v2, test_list, folder=test_folder, train_label=None, event_time=15, nrows=75)

In [37]:
train=train.loc[:,train.std()!=0]

In [38]:
train.shape

(49620, 3429)

In [23]:
X_train = train_1.drop(['label'], axis=1)
y_train = train_1['label']

In [24]:
list_col = []
list_col = X_train.columns

In [60]:
test=test.loc[:,list_col]

In [27]:
test_1.shape

(32310, 3428)

In [35]:
from sklearn.model_selection import KFold
import lightgbm

import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader_v2 import data_loader_v2
import os
import pandas as pd
import numpy as np
import joblib

In [36]:
parms = {
    'learning_rate' : 0.06,
    'num_leaves' : 400,
    'n_estimators' : 300,
    'max_depth': -1,
    'min_child_weight' : 3, 
    'subsample' : 0.8,
    'colsample_bytree' : 0.5,
    'objective' : 'multiclass',
    'n_jobs': -1
}

In [64]:
x_train.drop('label',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [65]:
lucky_seed=[5436,1265,3416]

for num,rs in enumerate(lucky_seed):

    kfold = KFold(n_splits=4, random_state = rs, shuffle = True)

    # dacon code
    cv=np.zeros((train.shape[0],198))

    for n, (train_idx, validation_idx) in enumerate(kfold.split(train)):

        model = lightgbm.LGBMClassifier(**parms, random_state=rs)

        model.fit(x_train, y_train, eval_set=[(x_validation, y_validation)], early_stopping_rounds= 30,
                  verbose=100) 
        joblib.dump(model, '2_Code_pred/%s_fold_model_%s.pkl'%(n,rs))

        # CROSS-VALIDATION , EVALUATE CV
        cv[validation_idx,:] = model.predict_proba(x_validation)

ValueError: Length of feature_name(3429) and num_feature(3428) don't match

In [None]:
# MODEL LOAD & TEST PREDICT
# 12 MODELS 평균 사용
models = os.listdir('2_Code_pred/')
models_list = [x for x in models if x.endswith(".pkl")]
assert len(models_list) ==12
temp_predictions = np.zeros((test.shape[0],198))

for m in models_list:
    model = joblib.load('2_Code_pred/'+m)
    predict_proba = model.predict_proba(test)
    temp_predictions += predict_proba/12

In [None]:
lr_final.fit(Stack_final_X_train, y_train)
stack_final_2 = lr_final.predict_proba(Stack_final_X_test)

In [46]:
datafin = model.predict_proba(X_test)

ValueError: Number of features of the model must match the input. Model n_features_ is 3429 and input n_features is 3428 

In [54]:
# dacon code
submission = pd.DataFrame(data=np.zeros((test.shape[0],198)))
submission.index = test.index 
submission.index.name = 'id'
submission+=temp_predictions

submission = submission.sort_index()
submission = submission.groupby('id').mean()
submission.to_csv('submission_3.csv', index=True)