In [1]:
import sys

# Extra import location for locally maintained modules (presumably where
# data_loader_v2 lives — confirm). Guarded so that re-running this cell does
# not append the same entry to sys.path again.
local_lib_dir = "C:/python_Lib"
if local_lib_dir not in sys.path:
    sys.path.append(local_lib_dir)

In [2]:
import os
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader_v2 import data_loader_v2 # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedShuffleSplit
import joblib # 모델을 저장하고 불러오는 역할

In [3]:
# Single root for the competition data: edit only this line to relocate the dataset.
data_root = 'C:/dacon_nuclear_plant/'
train_folder = data_root + 'train/'                 # folder listed to obtain the training files
test_folder = data_root + 'test/'                   # folder listed to obtain the test files
train_label_path = data_root + 'train_label.csv'    # CSV with the training labels

In [4]:
# Names of the entries in the train / test folders.
# os.listdir returns entries in arbitrary order, so sort them: the order of the
# files determines the row order of the concatenated frames, and a fixed order
# keeps the run reproducible across machines and file systems.
train_list = sorted(os.listdir(train_folder))
test_list = sorted(os.listdir(test_folder))
# Label table; the first CSV column is used as the index.
train_label = pd.read_csv(train_label_path, index_col=0)

In [5]:
# Assumes every CSV file switches to state_B at the same point in time (event_time).
# In reality the switch time of an individual CSV file may differ.
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):
    """Apply ``func`` to every entry of ``files`` and concatenate the results.

    Parameters
    ----------
    func : callable
        Per-file loader, called as ``func(file, folder=..., train_label=...,
        event_time=..., nrows=...)``. Its return values are handed to
        ``pd.concat``. It must be picklable when worker processes are used.
    files : iterable
        Items passed to ``func`` one at a time (file names in this notebook).
    folder, train_label, event_time, nrows
        Forwarded unchanged to every ``func`` call.

    Returns
    -------
    The result of ``pd.concat`` over the per-file outputs, in the order of ``files``.

    Notes
    -----
    When this function is defined in ``__main__`` the files are distributed over
    one worker process per CPU. Anywhere else (e.g. imported from a module) the
    files are processed sequentially; before this change that path left
    ``df_list`` unassigned and failed with ``UnboundLocalError``.
    """
    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)
    if __name__ == '__main__':
        # The context manager releases the worker processes even when a worker
        # raises; imap yields the results in the same order as `files`.
        with Pool(processes=multiprocessing.cpu_count()) as pool:
            df_list = list(pool.imap(func_fixed, files))
    else:
        # Sequential fallback: same results, no worker processes.
        df_list = [func_fixed(file) for file in files]
    combined_df = pd.concat(df_list)
    return combined_df

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,  
                       criterion='gini', max_depth=None, max_features='auto',  
                       max_leaf_nodes=None, max_samples=None,  
                       min_impurity_decrease=0.0, min_impurity_split=None,  
                       min_samples_leaf=1, min_samples_split=2,  
                       min_weight_fraction_leaf=0.0, n_estimators=100,  
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,  
                       warm_start=False)

In [6]:
# Load every training file, together with the label table, into one frame.
train = data_loader_all_v2(
    data_loader_v2,
    train_list,
    folder=train_folder,
    train_label=train_label,
    event_time=10,
    nrows=60,
)

In [7]:
# Remove single-valued columns: read the stored list of columns to delete.
load_file = 'del_col_list.csv'
directory = 'C:/dacon_nuclear_plant/'
del_col_list = pd.read_csv(f'{directory}{load_file}')

In [8]:
# Discard the unneeded "Unnamed: 0" column.
del_col_list = del_col_list.drop(columns=["Unnamed: 0"])

In [9]:
# Selecting a DataFrame column yields a Series, so move 'columns' into the
# index and keep only that index (a pandas Index of the column names to delete).
del_col_list = del_col_list.set_index(['columns']).index
del_col_list

Index(['V0019', 'V0020', 'V0021', 'V0022', 'V0023', 'V0024', 'V0034', 'V0035',
       'V0036', 'V0037',
       ...
       'V5105', 'V5106', 'V5107', 'V5108', 'V5109', 'V5110', 'V5111', 'V5112',
       'V5113', 'V5114'],
      dtype='object', name='columns', length=1848)

In [10]:
%%time
# train set에서 유니크값 하나인 컬럼 날리기
for i in range(len(del_col_list)):      # 개별로 불러들여 작업하는것을 추천, 적은 용량의 파일이라면 해당 방법을 추천  
    del train[del_col_list[i]]

train.shape

Wall time: 20 s


In [11]:
# Load every test file into one frame (no labels are available for the test set).
test = data_loader_all_v2(
    data_loader_v2,
    test_list,
    folder=test_folder,
    train_label=None,
    event_time=10,
    nrows=60,
)

In [12]:
%%time
# test set에서 유니크값 하나인 컬럼 날리기
for i in range(len(del_col_list)):
    del test[del_col_list[i]]
    
test.shape

Wall time: 19.4 s


(36000, 3273)

In [26]:
# (On request) CSV export of the DataFrame loaded with data_loader_v2 after the column removal

# train.to_csv("C:/dacon_nuclear_plant/dataLoaderV2_del_columns.csv")

In [13]:
# modeling: separate the target column from the features, then configure the classifier
y_train = train['label']
X_train = train.drop(columns=['label'])
model = RandomForestClassifier(n_jobs=-1, random_state=0, verbose=1)

In [12]:
# # Kfold cross validation
# # This takes a long time to run

# kfold = KFold(n_splits=10)
# stratified_shuffle_split = StratifiedShuffleSplit(train_size=0.7,test_size=0.3,n_splits=10,random_state=0)

# scores = cross_val_score(model, X_train, y_train)
# print("k-5 cross validation score : {}".format(scores))

# # k-5 cross validation score : [0.76956522 0.82584541 0.81992754 0.83055556 0.81859903]

# print("k-fold cross validation mean score : {}".format(scores.mean()))

# # k-fold cross validation mean score : 0.8128985507246377

# # validation 10번
# scores_k10 = cross_val_score(model, X_train, y_train, cv=kfold)
# print("k-10 cross validation score : {}".format(scores_k10))

# # k-10 cross validation score : [0.72874396 0.70942029 0.79057971 0.71280193 0.7089372  0.66956522
# # 0.78719807 0.75845411 0.7468599  0.68478261]

# # shuffle_split
# scores_shuffle = cross_val_score(model, X_train, y_train, cv=stratified_shuffle_split)
# print("shuffle cross validation score : {}".format(scores_shuffle))

# # shuffle cross validation score : [0.88856683 0.88848631 0.88776167 0.88711755 0.88679549 0.88534622
# # 0.884219   0.88333333 0.88615137 0.8889694]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   48.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.9min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   42.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.8min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]:

k-5 cross validation score : [0.76956522 0.82584541 0.81992754 0.83055556 0.81859903]


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   46.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.0min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   46.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.9min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   47.2s
[Parallel(n_jobs=-1)]:

k-10 cross validation score : [0.72874396 0.70942029 0.79057971 0.71280193 0.7089372  0.66956522
 0.78719807 0.75845411 0.7468599  0.68478261]
k-fold cross validation mean score : 0.8128985507246377


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.6min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.6min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]:

shuffle cross validation score : [0.88856683 0.88848631 0.88776167 0.88711755 0.88679549 0.88534622
 0.884219   0.88333333 0.88615137 0.8889694 ]


In [14]:
# Learning: fit the classifier on the training features and labels
model.fit(X=X_train, y=y_train)

# Save the fitted model to disk
joblib.dump(value=model, filename='model.pkl')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.8min finished


['model.pkl']

In [17]:
# Load the saved model back from disk
model = joblib.load(filename='model.pkl')

# prediction: class probabilities for every row of the test frame
pred = model.predict_proba(X=test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.9s finished


In [18]:
# Wrap the predicted probabilities in a frame indexed like the test set.
# The index is supplied to the constructor and named through rename_axis, which
# returns a new object; assigning `submission.index.name` in place could also
# rename the Index object that `test` itself holds.
submission = pd.DataFrame(data=pred, index=test.index).rename_axis('id')

# Average all rows that share an id. groupby sorts the group keys by default,
# so the result is already ordered by id and a separate sort_index() is not needed.
submission = submission.groupby('id').mean()

# Create the submission file
submission.to_csv('C:/dacon_nuclear_plant/submission2.csv', index=True)