In [2]:
import os
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader_v2 import data_loader_v2
from tqdm import tqdm_notebook
from sklearn.ensemble import RandomForestClassifier
import joblib # 모델을 저장하고 불러오는 역할

In [3]:
train_folder = 'data/train/'
test_folder = 'data/test/'
train_label_path = 'data/train_label.csv'

In [4]:
train_list = os.listdir(train_folder)
test_list = os.listdir(test_folder)
train_label = pd.read_csv(train_label_path, index_col=0)

In [5]:
# 데이콘 제공
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):   
    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)     
    if __name__ == '__main__':
        pool = Pool(processes=multiprocessing.cpu_count()) 
        df_list = list(pool.imap(func_fixed, files)) 
        pool.close()
        pool.join()        
    combined_df = pd.concat(df_list)    
    return combined_df

In [6]:
train = data_loader_all_v2(data_loader_v2, train_list, folder=train_folder, train_label=train_label, event_time=10, nrows=60)

In [8]:
test = data_loader_all_v2(data_loader_v2, test_list, folder=test_folder, train_label=None, event_time=20, nrows=60)

In [10]:
#데이터 프레임생성
X_train = train.drop(['label'], axis=1)
y_train = train['label']

In [12]:
train.head()

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120,label
0,30.474394,8.691177,8.714483,8.687399,8.72123,207.697895,165.86573,-6.018876999999999e-19,0.0,-0.002136,...,1.0,1.0,1.0,60.0,0.0,0.0,1.42162e-05,85.4,0.0,110
0,30.470463,8.736521,8.682769,8.717135,8.682402,192.66508,191.006871,-3.9187579999999997e-19,0.0,0.00171,...,1.0,1.0,1.0,60.0,0.0,0.0,-6.114455e-06,85.4,0.0,110
0,30.465427,8.753559,8.663426,8.700049,8.734147,187.065171,192.700238,-1.7991789999999997e-19,0.0,0.000493,...,1.0,1.0,1.0,60.0,0.0,0.0,-1.813291e-05,85.4,0.0,110
0,30.458532,8.715056,8.714854,8.717174,8.699257,188.500036,180.150567,-6.636970999999999e-19,0.0,0.000318,...,1.0,1.0,1.0,60.0,0.0,0.0,-5.745568e-07,85.4,0.0,110
0,30.475773,8.790241,8.735125,8.703167,8.72103,193.269046,195.98489,-6.379752e-20,0.0,-9.1e-05,...,1.0,1.0,1.0,60.0,0.0,0.0,8.437883e-06,85.4,0.0,110


In [None]:
#차원축소 분류 예측 성능 평가

#원본 데이터세트
#랜덤 포레스트 이용해 타깃 값이 디폴트 값을 3개 교차 검증 세트로 분류 예측
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rcf=RandomForestClassifier(n_estimators=300, random_state=156)
scores = cross_val_score(rcf, X_train, y_train, scoring='accuracy', cv=3)
print('CV3인 경우 개별 fold 세트별 정확도 : ', scores)
print('평균 정확도 : {0:.4f}'.format(np.mean(scores)))

In [15]:
#PCA차원축소
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
#원본스케일링
scaler = StandardScaler()
df_scaled=scaler.fit_transform(X_train)
#컴포넌트 임의 6개 선정
pca=PCA(n_components=6)
df_pca = pca.fit_transform(df_scaled)
scores_pca = cross_val_score(rcf,df_pca, y_train, scoring='accuracy', cv=3)
print('CV3인 경우 PCA변환된 개별 fold 세트별 정확도 : ', scores_pca)
print('PCA 변환 데이터 세트 평균 정확도 : {0:.4f}'.format(np.mean(scores_pca)))

CV3인 경우 PCA변환된 개별 fold 세트별 정확도 :  [0.64587394 0.64872501 0.66316174]
PCA 변환 데이터 세트 평균 정확도 : 0.6526


In [17]:
#LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
#원본스케일링
scaler = StandardScaler()
df_scaled=scaler.fit_transform(X_train)

#컴포넌트 임의 6개 선정
lda=LinearDiscriminantAnalysis(n_components=2)
lda.fit(df_scaled, train.label)
df_lda=lda.transform(df_scaled)
scores_lda = cross_val_score(rcf, df_lda, y_train, scoring='accuracy', cv=3)
print('CV3인 경우 lda변환된 개별 fold 세트별 정확도 : ', scores_lda)
print('lda 변환 데이터 세트 평균 정확도 : {0:.4f}'.format(np.mean(scores_lda)))



CV3인 경우 lda변환된 개별 fold 세트별 정확도 :  [0.05183741 0.05201391 0.05213582]
lda 변환 데이터 세트 평균 정확도 : 0.0520


In [23]:
y_train.shape

(41350,)

In [1]:
X_train.shape

NameError: name 'X_train' is not defined

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=2) # 2개의 에이겐 쌍을 선택
X_train_lda = lda.fit_transform(X_train_std, y_train)
X_test_lda = lda.fit_transform(X_test, y_train)

from sklearn.linear_model import LogisticRegression # 로지스틱 분류로 성능 테스트
lr = LogisticRegression()
lr = lr.fit(X_train_lda, y_train)

lr.predict_proba(X_test_lda[0,:]) #해당 분류에 속할 확률값으로 결과 도출
y_pred_lr=lr.predict(X_test_lda) #예측한 분류값을 보고싶을 때

from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_lr))