In [1]:
from __future__ import print_function

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

### Random forest on the PCA-transformed data

In [2]:
# Load the PCA feature table (principal components plus the `error` label) from disk.
df = pd.read_csv("./dataset/gen/pca_df.csv")

In [3]:
# Display the loaded frame to check its columns and row count.
df

Unnamed: 0.1,Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,error
0,0,-6.920456,-0.309658,0.620377,-0.173485,-0.049637,-0.827045,-0.755716,-0.183726,-0.067738,0.027902,0.004310,0
1,1,-4.784892,12.362206,-0.183395,-3.429440,-4.420019,-12.117411,11.793932,-2.896361,3.265171,-9.407983,0.229890,0
2,2,-5.282790,13.396142,-1.747867,-0.250519,0.295667,-0.676533,-1.421287,-0.213848,0.038796,0.191568,0.345221,0
3,3,-4.853765,13.562955,-1.445542,-0.543876,0.085141,-1.036834,-1.114040,-0.479092,0.068443,-0.079418,0.142066,0
4,4,-4.863956,14.065084,-1.475915,-0.531324,0.383049,-0.455790,-1.290119,1.001590,-0.026792,0.236988,-0.160453,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32043,32043,-3.684435,-0.846090,-0.169298,0.188369,-0.500481,0.441103,0.122972,-0.031639,0.016224,-0.229439,0.349628,0
32044,32044,-3.783528,-0.937229,-0.223218,0.166674,-0.471510,0.534247,0.175026,0.012468,-0.090849,-0.019993,0.569483,0
32045,32045,-3.789268,-0.897574,-0.105577,-0.057202,-0.465623,0.547763,0.263150,0.132393,-0.168208,-0.158670,0.112960,0
32046,32046,-3.655781,-0.775284,-0.104331,0.051790,-0.468439,0.487717,0.164065,0.042832,-0.108706,-0.319912,-0.040782,0


In [4]:
# Separate the target label from the predictors (columns PC1 through PC11).
y = df['error']
X = df[['PC{}'.format(i) for i in range(1, 12)]]

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Hyperparameter grid explored exhaustively by the search below.
params = dict(
    n_estimators=[10, 100, 110, 120],
    max_depth=[6, 8, 9, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Create the random forest, then tune it with 5-fold GridSearchCV on X_train / y_train.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=221)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, n_jobs=-1, cv=5)
grid_cv.fit(X_train, y_train)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

In [None]:
# The depth is chosen from the best-scoring setting here; max_depth=9 scored highest.

In [None]:

# Sweep max_depth with 5-fold cross-validation on the training split only, so the
# held-out test rows never take part in choosing the depth.
cv = KFold(n_splits=5)  # more folds -> longer runtime
max_attributes = X_train.shape[1]
depth_range = range(1, max_attributes)  # depths 1 .. n_features - 1, same range as before

accuracies = []
for depth in depth_range:
    fold_accuracy = []
    # Fixed seed so the sweep gives the same table on every run.
    rand_clf = RandomForestClassifier(max_depth=depth, random_state=42)
    for train_idx, valid_idx in cv.split(X_train):
        # Fold-local names on purpose: rebinding X_train / y_train here would
        # silently replace the train/test split that the later cells fit on
        # (and the replacement would overlap X_test).
        X_fold_train = X_train.iloc[train_idx]
        y_fold_train = y_train.iloc[train_idx]
        X_fold_valid = X_train.iloc[valid_idx]
        y_fold_valid = y_train.iloc[valid_idx]

        rand_clf.fit(X_fold_train, y_fold_train)
        fold_accuracy.append(rand_clf.score(X_fold_valid, y_fold_valid))
    accuracies.append(sum(fold_accuracy) / len(fold_accuracy))

# Keep the summary in its own frame rather than overwriting the loaded dataset `df`,
# so the feature/target cell can still be re-run afterwards.
depth_results = pd.DataFrame({"Max Depth": depth_range, "Average Accuracy": accuracies})
print(depth_results.to_string(index=False))

In [None]:
### Tried n_estimators values other than the default; 120 gave the highest accuracy

In [None]:
# Compare forest sizes at max_depth=9, labelling every result with its n_estimators
# so the printed scores can be told apart.
# NOTE(review): the test-set score is printed for each candidate; if it is used to pick
# n_estimators, the final test accuracy will be optimistic — consider a validation split.
for n_trees in (110, 120, 130, 140):
    rand_clf = RandomForestClassifier(
        n_estimators=n_trees,
        criterion='entropy',
        bootstrap=True,
        random_state=42,
        max_depth=9,
    )
    rand_clf.fit(X_train, y_train)
    print('n_estimators = {}'.format(n_trees))
    print('훈련세트 정확도: {:.3f}' .format(rand_clf.score(X_train, y_train)))
    print('테스트세트 정확도: {:.3f}' .format(rand_clf.score(X_test, y_test)))


In [None]:
# Predict test-set labels with whichever model `rand_clf` currently refers to
# (when the cells run top to bottom, that is the last model fitted in the loop above).
y_pred = rand_clf.predict(X_test)
y_pred

In [None]:
# Proceed with n_estimators=120

In [None]:
# Final model: 120 entropy-criterion trees, depth capped at 9, fixed seed.
rand_clf = RandomForestClassifier(
    n_estimators=120,
    max_depth=9,
    criterion='entropy',
    bootstrap=True,
    random_state=42,
)
rand_clf.fit(X_train, y_train)
y_pred = rand_clf.predict(X_test)

# Report accuracy on the training split first, then on the held-out test split.
for template, features, labels in (
    ('훈련세트 정확도: {:.3f}', X_train, y_train),
    ('테스트세트 정확도: {:.3f}', X_test, y_test),
):
    print(template.format(rand_clf.score(features, labels)))


In [None]:
from sklearn import metrics

# Score the test-set predictions with accuracy, precision, recall and F1.
scorers = (
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
    metrics.f1_score,
)
accuracy, precision, recall, f1 = (scorer(y_test, y_pred) for scorer in scorers)

# Print each score next to its label, in the same order as computed.
labels = ("정확도:", "정밀도:", "재현율:", "f1 점수:")
for label, value in zip(labels, (accuracy, precision, recall, f1)):
    print(label, value)

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Per-class precision / recall / F1 summary.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Extract the per-feature importances from the fitted forest.
importances = rand_clf.feature_importances_

# Print each feature name next to its importance.
for name, value in zip(X.columns, importances):
    print('{0} : {1:.3f}'.format(name, value))

# Horizontal bar chart of importance per feature column.
# Uses matplotlib.pyplot (`plt`) through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=X.columns, ax=ax)
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
ax.set_title('Feature Importance')
plt.show()


In [None]:
# Hyperparameter grid (same values as the earlier grid search in this notebook).
# The max_depth list previously read `[6, 8,,9 10, 12]`, which is a SyntaxError.
params = {'n_estimators' : [10, 100, 110, 120],
          'max_depth' : [6, 8, 9, 10, 12],
          'min_samples_leaf' : [8, 12, 18],
          'min_samples_split' : [8, 16, 20]
          }

# Create the RandomForestClassifier, then run a 5-fold GridSearchCV over the grid.
rf_clf = RandomForestClassifier(random_state = 221, n_jobs = -1)
grid_cv = GridSearchCV(rf_clf,
                       param_grid = params,
                       cv = 5,
                       n_jobs = -1)
grid_cv.fit(X_train, y_train)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))
