In [20]:
import os
import numpy as np
import pandas as pd

#0,1問題は分類に当たる
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler 
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.cluster import KMeans

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve

import matplotlib.pyplot as plt
%matplotlib inline

■■■■■　関数　■■■■■

In [21]:
def SaveFig(fig_id, tight_layout=True, fig_extension="png", resolution=600):
   
    images_path = os.path.join("..", "Images")
    os.makedirs(images_path, exist_ok=True)
    file_name = os.path.join(images_path, fig_id + "." + fig_extension)
    
    #print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(file_name, format=fig_extension, dpi=resolution)
    plt.close()

In [22]:
def GetConcatSubAssyFiles(r_path):
    files = os.listdir(r_path)
    csv_files = []
    
    for file in files:
        if(file.find(".csv") != -1 and file.upper().find("CONCAT_SA") != -1):
            csv_files.append(file)
            
    return csv_files

In [23]:
def GetConcatAssyFiles(r_path):
    files = os.listdir(r_path)
    csv_files = []
    
    for file in files:
        if (file.find(".csv") != -1 and file.upper().startswith("CONCAT_ASSY") != -1):
            csv_files.append(file)
            
    return csv_files

In [24]:
def UnderSampling(X,num,label) :

    # KMeansによるクラスタリング
    km = KMeans(random_state=42)
    km.fit(X,Y)
    X["Cluster"] = km.predict(X)

    # 群別の構成比を少数派の件数に乗じて群別の抽出件数を計算
    count_sum = X.groupby("Cluster").count().iloc[0:,0].as_matrix()
    ratio = count_sum / count_sum.sum()
    samp_num = np.round(ratio * num,0).astype(np.int32)

    # 群別にサンプリング処理を実施
    for i in np.arange(8) :
        tmp = X[X["Cluster"]==i]
        if i == 0 :
            tmp1 = X.sample(samp_num[i],replace=True)
        else :
            tmp2 = X.sample(samp_num[i],replace=True)
            tmp1 = pd.concat([tmp1,tmp2])
    tmp1["Class"] = label
    
    return tmp1

In [25]:
def ResetIndex(df):
    df = df.reset_index()
    df = df.drop("index", axis=1)
    
    return df

In [26]:
############
# ベタ書き #
############
def Main(csv_file):
    
    #ランダムサンプル数 50000だと死んだ
    n_samples = 20000
    
    #列名定義
    num_cols = ["ti_%s_to_%s[s]" %(i, i+1) for i in np.arange(11,16)] + ["cure_time[s]"]
    cat_cols = ["storage_loc_14", "sub1"]
    #cat_cols = ["sa_lot", "storage_loc_14", "sub1", "film_lot"]
    obj_cols = ["defective_cat_16"]
    
    #工程No定義
    pro_no = np.arange(11, 17)
    
    #結果格納系定義
    result = pd.DataFrame()
    clf_method = ["SGD", "Kernel", "LinearSVC", "KNeighbor", "SVC", "RandomForest"]
    conf_cols  = ["TN", "FP", "FN", "TP"]
    method, precision, recall, f1, conf = [], [], [], [], []
    
    #データの型設定→メモリ容量削減のため
    dtypes = {"sa_lot": "int32", "sa_seri": "int32", "storage_loc_14": "int16", "defective_cat_16": "int8"}
    
    #IntervalCSV読み込み
    #df = pd.read_csv(os.path.join(r_path, csv_file))
    reader = pd.read_csv(os.path.join(r_path, csv_file), dtype=dtypes, low_memory=False, chunksize=2000)
    df = pd.concat((r for r in reader), ignore_index=True)
    
    for (i, j) in zip(pro_no[:-1], pro_no[1:]):
        df["ti_%s_to_%s[s]" %(i, j)] = df["ti_%s_to_%s[s]" %(i, j)].astype("int32")
    df["cure_time[s]"] = df["cure_time[s]"].astype("int32")

    #OK:0, NG:31のデータ抽出 queryは新しい手法みたい．外海さん正解！
    df = df.query("defective_cat_16 == 0 or defective_cat_16 == 31") 
    df.loc[df["defective_cat_16"]==31, "defective_cat_16"] = 1
    df = df.dropna(subset=["sub1"])
    df = df.sample(n=n_samples)
    df = ResetIndex(df)
    #df = df.reset_index()
    #df = df.drop("index", axis=1)
    
    #学習データ，検証データに分割
    train_set, test_set = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

    #説明変数，目的変数に分割
    X_num_train = train_set[num_cols]
    X_cat_train = train_set[cat_cols]
    X_num_test  = test_set[num_cols]
    X_cat_test  = test_set[cat_cols]
    y_train     = train_set[obj_cols[0]].copy()
    y_test      = test_set[obj_cols[0]].copy() 
    
    #スケーリング後の再columns設定のため抜き出し
    sorted_num_cols = X_num_train.columns
    
    #数値データをスケーリング
    X_num_train = DataScaling(X_num_train)
    X_num_test  = DataScaling(X_num_test)
    
    #スケーリングはndarrayで返されるので，Concat様にcolumnsを付ける
    X_num_train = pd.DataFrame(X_num_train, index=None, columns=sorted_num_cols)
    X_num_test = pd.DataFrame(X_num_test, index=None, columns=sorted_num_cols)
    
    #カテゴリデータをone-hot-encode
    X_cat_train = pd.get_dummies(X_cat_train, columns=cat_cols)
    X_cat_test  = pd.get_dummies(X_cat_test, columns=cat_cols)
    
    #indexを再度
    X_cat_train = ResetIndex(X_cat_train)
    X_cat_test  = ResetIndex(X_cat_test) 
    
    #数値とカテゴリ値を結合
    X_train = pd.concat([X_num_train, X_cat_train], axis=1)
    X_test  = pd.concat([X_num_test, X_cat_test], axis=1)

    #分類器毎の実力
    for flg in clf_method:
    
        #分類器選択
        if flg == "SGD":
            clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)            
        elif flg == "Kernel":
            rbf = RBFSampler(gamma=1, n_components=100, random_state=42)
            X_rbf = rbf.fit_transform(X_train)
            clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)            
        elif flg == "LinearSVC":
            clf = LinearSVC(C=1.0, class_weight="balanced", random_state=42)            
        elif flg == "KNeighbor":
            clf = KNeighborsClassifier(n_neighbors=5, weights="distance")        
        #elif flg == "SVC":
            #clf = SVC(kernel="rbf", C=1.0, class_weight="balanced", random_state=42)        
        elif flg == "RandomForest":
            clf = RandomForestClassifier(criterion="entropy", n_estimators=10, random_state=42, class_weight="balanced")
        else:
            print("No such a method")

        #学習&評価
        #trainセットバージョン
        print(flg, "is being learned")
        if flg == "Kernel":
            clf.fit(X_rbf, y_train.ravel())
            y_train_predict = clf.predict(rbf.fit_transform(X_train))
        else:
            clf.fit(X_train, y_train.ravel())
            y_train_predict = clf.predict(X_train)
        
        #適合率と再現率のプロット
        #CalcPrecisionRecall_vs_Threshold(clf, X_train, y_train)

        method.append(flg)
        precision.append(precision_score(y_train, y_train_predict))
        recall.append(recall_score(y_train, y_train_predict))
        f1.append(f1_score(y_train, y_train_predict))
        conf.append(confusion_matrix(y_train, y_train_predict).ravel())
        
        """
        #学習&評価
        #testセットバージョン
        print(flg, "is being learned")
        if flg == "Kernel":
            clf.fit(X_rbf, y_train)
            y_test_predict = clf.predict(rbf.fit_transform(X_test))
        else:
            clf.fit(X_train, y_train)
            y_test_predict = clf.predict(X_test)
        
        #適合率と再現率のプロット
        #CalcPrecisionRecall_vs_Threshold(clf, X_train, y_train)

        method.append(flg)
        precision.append(precision_score(y_test, y_test_predict))
        recall.append(recall_score(y_test, y_test_predict))
        f1.append(f1_score(y_test, y_test_predict))
        conf.append(confusion_matrix(y_test, y_test_predict).ravel())
        """
        
        print(flg, "is done")

    #書き出し
    confusion = pd.DataFrame(conf, columns = conf_cols)
    result["method"] = method
    result["precision"] = precision
    result["recall"] = recall
    result["f1"] = f1
    
    result = pd.concat([result, confusion], axis=1)
    
    result.to_csv(os.path.join(w_path, "result_no_sa_lot_"+csv_file), index=False)

In [42]:
def GetRFCFeatures(csv_file):
    
    #ランダムサンプル数 50000だと死んだ
    n_samples = 20000
    
    #列名定義
    num_cols = ["ti_%s_to_%s[s]" %(i, i+1) for i in np.arange(11,16)] + ["cure_time[s]"]
    cat_cols = ["storage_loc_14", "sub1"]
    #cat_cols = ["sa_lot", "storage_loc_14", "sub1", "film_lot"]
    obj_cols = ["defective_cat_16"]
    
    #工程No定義
    pro_no = np.arange(11, 17)
    
    #結果格納系定義
    result = pd.DataFrame()
    clf_method = ["SGD", "Kernel", "LinearSVC", "KNeighbor", "SVC", "RandomForest"]
    conf_cols  = ["TN", "FP", "FN", "TP"]
    method, precision, recall, f1, conf = [], [], [], [], []
    
    #データの型設定→メモリ容量削減のため
    dtypes = {"sa_lot": "int32", "sa_seri": "int32", "storage_loc_14": "int16", "defective_cat_16": "int8"}
    
    #IntervalCSV読み込み
    #df = pd.read_csv(os.path.join(r_path, csv_file))
    reader = pd.read_csv(os.path.join(r_path, csv_file), dtype=dtypes, low_memory=False, chunksize=2000)
    df = pd.concat((r for r in reader), ignore_index=True)
    
    for (i, j) in zip(pro_no[:-1], pro_no[1:]):
        df["ti_%s_to_%s[s]" %(i, j)] = df["ti_%s_to_%s[s]" %(i, j)].astype("int32")
    df["cure_time[s]"] = df["cure_time[s]"].astype("int32")

    #OK:0, NG:31のデータ抽出 queryは新しい手法みたい．外海さん正解！
    df = df.query("defective_cat_16 == 0 or defective_cat_16 == 31") 
    df.loc[df["defective_cat_16"]==31, "defective_cat_16"] = 1
    df = df.dropna(subset=["sub1"])
    df = df.sample(n=n_samples)
    df = ResetIndex(df)
    #df = df.reset_index()
    #df = df.drop("index", axis=1)
    
    #学習データ，検証データに分割
    train_set, test_set = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

    #説明変数，目的変数に分割
    X_num_train = train_set[num_cols]
    X_cat_train = train_set[cat_cols]
    X_num_test  = test_set[num_cols]
    X_cat_test  = test_set[cat_cols]
    y_train     = train_set[obj_cols[0]].copy()
    y_test      = test_set[obj_cols[0]].copy() 
    
    #スケーリング後の再columns設定のため抜き出し
    sorted_num_cols = X_num_train.columns
    
    #数値データをスケーリング
    X_num_train = DataScaling(X_num_train)
    X_num_test  = DataScaling(X_num_test)
    
    #スケーリングはndarrayで返されるので，Concat様にcolumnsを付ける
    X_num_train = pd.DataFrame(X_num_train, index=None, columns=sorted_num_cols)
    X_num_test = pd.DataFrame(X_num_test, index=None, columns=sorted_num_cols)
    
    #カテゴリデータをone-hot-encode
    X_cat_train = pd.get_dummies(X_cat_train, columns=cat_cols)
    X_cat_test  = pd.get_dummies(X_cat_test, columns=cat_cols)
    
    #indexを再度
    X_cat_train = ResetIndex(X_cat_train)
    X_cat_test  = ResetIndex(X_cat_test) 
    
    #数値とカテゴリ値を結合
    X_train = pd.concat([X_num_train, X_cat_train], axis=1)
    X_test  = pd.concat([X_num_test, X_cat_test], axis=1)
    
    #列名取得
    exp_col = X_train.columns
    
    #RaodomForestの学習
    clf = RandomForestClassifier(criterion="entropy", n_estimators=10, random_state=42, class_weight="balanced")
    clf.fit(X_train, y_train.ravel())
    y_train_predict = clf.predict(X_train)

    #特徴量の重要度
    feature = clf.feature_importances_

    #特徴量の重要度順
    #indices = np.argsort(feature)[::-1]
    indices = np.argsort(feature)

    #書き出し
    result = pd.DataFrame({"Name":exp_col, "Feature":feature}).sort_values(by="Feature", ascending=False)
    result = result[["Name", "Feature"]]
    #result.to_csv(os.path.join(w_path, "feature_without_salot_"+csv_file), index=False)

    n = 10
    plt.title("Feature Importance")
    plt.barh(range(n),feature[indices[-n:]], color="gray", align="center")
    plt.yticks(range(n), exp_col[indices[-n:]], rotation=0, fontsize=7)
    plt.ylim([-1, n])
    #plt.barh(range(len(feature)),feature[indices], color="gray", align="center")
    #plt.yticks(range(len(feature)), exp_col[indices], rotation=0, fontsize=7)
    #plt.ylim([-1, len(feature)])
    plt.tight_layout()
    SaveFig(csv_file[:-4])
    plt.show()

In [27]:
csv_files = GetConcatSubAssyFiles(r_path)
Main(csv_files[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


SGD is being learned
SGD is done
Kernel is being learned


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Kernel is done
LinearSVC is being learned
LinearSVC is done
KNeighbor is being learned
KNeighbor is done
No such a method
SVC is being learned
SVC is done
RandomForest is being learned
RandomForest is done


In [43]:
GetRFCFeatures(csv_files[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [11]:
def CalcMethodComparison(csv_file):
    
    #定義
    result = pd.DataFrame()
    clf_method = ["SGD", "Kernel", "LinearSVC", "KNeighbor", "SVC", "RandomForest"]
    conf_cols  = ["TN", "FP", "FN", "TP"]
    method, precision, recall, f1, conf = [], [], [], [], []
    
    #データ準備
    df = PrepareFittingData(csv_file)
    
    #学習データ，検証データに分類
    train_set, test_set = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

    #目的変数設定
    #obj_cols = ["defective_cat_16"] #subassy
    obj_cols = ["blot_area%s_33" %r for r in ["A", "B"]]
    
    X_train = train_set.drop(columns=obj_cols)
    X_test  = test_set.drop(columns=obj_cols)
    y_train = train_set[obj_cols[0]].copy()
    y_test  = test_set[obj_cols[0]].copy()
    exp_col = X_train.columns
    
    #説明変数の標準化
    X_train = DataScaling(X_train)
    X_test  = DataScaling(X_test)
    
    #分類器毎の実力
    for flg in clf_method:
    
        #分類器選択
        if flg == "SGD":
            clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)            
        elif flg == "Kernel":
            rbf = RBFSampler(gamma=1, n_components=100, random_state=42)
            X_rbf = rbf.fit_transform(X_train)
            clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)            
        elif flg == "LinearSVC":
            clf = LinearSVC(C=1.0, class_weight="balanced", random_state=42)            
        elif flg == "KNeighbor":
            clf = KNeighborsClassifier(n_neighbors=5, weights="distance")        
        elif flg == "SVC":
            clf = SVC(kernel="rbf", C=1.0, class_weight="balanced", random_state=42)        
        elif flg == "RandomForest":
            clf = RandomForestClassifier(criterion="entropy", n_estimators=10, random_state=42, class_weight="balanced")
        else:
            print("No such a method")

        #学習&評価
        if flg == "Kernel":
            clf.fit(X_rbf, y_train)
            y_test_predict = clf.predict(rbf.fit_transform(X_test))
        else:
            clf.fit(X_train, y_train)
            y_test_predict = clf.predict(X_test)
        
        #適合率と再現率のプロット
        #CalcPrecisionRecall_vs_Threshold(clf, X_train, y_train)

        method.append(flg)
        precision.append(precision_score(y_test, y_test_predict))
        recall.append(recall_score(y_test, y_test_predict))
        f1.append(f1_score(y_test, y_test_predict))
        conf.append(confusion_matrix(y_test, y_test_predict).ravel())

    #書き出し
    confusion = pd.DataFrame(conf, columns = conf_cols)
    result["method"] = method
    result["precision"] = precision
    result["recall"] = recall
    result["f1"] = f1
    
    result = pd.concat([result, confusion], axis=1)
    
    result.to_csv(os.path.join(w_path, "result_"+csv_file), index=False)

    """
    #モデル評価
    y_test_predict = reg.predict(X_test)
    y_test_predict = cross_val_predict(clf, X_train, y_train, cv=10, method="predict_proba") #RandomForest"の場合
    """

In [12]:
def DataScaling(X):
    scaler = StandardScaler() #scaler = MinMaxScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    
    return X

In [13]:
def CalcPrecisionRecall_vs_Threshold(clf, X, y):
    y_scores = cross_val_predict(clf, X, y, cv=10, method="decision_function")
    precisions, recalls, thresholds = precision_recall_curve(y, y_scores)
    PlotPrecisionRecall_vs_Threshold(precisions, recalls, thresholds)
    SaveFig(flg)

In [14]:
def PlotPrecisionRecall_vs_Threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g--", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="center left")
    plt.ylim([0, 1])

In [15]:
def CrossValidation(reg, X, y, columns):
    y_predict = reg.predict(X)
    reg_mse = mean_squared_error(y, y_predict)
    reg_rmse = np.sqrt(reg_mse)
    reg_scores = cross_val_score(reg, X, y, scoring="neg_mean_squared_error", cv=10)
    result = np.sqrt(-reg_scores)

    f1 = pd.DataFrame({"Name":columns, "Coefficients":reg.coef_}).sort_values(by='Coefficients', ascending=False)
    f2 = f1.loc[:, ["Name", "Coefficients"]]
    
    f2.to_csv(os.path.join(w_folder, csv_file), index=False)

    return result

In [17]:
def PlotHistgramInterval(csv_file):
    
    #IntervalCSV読み込み
    df = pd.read_csv(os.path.join(r_path, csv_file))
    
    #OK, NGデータ抽出
    df_okay = df[df["defective_cat_16"]==0]
    df_blob = df[df["defective_cat_16"]==31]
    
    df_okay = df_okay.dropna()
    df_blob = df_blob.dropna()
    
    #xmax = max([df_okay["cure_time[s]"].max(), df_blob["cure_time[s]"].max()]) + 100
    plot_cols = ["ti_14_15[s]", "cure_time[s]", "ti_13_14[s]", "ti_15_16[s]", "ti_12_13[s]", "ti_11_12[s]"]
    #plot_cols = ["ti_14_15[s]", "ti_14_16[s]", "ti_13_15[s]", "ti_15_16[s]", "ti_12_15[s]"]
    
    #NGcsvファイル全体をプロット
    #1画像辺りのサブプロット数
    plt_row = 6
    plt_col = 1
    
    #figNo設定
    i_mat = 0
    i_fig = 0
    
    plt.figure(figsize=(10,8))
    
    for col in plot_cols:
        i_mat += 1
        
        plt.subplot(plt_row, plt_col, i_mat)
        
        if not df_okay.empty:
            plt.hist(df_okay[col].values, bins=50, alpha=0.3, histtype="stepfilled", color="b", label="OK")
        if not df_blob.empty:
            plt.hist(df_blob[col].values, bins=50, alpha=0.3, histtype="stepfilled", color="r", label="NG")
        
        plt.title(col)
        plt.tick_params(labelsize=8)
        plt.ylim([0, 2000])
        plt.legend()
        plt.tight_layout()
    
    SaveFig("test")

■■■■■　MainProgram　■■■■■

In [18]:
#定義
r_path = os.path.join("..", "AssemblyData", "Extract")
w_path = os.path.join("..", "AssemblyData_Analysis")

In [30]:
#データファイル取得
#csv_files = GetIntervalCSVFiles(r_path)
csv_files = GetConcatSACSVFiles(r_path)

In [91]:
CalcMethodComparison(csv_files[1])

In [92]:
GetRFCFeatures(csv_files[1])

In [31]:
#ヒストグラム
for csv_file in csv_files:
    PlotHistgramInterval(csv_file)

In [44]:
#Assyインターバル学習
csv_files = GetConcatAssyFiles(r_path)
for csv_file in csv_files:
    CalcMethodComparison(csv_file)

MemoryError: 

■■■■■　確認　■■■■■

■■■■■　バックアップ　■■■■■

#工程間　単体
def PrepareFittingData(csv_file):
    
    #IntervalCSV読み込み
    df = pd.read_csv(os.path.join(r_path, csv_file))
    
    #OK:0, NG:31のデータ抽出 query は　boolインデックスより新しいみたい
    df = df.query("defective_cat_16 == 0 or defective_cat_16 == 31") 
    
    #X, Yとなるデータの抽出
    cols = ["ti_%s_%s[s]" %(i, i+1) for i in np.arange(11,16)]
    cols.extend(["cure_time[s]", "defective_cat_16"])
    df = df[cols]
    
    #NG:31を1に変換
    df.loc[df["defective_cat_16"]==31, "defective_cat_16"] = 1
    
    df = df.dropna()
    df = df.reset_index()
    df = df.drop("index", axis=1)

    return df

#工程間　相互
def PrepareFittingData(csv_file):
    
    #IntervalCSV読み込み
    df = pd.read_csv(os.path.join(r_path, csv_file))
    
    #OK:0, NG:31のデータ抽出 query は　boolインデックスより新しいみたい
    df = df.query("defective_cat_16 == 0 or defective_cat_16 == 31") 
    
    #X, Yとなるデータの抽出
    cols = df.columns[df.columns.str.startswith("ti_")].tolist()
    cols.extend(["cure_time[s]", "defective_cat_16"])
    df = df[cols]
    
    #NG:31を1に変換
    df.loc[df["defective_cat_16"]==31, "defective_cat_16"] = 1
    
    df = df.dropna()
    df = df.reset_index()
    df = df.drop("index", axis=1)
    
    return df

def GetRFCFeatures(csv_file):
    
    #データ準備
    df = PrepareFittingData(csv_file)
    
    #学習データ，検証データに分類
    train_set, test_set = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

    #目的変数設定
    obj_cols = ["defective_cat_16"]
    
    X_train = train_set.drop(columns=obj_cols)
    X_test  = test_set.drop(columns=obj_cols)
    y_train = train_set[obj_cols[0]].copy()
    y_test  = test_set[obj_cols[0]].copy()
    exp_col = X_train.columns
    
    #説明変数の標準化
    X_train = DataScaling(X_train)
    X_test  = DataScaling(X_test)
    
    #RaodomForestの学習
    clf = RandomForestClassifier(criterion="entropy", n_estimators=10, random_state=42, class_weight="balanced")
    clf.fit(X_train, y_train)
    y_test_predict = clf.predict(X_test)

    #特徴量の重要度
    feature = clf.feature_importances_

    #特徴量の重要度順
    #indices = np.argsort(feature)[::-1]
    indices = np.argsort(feature)

    #書き出し
    result = pd.DataFrame({"Name":exp_col, "Feature":feature}).sort_values(by="Feature", ascending=False)
    result = result[["Name", "Feature"]]
    result.to_csv(os.path.join(w_path, "feature_"+csv_file), index=False)

    plt.title("Feature Importance")
    plt.barh(range(len(feature)),feature[indices], color="gray", align="center")
    plt.yticks(range(len(feature)), exp_col[indices], rotation=0)
    plt.ylim([-1, len(feature)])
    plt.tight_layout()
    SaveFig(csv_file[:-4])
    plt.show()