In [1]:
import os
import numpy as np
import pandas as pd

#0,1問題は分類に当たる
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler 
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.cluster import KMeans

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve

import matplotlib.pyplot as plt
%matplotlib inline

■■■■■　関数　■■■■■

In [2]:
def SaveFig(fig_id, tight_layout=True, fig_extension="png", resolution=600):
   
    images_path = os.path.join("..", "Images")
    os.makedirs(images_path, exist_ok=True)
    file_name = os.path.join(images_path, fig_id + "." + fig_extension)
    
    #print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(file_name, format=fig_extension, dpi=resolution)
    plt.close()

In [3]:
def GetIntervalCSVFiles(r_path):
    files = os.listdir(r_path)
    csv_files = []
    
    for file in files:
        if(file.find(".csv") != -1 and file.upper().find("INTERVAL") != -1):
             csv_files.append(file)
    
    return csv_files

In [4]:
def GetConcatSubAssyFiles(r_path):
    files = os.listdir(r_path)
    csv_files = []
    
    for file in files:
        if(file.find(".csv") != -1 and file.upper().startswith("CONCAT") and file.find("SA") != -1):
            csv_files.append(file)
            
    return csv_files

In [5]:
def GetConcatAssyWithGelFiles(r_path):
    files = os.listdir(r_path)
    csv_files = []
    
    for file in files:
        if(file.find(".csv") != -1 and file.upper().startswith("CONCAT") and file.find("WithGel") != -1):
            csv_files.append(file)
            
    return csv_files

In [6]:
def GetConcatAssyFiles(r_path):
    files = os.listdir(r_path)
    csv_files = []
    
    for file in files:
        if (file.find(".csv") != -1 and file.upper().startswith("CONCAT") and file.find("Assy") != -1):
            csv_files.append(file)
            
    return csv_files

In [39]:
def UnderSampling(df,num,label) :

    #label = 少数派のラベル, num = ターゲット件数
    
    time_cols = ["ti_%s_to_%s[s]" %(i, i+1) for i in np.arange(32,37)]
    proc_cols = ["base_height_L_a_32", "base_height_R_a_32", "gel_plunger_stroke_a_35", "base_height_a_35", "gel_thick_b4_a_35"]
    eval_cols = time_cols + proc_cols
    obj_cols  = ["defective_cat_37"]
    
    X = df[eval_cols]
    Y = df[obj_cols]
    
    # KMeansによるクラスタリング
    km = KMeans(random_state=42)
    km.fit(X,Y)
    X["Cluster"] = km.predict(X)

    # 群別の構成比を少数派の件数に乗じて群別の抽出件数を計算
    count_sum = X.groupby("Cluster").count().iloc[0:,0].as_matrix()
    ratio = count_sum / count_sum.sum()
    samp_num = np.round(ratio * num,0).astype(np.int32)

    # 群別にサンプリング処理を実施
    for i in np.arange(8) :
        tmp = X[X["Cluster"]==i]
        if i == 0 :
            tmp1 = X.sample(samp_num[i],replace=True)
        else :
            tmp2 = X.sample(samp_num[i],replace=True)
            tmp1 = pd.concat([tmp1,tmp2])
    tmp1["Class"] = label
    
    return tmp1

#工程間　単体
def PrepareFittingData(csv_file):
    
    #IntervalCSV読み込み
    df = pd.read_csv(os.path.join(r_path, csv_file))
    
    #OK:0, NG:31のデータ抽出 query は　boolインデックスより新しいみたい
    df = df.query("defective_cat_16 == 0 or defective_cat_16 == 31") 
    
    #X, Yとなるデータの抽出
    cols = ["ti_%s_to_%s[s]" %(i, i+1) for i in np.arange(11,16)]
    cols.extend(["cure_time[s]", "defective_cat_16"])
    df = df[cols]
    
    #NG:31を1に変換
    df.loc[df["defective_cat_16"]==31, "defective_cat_16"] = 1
    
    df = df.dropna()
    df = df.reset_index()
    df = df.drop("index", axis=1)

    return df

In [37]:
#ゲル気泡　数値データのみ
def PrepareFittingData(csv_file):
    
    #IntervalCSV読み込み
    df = pd.read_csv(os.path.join(r_path, csv_file))
    
    #OK:0, NG:31のデータ抽出 query は　boolインデックスより新しいみたい
    df = df.query("defective_cat_37 == 0 or defective_cat_37 == 86") 
    
    #X, Yとなるデータの抽出
    time_cols = ["ti_%s_to_%s[s]" %(i, i+1) for i in np.arange(32,37)]
    proc_cols = ["base_height_L_a_32", "base_height_R_a_32", "gel_plunger_stroke_a_35", "base_height_a_35", "gel_thick_b4_a_35", "defective_cat_37"]
    eval_cols = time_cols + proc_cols
    
    df = df[eval_cols]
    
    #NG:31を1に変換
    df.loc[df["defective_cat_37"]==86, "defective_cat_37"] = 1
    
    df = df.dropna()
    df = df.reset_index()
    df = df.drop("index", axis=1)

    return df

def CalcMethodComparison(csv_file):
    
    #定義
    result = pd.DataFrame()
    clf_method = ["SGD", "Kernel", "LinearSVC", "KNeighbor", "SVC", "RandomForest"]
    conf_cols  = ["TN", "FP", "FN", "TP"]
    method, precision, recall, f1, conf = [], [], [], [], []
    
    #データ準備
    df = PrepareFittingData(csv_file)
    
    #データ数のバランス調整
    df = UnderSampling(df, 200, "")
    
    #学習データ，検証データに分類
    train_set, test_set = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

    #目的変数設定
    #AssyとSAで替えること
    obj_cols = ["defective_cat_16"] #subassy
    #obj_cols = ["blot_area%s_33" %r for r in ["A", "B"]]
    
    X_train = train_set.drop(columns=obj_cols)
    X_test  = test_set.drop(columns=obj_cols)
    y_train = train_set[obj_cols[0]].copy()
    y_test  = test_set[obj_cols[0]].copy()
    exp_col = X_train.columns
    
    #説明変数の標準化
    X_train = DataScaling(X_train)
    X_test  = DataScaling(X_test)
    
    #分類器毎の実力
    for flg in clf_method:
    
        #分類器選択
        if flg == "SGD":
            clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)            
        #elif flg == "Kernel":
            #rbf = RBFSampler(gamma=1, n_components=100, random_state=42)
            #X_rbf = rbf.fit_transform(X_train)
            #clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)            
        elif flg == "LinearSVC":
            clf = LinearSVC(C=1.0, class_weight="balanced", random_state=42)            
        elif flg == "KNeighbor":
            clf = KNeighborsClassifier(n_neighbors=5, weights="distance")        
        #elif flg == "SVC":
            #clf = SVC(kernel="rbf", C=1.0, class_weight="balanced", random_state=42)        
        elif flg == "RandomForest":
            clf = RandomForestClassifier(criterion="entropy", n_estimators=10, random_state=42, class_weight="balanced")
        else:
            print("No such a method")
        """
        #学習&評価
        if flg == "Kernel":
            pass
            #clf.fit(X_rbf, y_train)
            #y_train_predict = clf.predict(rbf.fit_transform(X_train))
        else:
            clf.fit(X_train, y_train)
            y_train_predict = clf.predict(X_train)
        
        method.append(flg)
        precision.append(precision_score(y_train, y_train_predict))
        recall.append(recall_score(y_train, y_train_predict))
        f1.append(f1_score(y_train, y_train_predict))
        conf.append(confusion_matrix(y_train, y_train_predict).ravel())
        
        print(flg, "is done")
        """

        #学習&評価
        if flg == "Kernel":
            pass
            #clf.fit(X_rbf, y_train)
            #y_test_predict = clf.predict(rbf.fit_transform(X_test))
        else:
            clf.fit(X_train, y_train)
            y_test_predict = clf.predict(X_test)
        
        #適合率と再現率のプロット
        #CalcPrecisionRecall_vs_Threshold(clf, X_train, y_train)

        method.append(flg)
        precision.append(precision_score(y_test, y_test_predict))
        recall.append(recall_score(y_test, y_test_predict))
        f1.append(f1_score(y_test, y_test_predict))
        conf.append(confusion_matrix(y_test, y_test_predict).ravel())
        
        print(flg, "is done")


    #書き出し
    confusion = pd.DataFrame(conf, columns = conf_cols)
    result["method"] = method
    result["precision"] = precision
    result["recall"] = recall
    result["f1"] = f1
    
    result = pd.concat([result, confusion], axis=1)
    
    result.to_csv(os.path.join(w_path, "result_OK_" + csv_file), index=False)
    #result.to_csv(os.path.join(w_path, "result_"+csv_file), index=False)

    """
    #モデル評価
    y_test_predict = reg.predict(X_test)
    y_test_predict = cross_val_predict(clf, X_train, y_train, cv=10, method="predict_proba") #RandomForest"の場合
    """

In [46]:
def CalcMethodComparison(csv_file):
    
    #定義
    result = pd.DataFrame()
    clf_method = ["SGD", "Kernel", "LinearSVC", "KNeighbor", "SVC", "RandomForest"]
    conf_cols  = ["TN", "FP", "FN", "TP"]
    method, precision, recall, f1, conf = [], [], [], [], []
    
    #データ準備
    df = PrepareFittingData(csv_file)
    
    #データ数のバランス調整
    #df = UnderSampling(df, 200, "")
    
    #学習データ，検証データに分類
    train_set, test_set = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

    #目的変数設定
    obj_cols = ["defective_cat_37"] #ゲル
    
    X_train = train_set.drop(columns=obj_cols)
    X_test  = test_set.drop(columns=obj_cols)
    y_train = train_set[obj_cols[0]].copy()
    y_test  = test_set[obj_cols[0]].copy()
    exp_col = X_train.columns
    
    #説明変数の標準化
    X_train = DataScaling(X_train)
    X_test  = DataScaling(X_test)
    
    #分類器毎の実力
    for flg in clf_method:
    
        #分類器選択
        if flg == "SGD":
            clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)            
        #elif flg == "Kernel":
            #rbf = RBFSampler(gamma=1, n_components=100, random_state=42)
            #X_rbf = rbf.fit_transform(X_train)
            #clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)            
        elif flg == "LinearSVC":
            clf = LinearSVC(C=1.0, class_weight="balanced", random_state=42)            
        elif flg == "KNeighbor":
            clf = KNeighborsClassifier(n_neighbors=5, weights="distance")        
        #elif flg == "SVC":
            #clf = SVC(kernel="rbf", C=1.0, class_weight="balanced", random_state=42)        
        elif flg == "RandomForest":
            clf = RandomForestClassifier(criterion="entropy", n_estimators=10, random_state=42, class_weight="balanced")
        else:
            print("No such a method")
        """
        #学習&評価
        if flg == "Kernel":
            pass
            #clf.fit(X_rbf, y_train)
            #y_train_predict = clf.predict(rbf.fit_transform(X_train))
        else:
            clf.fit(X_train, y_train)
            y_train_predict = clf.predict(X_train)
        
        method.append(flg)
        precision.append(precision_score(y_train, y_train_predict))
        recall.append(recall_score(y_train, y_train_predict))
        f1.append(f1_score(y_train, y_train_predict))
        conf.append(confusion_matrix(y_train, y_train_predict).ravel())
        
        print(flg, "is done")
        """

        #学習&評価
        if flg == "Kernel":
            pass
            #clf.fit(X_rbf, y_train)
            #y_test_predict = clf.predict(rbf.fit_transform(X_test))
        else:
            clf.fit(X_train, y_train)
            y_test_predict = clf.predict(X_test)
        
        #適合率と再現率のプロット
        #CalcPrecisionRecall_vs_Threshold(clf, X_train, y_train)

        method.append(flg)
        precision.append(precision_score(y_test, y_test_predict))
        recall.append(recall_score(y_test, y_test_predict))
        f1.append(f1_score(y_test, y_test_predict))
        conf.append(confusion_matrix(y_test, y_test_predict).ravel())
        
        print(flg, "is done")


    #書き出し
    confusion = pd.DataFrame(conf, columns = conf_cols)
    result["method"] = method
    result["precision"] = precision
    result["recall"] = recall
    result["f1"] = f1
    
    result = pd.concat([result, confusion], axis=1)
    
    result.to_csv(os.path.join(w_path, "result_" + csv_file), index=False)
    #result.to_csv(os.path.join(w_path, "result_"+csv_file), index=False)

    """
    #モデル評価
    y_test_predict = reg.predict(X_test)
    y_test_predict = cross_val_predict(clf, X_train, y_train, cv=10, method="predict_proba") #RandomForest"の場合
    """

In [24]:
def DataScaling(X):
    scaler = StandardScaler() #scaler = MinMaxScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    
    return X

In [25]:
def CalcPrecisionRecall_vs_Threshold(clf, X, y):
    y_scores = cross_val_predict(clf, X, y, cv=10, method="decision_function")
    precisions, recalls, thresholds = precision_recall_curve(y, y_scores)
    PlotPrecisionRecall_vs_Threshold(precisions, recalls, thresholds)
    SaveFig(flg)

In [26]:
def PlotPrecisionRecall_vs_Threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g--", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="center left")
    plt.ylim([0, 1])

In [27]:
def CrossValidation(reg, X, y, columns):
    y_predict = reg.predict(X)
    reg_mse = mean_squared_error(y, y_predict)
    reg_rmse = np.sqrt(reg_mse)
    reg_scores = cross_val_score(reg, X, y, scoring="neg_mean_squared_error", cv=10)
    result = np.sqrt(-reg_scores)

    f1 = pd.DataFrame({"Name":columns, "Coefficients":reg.coef_}).sort_values(by='Coefficients', ascending=False)
    f2 = f1.loc[:, ["Name", "Coefficients"]]
    
    f2.to_csv(os.path.join(w_folder, csv_file), index=False)

    return result

In [43]:
def GetRFCFeatures(csv_file):
    
    #データ準備
    df = PrepareFittingData(csv_file)
    
    #学習データ，検証データに分類
    train_set, test_set = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

    #目的変数設定
    obj_cols = ["defective_cat_37"]
    
    X_train = train_set.drop(columns=obj_cols)
    X_test  = test_set.drop(columns=obj_cols)
    y_train = train_set[obj_cols[0]].copy()
    y_test  = test_set[obj_cols[0]].copy()
    exp_col = X_train.columns
    
    #説明変数の標準化
    X_train = DataScaling(X_train)
    X_test  = DataScaling(X_test)
    
    #RaodomForestの学習
    clf = RandomForestClassifier(criterion="entropy", n_estimators=10, random_state=42, class_weight="balanced")
    clf.fit(X_train, y_train)
    y_test_predict = clf.predict(X_test)

    #特徴量の重要度
    feature = clf.feature_importances_

    #特徴量の重要度順
    #indices = np.argsort(feature)[::-1]
    indices = np.argsort(feature)

    #書き出し
    result = pd.DataFrame({"Name":exp_col, "Feature":feature}).sort_values(by="Feature", ascending=False)
    result = result[["Name", "Feature"]]
    result.to_csv(os.path.join(w_path, "feature_"+csv_file), index=False)

    plt.title("Feature Importance")
    plt.barh(range(len(feature)),feature[indices], color="gray", align="center")
    plt.yticks(range(len(feature)), exp_col[indices], rotation=0)
    plt.ylim([-1, len(feature)])
    plt.tight_layout()
    SaveFig(csv_file[:-4])
    plt.show()

In [29]:
def PlotHistgramInterval(csv_file):
    
    #IntervalCSV読み込み
    df = pd.read_csv(os.path.join(r_path, csv_file))
    
    #OK, NGデータ抽出
    df_okay = df[df["defective_cat_16"]==0]
    df_blob = df[df["defective_cat_16"]==31]
    
    df_okay = df_okay.dropna()
    df_blob = df_blob.dropna()
    
    #xmax = max([df_okay["cure_time[s]"].max(), df_blob["cure_time[s]"].max()]) + 100
    plot_cols = ["ti_14_15[s]", "cure_time[s]", "ti_13_14[s]", "ti_15_16[s]", "ti_12_13[s]", "ti_11_12[s]"]
    #plot_cols = ["ti_14_15[s]", "ti_14_16[s]", "ti_13_15[s]", "ti_15_16[s]", "ti_12_15[s]"]
    
    #NGcsvファイル全体をプロット
    #1画像辺りのサブプロット数
    plt_row = 6
    plt_col = 1
    
    #figNo設定
    i_mat = 0
    i_fig = 0
    
    plt.figure(figsize=(10,8))
    
    for col in plot_cols:
        i_mat += 1
        
        plt.subplot(plt_row, plt_col, i_mat)
        
        if not df_okay.empty:
            plt.hist(df_okay[col].values, bins=50, alpha=0.3, histtype="stepfilled", color="b", label="OK")
        if not df_blob.empty:
            plt.hist(df_blob[col].values, bins=50, alpha=0.3, histtype="stepfilled", color="r", label="NG")
        
        plt.title(col)
        plt.tick_params(labelsize=8)
        plt.ylim([0, 2000])
        plt.legend()
        plt.tight_layout()
    
    SaveFig("test")

■■■■■　MainProgram　■■■■■

In [30]:
#定義
r_path = os.path.join("..", "AssemblyData", "Extract")
w_path = os.path.join("..", "AssemblyData_Analysis")

In [31]:
#データファイル取得
#csv_files = GetIntervalCSVFiles(r_path)
#csv_files = GetConcatSACSVFiles(r_path)


In [24]:
#SubAssyインターバル学習
csv_files = GetConcatSubAssyFiles(r_path)
for csv_file in csv_files:
    CalcMethodComparison(csv_file)

  exec(code_obj, self.user_global_ns, self.user_ns)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


SGD is done
No such a method
Kernel is done
LinearSVC is done
KNeighbor is done
No such a method
SVC is done
RandomForest is done


In [42]:
#Assyゲル気泡学習
csv_files = GetConcatAssyWithGelFiles(r_path)
for csv_file in csv_files:
    CalcMethodComparison(csv_file)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


SGD is done
No such a method
Kernel is done
LinearSVC is done
KNeighbor is done
No such a method
SVC is done
RandomForest is done


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [44]:
GetRFCFeatures(csv_files[0])

In [31]:
#ヒストグラム
for csv_file in csv_files:
    PlotHistgramInterval(csv_file)

In [44]:
#Assyインターバル学習
csv_files = GetConcatAssyFiles(r_path)
for csv_file in csv_files:
    CalcMethodComparison(csv_file)

MemoryError: 

■■■■■　確認　■■■■■

■■■■■　バックアップ　■■■■■

#工程間　単体
def PrepareFittingData(csv_file):
    
    #IntervalCSV読み込み
    df = pd.read_csv(os.path.join(r_path, csv_file))
    
    #OK:0, NG:31のデータ抽出 query は　boolインデックスより新しいみたい
    df = df.query("defective_cat_16 == 0 or defective_cat_16 == 31") 
    
    #X, Yとなるデータの抽出
    cols = ["ti_%s_%s[s]" %(i, i+1) for i in np.arange(11,16)]
    cols.extend(["cure_time[s]", "defective_cat_16"])
    df = df[cols]
    
    #NG:31を1に変換
    df.loc[df["defective_cat_16"]==31, "defective_cat_16"] = 1
    
    df = df.dropna()
    df = df.reset_index()
    df = df.drop("index", axis=1)

    return df

#工程間　相互
def PrepareFittingData(csv_file):
    
    #IntervalCSV読み込み
    df = pd.read_csv(os.path.join(r_path, csv_file))
    
    #OK:0, NG:31のデータ抽出 query は　boolインデックスより新しいみたい
    df = df.query("defective_cat_16 == 0 or defective_cat_16 == 31") 
    
    #X, Yとなるデータの抽出
    cols = df.columns[df.columns.str.startswith("ti_")].tolist()
    cols.extend(["cure_time[s]", "defective_cat_16"])
    df = df[cols]
    
    #NG:31を1に変換
    df.loc[df["defective_cat_16"]==31, "defective_cat_16"] = 1
    
    df = df.dropna()
    df = df.reset_index()
    df = df.drop("index", axis=1)
    
    return df