In [1]:
import os
import numpy as np
import pandas as pd

#シミは数値
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
%matplotlib inline

■■■■■　関数　■■■■■

In [2]:
def SaveFig(fig_id, tight_layout=True, fig_extension="png", resolution=600):
    """Save the current matplotlib figure under ../Images and close it.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension).
    tight_layout : bool
        When true, plt.tight_layout() is applied before saving.
    fig_extension : str
        File extension; also passed to savefig as the output format.
    resolution : int
        Value passed to savefig as dpi.
    """
    out_dir = os.path.join("..", "Images")
    os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, ".".join((fig_id, fig_extension)))

    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)
    # Close the current figure so figures do not pile up across calls.
    plt.close()

In [3]:
def GetConcatSubAssyFiles(r_path):
    """Return the sorted names of the concatenated sub-assembly CSV files in r_path.

    An entry is selected when its name contains ".csv", starts with "CONCAT"
    (case-insensitive) and contains "SA" (case-sensitive).

    Parameters
    ----------
    r_path : str
        Directory to scan.

    Returns
    -------
    list of str
        Matching file names (not full paths) in sorted order. os.listdir
        returns entries in arbitrary order, so sorting keeps positional
        indexing by callers reproducible.
    """
    return sorted(
        name
        for name in os.listdir(r_path)
        if ".csv" in name and name.upper().startswith("CONCAT") and "SA" in name
    )

In [4]:
def GetConcatAssyFiles(r_path):
    """Return the sorted names of the concatenated assembly CSV files in r_path.

    An entry is selected when its name contains ".csv", starts with "CONCAT"
    (case-insensitive) and contains "Assy" (case-sensitive).

    Parameters
    ----------
    r_path : str
        Directory to scan.

    Returns
    -------
    list of str
        Matching file names (not full paths) in sorted order. os.listdir
        returns entries in arbitrary order, so sorting keeps the processing
        order of callers reproducible.
    """
    return sorted(
        name
        for name in os.listdir(r_path)
        if ".csv" in name and name.upper().startswith("CONCAT") and "Assy" in name
    )

In [5]:
def GetConcatAllFiles(r_path):
    """Return the sorted names of the concatenated "All" CSV files in r_path.

    An entry is selected when its name contains ".csv", starts with "CONCAT"
    (case-insensitive) and contains "All" (case-sensitive).

    Parameters
    ----------
    r_path : str
        Directory to scan.

    Returns
    -------
    list of str
        Matching file names (not full paths) in sorted order. os.listdir
        returns entries in arbitrary order, so sorting keeps positional
        indexing by callers reproducible.
    """
    return sorted(
        name
        for name in os.listdir(r_path)
        if ".csv" in name and name.upper().startswith("CONCAT") and "All" in name
    )

In [6]:
#Inter-process intervals
def PrepareFittingData(csv_file, data_dir=None):
    """Load one CSV and return its "ti_", "cure_" and "blot_" columns without NaN rows.

    Parameters
    ----------
    csv_file : str
        File name of the CSV to read.
    data_dir : str, optional
        Directory containing csv_file. When omitted, the notebook-level
        variable r_path is used, which must then be defined before the call.

    Returns
    -------
    pandas.DataFrame
        Columns ordered as all "ti_*", then all "cure_*", then all "blot_*";
        rows containing any NaN in those columns are dropped and the index
        is renumbered from 0.
    """
    base_dir = r_path if data_dir is None else data_dir

    #Read the CSV in 2000-row chunks and stitch the chunks back together
    reader = pd.read_csv(os.path.join(base_dir, csv_file), low_memory=False, chunksize=2000)
    df = pd.concat(list(reader), ignore_index=True)

    #Pick the columns used as X and y, grouped by prefix
    cols = []
    for prefix in ("ti_", "cure_", "blot_"):
        cols.extend(df.columns[df.columns.str.startswith(prefix)].tolist())

    return df[cols].dropna().reset_index(drop=True)

In [7]:
#Inter-process intervals
def PrepareAllFittingData(csv_file, data_dir=None):
    """Load one CSV and return its "ti_", "cure_" and "blot_" columns without NaN rows.

    The column "cure_time[s]" is excluded when present. Files that do not
    contain it are accepted as well (previously they raised ValueError).

    Parameters
    ----------
    csv_file : str
        File name of the CSV to read.
    data_dir : str, optional
        Directory containing csv_file. When omitted, the notebook-level
        variable r_path is used, which must then be defined before the call.

    Returns
    -------
    pandas.DataFrame
        Columns ordered as all "ti_*", then all "cure_*", then all "blot_*"
        (minus "cure_time[s]"); rows containing any NaN in those columns are
        dropped and the index is renumbered from 0.
    """
    base_dir = r_path if data_dir is None else data_dir

    #Read the CSV in 2000-row chunks and stitch the chunks back together
    reader = pd.read_csv(os.path.join(base_dir, csv_file), low_memory=False, chunksize=2000)
    df = pd.concat(list(reader), ignore_index=True)

    #Pick the columns used as X and y, grouped by prefix
    cols = []
    for prefix in ("ti_", "cure_", "blot_"):
        cols.extend(df.columns[df.columns.str.startswith(prefix)].tolist())

    #Only drop the column when the file actually has it
    if "cure_time[s]" in cols:
        cols.remove("cure_time[s]")

    return df[cols].dropna().reset_index(drop=True)

In [8]:
def DataScaling(X):
    """Fit a new StandardScaler on X and return the transformed X.

    A fresh scaler is fitted on every call, so the result is standardized
    with statistics taken from X itself.
    """
    # Alternative scaler noted by the author: MinMaxScaler()
    return StandardScaler().fit_transform(X)

In [9]:
def CrossValidation(reg, X, y):
    """Return the per-fold RMSE of a 10-fold cross-validation of reg on (X, y).

    Parameters
    ----------
    reg : estimator
        Regressor handed to cross_val_score. It no longer has to be fitted
        beforehand: the earlier predict/MSE computation on the full data was
        discarded unused and has been removed.
    X, y : array-like
        Explanatory variables and target.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (10 values).
    """
    # The scorer returns negated MSE, so flip the sign before the square root.
    neg_mse = cross_val_score(reg, X, y, scoring="neg_mean_squared_error", cv=10)
    return np.sqrt(-neg_mse)

In [10]:
def CalcCoefficients(reg, flg, columns, out_dir=None):
    """Write the fitted model's coefficients or feature importances to "<flg>.csv".

    Parameters
    ----------
    reg : fitted estimator
        Must expose feature_importances_ when flg == "RandomForest",
        otherwise coef_.
    flg : str
        Method name; selects the attribute to export and names the file.
    columns : sequence of str
        Feature names, in the same order as the exported values.
    out_dir : str, optional
        Output directory. When omitted, the notebook-level variable w_path
        is used. The directory is created if it does not exist.

    Returns
    -------
    None
        The table (columns "Name" plus "Features" or "Coefficients", sorted
        by value in descending order) is written to disk.
    """
    target_dir = w_path if out_dir is None else out_dir
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    if flg == "RandomForest":
        value_name, values = "Features", reg.feature_importances_
    else:
        value_name, values = "Coefficients", reg.coef_

    table = pd.DataFrame({"Name": columns, value_name: values})
    table = table.sort_values(by=value_name, ascending=False)
    table.to_csv(os.path.join(target_dir, flg+".csv"), index=False)

In [11]:
def CalcMethodComparison(csv_file):
    """Fit several regressors to one concatenated CSV and record how they compare.

    The frame returned by PrepareAllFittingData(csv_file) is split 80/20 into
    training and test sets. "blot_areaA_33" is the target; both
    "blot_areaA_33" and "blot_areaB_33" are removed from the explanatory
    variables. Each available method is fitted on the training set, used to
    predict the test set, cross-validated, exported through CalcCoefficients
    and drawn as a predicted-vs-measured scatter panel.

    Outputs
    -------
    * figure "Summary" (via SaveFig)
    * one "<method>.csv" per method (via CalcCoefficients)
    * "result_<csv_file>" in w_path with the columns method, rmse_score,
      rmse_stddev and correlation

    The figure and the per-method files use fixed names, so calling this
    function for several CSV files overwrites them; only the result table
    is unique per input file.

    Parameters
    ----------
    csv_file : str
        File name handed to PrepareAllFittingData.

    Returns
    -------
    None
    """
    #Definitions
    #"SVR" has no active branch below (it is commented out), so it is reported
    #as "No such a method" and skipped.
    reg_method = ["SGD", "Lasso", "Ridge", "ElasticNet", "SVR", "RandomForest"]
    method, rmse_mean, rmse_dev, corr = [], [], [], []

    #CalcCoefficients and the final result table are both written into w_path
    if w_path:
        os.makedirs(w_path, exist_ok=True)

    #Data preparation
    df = PrepareAllFittingData(csv_file)

    #Split into training and test data
    train_set, test_set = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

    #Target variables
    #obj_cols = ["defective_cat_16"] #subassy
    obj_cols = ["blot_area%s_33" %r for r in ["A", "B"]]

    X_train = train_set.drop(columns=obj_cols)
    X_test = test_set.drop(columns=obj_cols)
    y_train = train_set[obj_cols[0]].copy()
    y_test = test_set[obj_cols[0]].copy()
    exp_cols = X_train.columns

    #Standardize the explanatory variables. The scaler is fitted on the
    #training set only and then applied unchanged to the test set, so both
    #sets share one scale and no test-set statistics enter the preprocessing.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    plt.figure()
    i_mat = 1

    #Performance of each regressor
    for flg in reg_method:

        #Select the regressor
        if flg == "SGD":
            reg = SGDRegressor(max_iter=1000, penalty="l2", tol=1e-3, random_state=42)
        elif flg == "Lasso":
            reg = Lasso(alpha=1.0, random_state=42)
        elif flg == "Ridge":
            reg = Ridge(alpha=1.0, solver="cholesky", random_state=42)
        elif flg == "ElasticNet":
            reg = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)
        #elif flg == "SVR":
            #reg = SVR(kernel="rbf")
        elif flg == "RandomForest":
            reg = RandomForestRegressor(n_estimators=10, random_state=42)
        else:
            print("No such a method")
            continue

        #Fit. The target is passed as a plain 1-D array for every model
        #(.values replaces Series.ravel(), which newer pandas deprecates).
        reg.fit(X_train, y_train.values)

        #Predict
        y_test_predict = reg.predict(X_test)

        #Validate
        #NOTE(review): the cross-validation runs on the test set, i.e. it
        #re-fits on test folds rather than scoring the model fitted above -
        #confirm this is intended.
        reg_test_rmse_scores = CrossValidation(reg, X_test, y_test)

        #Export coefficients / feature importances
        CalcCoefficients(reg, flg, exp_cols)

        method.append(flg)
        rmse_mean.append(reg_test_rmse_scores.mean())
        rmse_dev.append(reg_test_rmse_scores.std())
        corr.append(np.corrcoef(y_test, y_test_predict)[0, 1])

        #Plot predicted vs. measured
        plt.subplot(2, 3, i_mat)
        plt.scatter(y_test_predict, y_test, color="black", marker="o", s=5, alpha=0.3)
        plt.plot([0, 2500],[0, 2500], color="gray", linestyle="-", linewidth=0.5)
        plt.xlabel("Predicted", fontsize=8)
        plt.ylabel("Measured", fontsize=8)
        plt.title(flg, fontsize=10)
        plt.tick_params(labelsize=8)
        plt.tight_layout()
        plt.axis([0, 2500, 0, 2500])
        i_mat += 1

        print(flg, "is done")

    #Save the figure
    SaveFig("Summary")

    #Write the comparison table
    result = pd.DataFrame({
        "method": method,
        "rmse_score": rmse_mean,
        "rmse_stddev": rmse_dev,
        "correlation": corr,
    })
    result.to_csv(os.path.join(w_path, "result_"+csv_file), index=False)

    #Kept from the original as a disabled note (model evaluation):
    #y_test_predict = reg.predict(X_test)
    #y_test_predict = cross_val_predict(reg, X_train, y_train, cv=10, method="predict_proba") #for "RandomForest"

In [12]:
def PlotHistgramInterval(csv_file):
    """Draw histograms of three interval / cure-time columns and save them.

    Reads csv_file from r_path, prints the frame's describe() summary, plots
    one 100-bin histogram per column in a 3x1 grid and saves the figure as
    "assy_interval" through SaveFig.
    """
    #Load the interval CSV
    data = pd.read_csv(os.path.join(r_path, csv_file))
    print(data.describe())

    targets = ["ti_16_to_32[s]", "ti_32_to_33[s]", "cure_time_33[s]"]

    #Subplot grid per image
    n_rows, n_cols = 3, 1

    plt.figure(figsize=(10,8))

    #Subplot positions are 1-based
    for panel, col in enumerate(targets, start=1):
        plt.subplot(n_rows, n_cols, panel)
        plt.hist(data[col].values, bins=100, alpha=0.3, color="b")

        plt.title(col)
        plt.tick_params(labelsize=8)
        plt.tight_layout()

    SaveFig("assy_interval")

■■■■■　MainProgram　■■■■■

In [13]:
#Definitions: input directory (read by the Prepare*/Plot* functions) and
#output directory (written by CalcCoefficients / CalcMethodComparison)
r_path = os.path.join("..", "AssemblyData", "Extract")
w_path = os.path.join("..", "AssemblyData_Analysis")

#SubAssy
#csv_files = GetIntervalCSVFiles(r_path)
csv_files = GetConcatSubAssyFiles(r_path)
CalcMethodComparison(csv_files[1])
#GetRFCFeatures is not defined in the cells visible here, so an unguarded call
#raises NameError on a fresh kernel. Run it only when it has been defined.
if "GetRFCFeatures" in globals():
    GetRFCFeatures(csv_files[1])

In [None]:
#Assy interval learning: run the method comparison on every concatenated Assy file
csv_files = GetConcatAssyFiles(r_path)
for csv_file in csv_files:
    CalcMethodComparison(csv_file)

In [14]:
#Assy interval learning including SubAssy times: run the method comparison on every concatenated "All" file
csv_files = GetConcatAllFiles(r_path)
for csv_file in csv_files:
    CalcMethodComparison(csv_file)

['ti_11_to_12[s]', 'ti_12_to_13[s]', 'ti_13_to_14[s]', 'ti_14_to_15[s]', 'ti_15_to_16[s]', 'ti_16_to_32[s]', 'ti_32_to_33[s]', 'cure_time_14[s]', 'cure_time_33[s]', 'blot_areaA_33', 'blot_areaB_33']
SGD is done
Lasso is done
Ridge is done
ElasticNet is done
No such a method
RandomForest is done
Saving figure Summary


In [24]:
#Plot the interval histograms for the first file in csv_files (whichever list the last executed cell assigned)
PlotHistgramInterval(csv_files[0])

              sa_lot        sa_seri     sa_hinban       assy_lot  \
count  374745.000000  374745.000000  3.747450e+05  374745.000000   
mean    97222.006524   10982.054488  4.689100e+09   96673.695814   
std        70.418014    6604.124085  0.000000e+00    3606.485985   
min     97101.000000       1.000000  4.689100e+09   54321.000000   
25%     97161.000000    5473.000000  4.689100e+09   97253.000000   
50%     97218.000000   10577.000000  4.689100e+09   97371.000000   
75%     97280.000000   16146.000000  4.689100e+09   97501.000000   
max     97345.000000   26880.000000  4.689100e+09   98184.000000   

        assy_hinban  storage_loc_14  defective_cat_16  storage_loc_33  \
count  3.747450e+05   374745.000000          374745.0   374745.000000   
mean   4.689001e+09      806.519342               0.0      100.999872   
std    1.580905e+03      431.326028               0.0       58.378961   
min    4.689000e+09      101.000000               0.0        1.000000   
25%    4.689000e+09   

■■■■■　確認　■■■■■

■■■■■　バックアップ　■■■■■

#Inter-process intervals, mutual (backup variant with a binary OK/NG target)
def PrepareFittingData(csv_file, data_dir=None):
    """Load one CSV and return the "ti_" columns, "cure_time[s]" and a 0/1 defect flag.

    Only rows whose defective_cat_16 is 0 or 31 are kept; 31 is recoded to 1.

    Parameters
    ----------
    csv_file : str
        File name of the CSV to read.
    data_dir : str, optional
        Directory containing csv_file. When omitted, the notebook-level
        variable r_path is used, which must then be defined before the call.

    Returns
    -------
    pandas.DataFrame
        All "ti_*" columns followed by "cure_time[s]" and "defective_cat_16";
        rows containing any NaN are dropped and the index is renumbered
        from 0.
    """
    base_dir = r_path if data_dir is None else data_dir

    #Load the interval CSV
    df = pd.read_csv(os.path.join(base_dir, csv_file))

    #Keep only OK:0 and NG:31 rows
    df = df.query("defective_cat_16 == 0 or defective_cat_16 == 31")

    #Pick the columns used as X and y
    cols = df.columns[df.columns.str.startswith("ti_")].tolist()
    cols.extend(["cure_time[s]", "defective_cat_16"])
    #Explicit copy: the recode below must modify an independent frame, not a
    #slice of the filtered one (avoids SettingWithCopyWarning).
    df = df[cols].copy()

    #Recode NG:31 to 1
    df.loc[df["defective_cat_16"]==31, "defective_cat_16"] = 1

    return df.dropna().reset_index(drop=True)