## Decision Tree Demo

In [None]:
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [4, 3]
# Synthetic 2-D demo data: 50 standard-normal points drawn from a fixed seed.
rng = np.random.RandomState(0)
X = rng.normal(size=(50, 2))
# Label a point 1 when its second coordinate exceeds 1.2, otherwise 0.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
y = np.zeros(X.shape[0], dtype=int)
y[X[:, 1] > 1.2] = 1
# Add three extra points labeled 0 even though their second coordinate is
# above the 1.2 threshold.
X = np.vstack((X, [[0.5, 2.5], [2.1, 1.4], [1.9, 1.5]]))
y = np.append(y, [0, 0, 0])
def plot_points(X, y):
    """Scatter-plot 2-D points on the current axes, colored by binary label.

    Rows of ``X`` whose label in ``y`` equals 1 are drawn in red and rows
    whose label equals 0 in cyan, both with a black edge. The first column
    of ``X`` is the x coordinate ('dim1') and the second the y coordinate
    ('dim2'). The figure is not shown; call ``plt.show()`` afterwards.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # Boolean masks select each class directly, without argwhere and
    # per-row Python loops.
    for label, color in ((1, 'red'), (0, 'cyan')):
        points = X[y == label]
        plt.scatter(points[:, 0], points[:, 1], s=25, color=color, edgecolor='k')
    plt.xlabel('dim1')
    plt.ylabel('dim2')
    
# Plot the labeled demo points, then display the figure.
plot_points(X, y)
plt.show()

In [None]:
# Import the classifier from sklearn
from sklearn.tree import DecisionTreeClassifier

# Define a decision tree limited to depth 3 and fit it to the demo data.
model = DecisionTreeClassifier(max_depth=3)
# fit() returns the fitted estimator, so `tree` refers to the same object as `model`.
tree = model.fit(X, y)

from sklearn.tree import export_graphviz
from graphviz import Source

# Export the fitted tree as Graphviz DOT text; out_file=None makes
# export_graphviz return the DOT source as a string instead of writing a file.
treedot = export_graphviz(
        model,
        out_file=None,
        filled=True, rounded=True,
        special_characters=True,
        rotate=True
    )
# Wrap the DOT text in a graphviz Source; leaving it as the last expression
# of the cell lets the notebook display the rendered tree.
treegraph = Source(treedot)
treegraph

# 数据分析步骤：
依据不同标签数据的特征构建标签判别决策树模型，并对模型判别给出特征差异大小排序，从而定位特征差异结果。
1. 数据：ATE_PASS 146个log多芯片。ATE_FAILED 600个log单芯片
2. 清洗：过滤出所有具备测试值的测试项整理为DataFrame表格
3. 过滤：整合清洗数据为单DataFrame，行为文件名，列为Pin+Number名，并构建对应PF标签（可暂存最后一列或新DataFrame）
4. 建模：构建随机森林模型，并切分原数据为 Train，Valid，Test三部分
5. 训练：训练模型，通过调整参数使Train,Valid趋近收敛精度达到80%以上（实际判别过程过于简单可合并train 与 valid 数据一起训练模型）
6. 细分：根据分析结果抽样单独用例组合训练数据构建决策树模型给出当前用例下的特征差异性，基于此特征差异判断下计算判别成功率（90%以上）。
7. 验证：基于 SHAP（SHapley Additive exPlanations，沙普利加性解释），使用验证集或测试集数据对随机森林模型特征重要性进行二次判断，验证特征重要性排序结果。

## 1. Data Prepare
### load package
A `project_helper` Python file wraps the helper functions used in this notebook.

In [None]:
import numpy as np
import pandas as pd
import re
import os
from tqdm import tqdm
from matplotlib import pyplot as plt
import random

# custom py file
import project_helper as ph

### load file names

In [None]:
# Directories that hold the good-chip logs and the two batches of bad-chip logs.
path_good = '../log_good/'
path_bad = '../log_bad_20230315/'
path_bad2 = '../log_bad_20230313/'


def _without_directories(directory, names):
    """Return `directory`-joined paths for the entries in `names` that are not directories."""
    # The directory test must use the joined path: a bare entry name would be
    # resolved against the current working directory and never match.
    paths = (os.path.join(directory, name) for name in names)
    return [path for path in paths if not os.path.isdir(path)]


files_good = os.listdir(path_good)
files_bad = os.listdir(path_bad)
files_bad2 = os.listdir(path_bad2)

files_list_good = _without_directories(path_good, files_good)
files_list_bad = _without_directories(path_bad, files_bad)
files_list_bad2 = _without_directories(path_bad2, files_bad2)
# Single log parsed separately from the good/bad sets (path is relative to
# the working directory).
valid_file = 'SD8023EV101_OSDC_SITE1_negsample.txt'

print(f'total goodchip files count is {len(files_list_good)}')
print(f'total badchip files count is {len(files_list_bad)}')
# Second bad-chip directory (path_bad2).
print(f'total badchip files count is {len(files_list_bad2)}')

## 2. Data Clean
### parse log data to one dataframe
For a fast run, it is advisable to keep the length of each loaded file list below 50.

In [None]:
# Parse a random sample of up to 100 good-chip logs.
# The sample size is capped at the number of available files because
# random.sample raises ValueError when asked for more items than exist.
df_raw_good = ph.get_test_df(
    random.sample(files_list_good, min(100, len(files_list_good)))
)

In [None]:
# Parse a random sample of bad-chip logs: up to 50 from each bad-log directory.
# Each sample size is capped at the number of available files because
# random.sample raises ValueError when asked for more items than exist.
f_list = (
    random.sample(files_list_bad, min(50, len(files_list_bad)))
    + random.sample(files_list_bad2, min(50, len(files_list_bad2)))
)
df_raw_bad = ph.get_test_df(f_list)

In [None]:
# Parse the single log named by `valid_file` on its own, separate from the good/bad sets.
df_raw_unknown = ph.get_test_df([valid_file])

In [None]:
# Preview the first rows of the raw bad-chip DataFrame.
df_raw_bad.head()

In [None]:
# Display the raw DataFrame parsed from `valid_file`.
df_raw_unknown

## 3. Filter Data
Pivot the raw DataFrame into a new one. In the new DataFrame each row is a file name, and the columns are the pins of all tests in a log, with the P/F label as the last column.
#### Note:
NaN values are filled with zero, and the Measured value is changed to the Measure/Force value.

In [None]:
# Pivot each raw DataFrame; the second argument (1 for good, 0 for bad) is
# presumably the PF label attached to every row — confirm in project_helper.
# NOTE: `case2pins` is assigned twice, so it ends up holding the bad-data result.
pivote_df_good, case2pins = ph.concat_tests_pivot(df_raw_good, 1)
pivote_df_bad, case2pins = ph.concat_tests_pivot(df_raw_bad, 0)

pivote_df_good = pivote_df_good.fillna(0.)
pivote_df_bad = pivote_df_bad.fillna(0.)

# Good and bad logs can expose different pins, so align the good frame to the
# bad frame's columns. reindex() is used instead of `good[bad.columns]`
# because plain selection raises KeyError when a bad-only column is missing
# from the good frame; such columns are filled with 0., matching the NaN
# handling above.
pivote_df_good = pivote_df_good.reindex(columns=pivote_df_bad.columns, fill_value=0.)

In [None]:
# Compare the shapes of the two pivoted frames (bad first, then good) and
# preview the first rows of the good one.
print(pivote_df_bad.shape, pivote_df_good.shape)
pivote_df_good.head()

## 4. Define Model

### split data
Split the data into three sections: train, valid, and test.
#### Note
Because there is much more negative data than positive data, random samples can be extracted from the bad data so that their size matches the good data.

Integrate the good and bad sample data into one DataFrame.

Because the `train_data_split` function shuffles the data rows, the data must be split into train, valid, and test sets before it is separated into features and target.

In [None]:
# Sample the bad data; frac=1. keeps every row and only shuffles their order.
pivote_df_bad_sample = pivote_df_bad.sample(frac=1.)
print(f'Extract bad data samples shape: {pivote_df_bad_sample.shape}')

# Stack good and bad data into one DataFrame. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
all_df = pd.concat([pivote_df_good, pivote_df_bad_sample])
# Infinite values are treated like missing ones and set to zero.
all_df = all_df.replace([np.inf, -np.inf], 0.)
print(f'Full data shape(good add bad samples): {all_df.shape}')

# Split all data into train, valid and test sets.
# NOTE(review): set_size is presumably the train/valid/test fractions, in
# which case the 0.0 entry leaves the test set empty — confirm in project_helper.
df_train, df_valid, df_test = ph.train_data_split(all_df.copy().fillna(0.), set_size=[0.6, 0.4, 0.0])

# Separate each split into features (every column except 'PF') and target ('PF').
features_columns = [feature for feature in all_df.columns if feature != 'PF']
y_train, y_valid, y_test = df_train['PF'], df_valid['PF'], df_test['PF']
X_train, X_valid, X_test = df_train[features_columns], df_valid[features_columns], df_test[features_columns]

In [None]:
# Show the (rows, columns) shape of the validation feature set.
X_valid.shape

## 5. Train Complete Model
### random forests model

Train and validate the model, then check the accuracy and tune the model parameters.

In [None]:
# Settings shared by every RandomForestClassifier trained below.
# n_estimators is not listed here: it is supplied per model from `n_trees_l`.
clf_parameters = {
    'criterion': 'entropy',
    'max_depth': 3,
    # Each leaf must hold at least 10% of the bad-sample row count. The value
    # is clamped to 1 because scikit-learn rejects an integer
    # min_samples_leaf of 0, which the plain product gives for fewer than
    # 10 rows.
    'min_samples_leaf': max(1, int(pivote_df_bad_sample.shape[0] * 0.1)),
    'max_features': 'sqrt',
    'random_state': 0,
    'oob_score': True,
}

# Forest sizes (number of trees) to train and compare.
n_trees_l = [20, 30]

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, ExtraTreesRegressor, RandomForestRegressor

# Per-model results; each list gets one entry per forest size in `n_trees_l`.
train_score = []
valid_score = []
oob_score = []
feature_importances = []

# Train one random forest per forest size. After the loop `clf` still refers
# to the last model trained, i.e. the one built with the final entry of `n_trees_l`.
for n_trees in tqdm(n_trees_l, desc='Training Models', unit='Model'):
    # The first positional argument of RandomForestClassifier is n_estimators.
    clf = RandomForestClassifier(n_trees, **clf_parameters)
    clf.fit(X_train, y_train)
    
    # score() reports mean accuracy on the given data.
    train_score.append(clf.score(X_train, y_train.values))
    valid_score.append(clf.score(X_valid, y_valid.values))
    # Out-of-bag score, available because clf_parameters sets oob_score=True.
    oob_score.append(clf.oob_score_)
    feature_importances.append(clf.feature_importances_)

In [None]:
# Print the train, validation and out-of-bag scores, one value per forest size in `n_trees_l`.
print(train_score)
print(valid_score)
print(oob_score)

In [None]:
# Average the feature importances across the trained models (axis=0) and rank
# the features with the project helper.
# NOTE(review): max_print presumably limits how many ranked features are printed — confirm in project_helper.
print('Features Ranked by Average Importance:\n')
importance_order = ph.rank_features_by_importance(np.average(feature_importances, axis=0), features_columns, max_print=30)

In [None]:
# Plot one tree of the last trained forest (`clf`); tree_num presumably indexes the forest's trees — confirm in project_helper.
ph.plot_tree_classifier(clf, tree_num=0, feature_names=features_columns)

In [None]:
# Plot a second tree of the same forest for comparison.
ph.plot_tree_classifier(clf, tree_num=1, feature_names=features_columns)

In [None]:
# Plot another tree of the same forest.
# NOTE(review): if tree_num is a zero-based tree index, 17 requires a forest of at least 18 trees — revisit when changing `n_trees_l`.
ph.plot_tree_classifier(clf, tree_num=17, feature_names=features_columns)

## 6. Analysis One Test Case
### Simple Decision Model For One Test Case
To locate the values that differ between positive and negative samples within one test case, a simple decision tree is sufficient.
For example, to see more detail in `DC_IOH_VDDN_TEST`:

In [None]:
# Select the columns that belong to one test case.
def get_pins_by_names(df, pin_name_list):
    """Return the column labels of `df` that contain any of the given pin names.

    Matching is by substring (``name in column``). Results follow the order
    of `pin_name_list`, and within one name the order of ``df.columns``.
    A column matched by more than one name is returned only once, at its
    first match, so the result can be used as a column selector without
    producing duplicate columns.

    Args:
        df: object exposing a ``columns`` iterable of string labels.
        pin_name_list: iterable of pin-name substrings to look for.

    Returns:
        List of matching column labels without duplicates.
    """
    all_names = []
    seen = set()
    for name in pin_name_list:
        for column in df.columns:
            if name in column and column not in seen:
                seen.add(column)
                all_names.append(column)
    return all_names

# Pin names of the test case to analyse.
# Alternative source: case2pins['DC_IOH_VDDN_TEST'][2:-3]
pin_name_list = [
    'ODSPA_PGPIO_BIT0_B', 'ODSPA_PGPIO_BIT1_B',
    'ODSPA_PCPU_INT_A', 'ODSPA_PGPIO_BIT0_A', 'ODSPA_PGPIO_BIT1_A',
    'ODSPA_PGPIO_BIT2_B', 'ODSPA_PGPIO_BIT3_B', 'ODSPA_PGPIO_BIT4_B',
    # A comma was missing between 'PFP_TX_B' and 'ODSPA_PGPIO_BIT2_A', which
    # made Python join them into one name and dropped both pins.
    'ODSPA_PGPIO_BIT5_B', 'PFP_TX_B', 'ODSPA_PGPIO_BIT2_A', 'ODSPA_PGPIO_BIT3_A',
    'ODSPA_PGPIO_BIT4_A', 'ODSPA_PGPIO_BIT5_A',
]
print(pin_name_list)
# Expand the pin names to the matching feature columns, then restrict the
# train and validation features to those columns.
pin_name_list = get_pins_by_names(X_train, pin_name_list)
X_train_onecase, X_valid_onecase = X_train[pin_name_list], X_valid[pin_name_list]
X_train_onecase.head()

In [None]:
from IPython.display import display
from sklearn.tree import DecisionTreeClassifier

# Single decision tree trained only on the columns selected for one test case.
simple_clf = DecisionTreeClassifier(
    max_depth=6,
    criterion='entropy',
    random_state=0)
simple_clf.fit(X_train_onecase, y_train)

# Rank the selected columns by the tree's feature importances; `pin_list` is
# iterated later to plot each pin.
pin_list=ph.rank_features_by_importance(simple_clf.feature_importances_, pin_name_list, max_print=10)
graph = ph.plot_tree_classifier(simple_clf, feature_names=pin_name_list )
# NOTE(review): `graph` is presumably a graphviz object, in which case render()
# writes 'dtree_render' output to disk and view=True opens it in a viewer — confirm.
graph.render('dtree_render',view=True)
graph

This test case produces an accuracy above 91 percent, so it is reasonable to believe that these feature values of the test case contribute to the chip failure.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Predict on the validation data restricted to the selected test-case columns.
predictions = simple_clf.predict(X_valid_onecase)
print('Sample size: ', len(y_valid.values))
print('Accuracy score: ', accuracy_score(y_valid.values, predictions))
# Precision, recall and F1 all use scikit-learn's default binary averaging
# (scores for the positive label). With average='micro' on single-label data,
# recall and F1 are both equal to accuracy, so they added no information and
# were inconsistent with the binary precision above.
print('Precision score: ', precision_score(y_valid.values, predictions))
print('Recall score: ', recall_score(y_valid.values, predictions))
print('F1 score: ', f1_score(y_valid.values, predictions))

### check pin value graph
By default, check the top 20 important features.

In [None]:
def plot_compare_pin(column_name):
    """Plot the good-chip and bad-chip values of one column as two lines.

    Reads the notebook-level ``pivote_df_good`` and ``pivote_df_bad`` frames.
    Both value arrays are cut to the length of the shorter one so they share
    one row index, then drawn on a single 6x3 figure with the grid enabled.
    """
    good_values = pivote_df_good[column_name].values
    bad_values = pivote_df_bad[column_name].values
    n_rows = min(good_values.size, bad_values.size)
    comparison = pd.DataFrame({
        'good_' + column_name: good_values[:n_rows],
        'bad_' + column_name: bad_values[:n_rows],
    })
    comparison.plot(grid='on', figsize=(6,3))

In [None]:
# Plot good-versus-bad values for every pin in `pin_list`, the ranking
# returned for the single-test-case decision tree.
for pin_name in pin_list:
    plot_compare_pin(pin_name)

## 7. SHAP (SHapley Additive exPlanations) Validation
Can these importance values be trusted? Some features may be equally important.
Validate the entropy-based importance values with SHAP (SHapley Additive exPlanations).

In [None]:
import shap
# Build a tree explainer for `clf`, the last random forest trained above.
explainer = shap.TreeExplainer(clf)
# SHAP values are computed on the validation features.
# NOTE(review): the original note read "limit is a target unique value count";
# presumably the result holds one set of values per target class — confirm
# for the installed shap version.
shap_values = explainer.shap_values(X_valid)

In [None]:
# Mean absolute SHAP value per feature, pooled over all samples and classes.
if isinstance(shap_values, list):
    # List format: one (n_samples, n_features) array per class; stack them row-wise.
    tmp1 = np.concatenate(shap_values)
else:
    tmp1 = np.asarray(shap_values)
    if tmp1.ndim == 3:
        # Array format (n_samples, n_features, n_classes): move the class axis
        # next to the sample axis and flatten both into rows, so that every
        # row is again one per-feature vector.
        tmp1 = np.moveaxis(tmp1, -1, 1).reshape(-1, tmp1.shape[1])
tmp2 = np.abs(tmp1)
tmp3 = np.nanmean(tmp2, axis=0)
print(tmp3.shape)
# Feature indices ordered from most to least important.
np.argsort(tmp3)[::-1]

In [None]:
# Print the top features by averaged forest importance, then the top features
# by mean absolute SHAP value (`tmp3`), so the two rankings can be compared.
# NOTE(review): the forests are trained with criterion='entropy', so the
# 'gini' wording in the label below refers to impurity-based importance in general — confirm.
print('gini calculate feature importance rank:')
importance_order = ph.rank_features_by_importance(np.average(feature_importances, axis=0), features_columns, max_print=5)
print('\nshap additive calculate feature importance rank:')
shap_importances = ph.rank_features_by_importance(tmp3, features_columns, max_print=5)

If the two results differ, the result given by SHAP prevails.

In [None]:
# Bar summary plots of the SHAP values.
# NOTE(review): `shap_values` was computed from X_valid, yet the first call
# passes X_test (which comes from a 0.0 test fraction in the split above) —
# confirm this call is intended.
shap.summary_plot(shap_values, X_test, plot_type="bar")
shap.summary_plot(shap_values, X_valid, plot_type="bar")

Check the pin values by graph again.

In [None]:
# Plot good-versus-bad values for every feature in `importance_order`.
# NOTE(review): this iterates the forest-importance ranking rather than
# `shap_importances` — confirm which ranking is intended here.
for pin_name in importance_order:
    plot_compare_pin(pin_name)