In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import  StandardScaler,RobustScaler
from sklearn.impute import  SimpleImputer

from sklearn.model_selection import  train_test_split,RandomizedSearchCV
from sklearn.metrics import  classification_report,accuracy_score,confusion_matrix,precision_score

from sklearn.linear_model import  LogisticRegression
from sklearn.naive_bayes import  GaussianNB
from sklearn.tree import  DecisionTreeClassifier
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.ensemble import  RandomForestClassifier

from sklearn.pipeline import  Pipeline
from sklearn.compose import  ColumnTransformer


In [2]:
df = pd.read_csv('data/wafer_preprocess.csv')
df.head()

Unnamed: 0,Sensor-1,Sensor-2,Sensor-3,Sensor-4,Sensor-5,Sensor-7,Sensor-8,Sensor-9,Sensor-10,Sensor-11,...,Sensor-582,Sensor-583,Sensor-584,Sensor-585,Sensor-586,Sensor-587,Sensor-588,Sensor-589,Sensor-590,Good/Bad
0,2968.33,2476.58,2216.7333,1748.0885,1.1127,97.5822,0.1242,1.53,-0.0279,-0.004,...,65.12755,0.5004,0.012,0.0033,2.4069,0.0545,0.0184,0.0055,33.7876,-1
1,2961.04,2506.43,2170.0666,1364.5157,1.5447,96.77,0.123,1.3953,0.0084,0.0062,...,65.12755,0.4994,0.0115,0.0031,2.302,0.0545,0.0184,0.0055,33.7876,1
2,3072.03,2500.68,2205.7445,1363.1048,1.0518,101.8644,0.122,1.3896,0.0138,0.0,...,65.12755,0.4987,0.0118,0.0036,2.3719,0.0545,0.0184,0.0055,33.7876,-1
3,3021.83,2419.83,2205.7445,1363.1048,1.0518,101.8644,0.122,1.4108,-0.0046,-0.0024,...,65.12755,0.4934,0.0123,0.004,2.4923,0.0545,0.0184,0.0055,33.7876,-1
4,3006.95,2435.34,2189.8111,1084.6502,1.1993,104.8856,0.1234,1.5094,-0.0046,0.0121,...,65.12755,0.4987,0.0145,0.0041,2.8991,0.0545,0.0184,0.0055,33.7876,-1


In [3]:
df.columns

Index(['Sensor-1', 'Sensor-2', 'Sensor-3', 'Sensor-4', 'Sensor-5', 'Sensor-7',
       'Sensor-8', 'Sensor-9', 'Sensor-10', 'Sensor-11',
       ...
       'Sensor-582', 'Sensor-583', 'Sensor-584', 'Sensor-585', 'Sensor-586',
       'Sensor-587', 'Sensor-588', 'Sensor-589', 'Sensor-590', 'Good/Bad'],
      dtype='object', length=465)

## Model Training

In [4]:
# Let's split dataste into Dependent and Independent features
X = df.iloc[:,:-1]
y = df[['Good/Bad']]

In [5]:
X.head()

Unnamed: 0,Sensor-1,Sensor-2,Sensor-3,Sensor-4,Sensor-5,Sensor-7,Sensor-8,Sensor-9,Sensor-10,Sensor-11,...,Sensor-581,Sensor-582,Sensor-583,Sensor-584,Sensor-585,Sensor-586,Sensor-587,Sensor-588,Sensor-589,Sensor-590
0,2968.33,2476.58,2216.7333,1748.0885,1.1127,97.5822,0.1242,1.53,-0.0279,-0.004,...,0.00385,65.12755,0.5004,0.012,0.0033,2.4069,0.0545,0.0184,0.0055,33.7876
1,2961.04,2506.43,2170.0666,1364.5157,1.5447,96.77,0.123,1.3953,0.0084,0.0062,...,0.00385,65.12755,0.4994,0.0115,0.0031,2.302,0.0545,0.0184,0.0055,33.7876
2,3072.03,2500.68,2205.7445,1363.1048,1.0518,101.8644,0.122,1.3896,0.0138,0.0,...,0.00385,65.12755,0.4987,0.0118,0.0036,2.3719,0.0545,0.0184,0.0055,33.7876
3,3021.83,2419.83,2205.7445,1363.1048,1.0518,101.8644,0.122,1.4108,-0.0046,-0.0024,...,0.00385,65.12755,0.4934,0.0123,0.004,2.4923,0.0545,0.0184,0.0055,33.7876
4,3006.95,2435.34,2189.8111,1084.6502,1.1993,104.8856,0.1234,1.5094,-0.0046,0.0121,...,0.00385,65.12755,0.4987,0.0145,0.0041,2.8991,0.0545,0.0184,0.0055,33.7876


In [6]:
y.head()

Unnamed: 0,Good/Bad
0,-1
1,1
2,-1
3,-1
4,-1


In [29]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [30]:
# Pipeline for data preprocessing.Since, all are numerical features only one pipeline is suffiecient.
preprocessor = Pipeline(
    steps = [
        ('impute',SimpleImputer(strategy='median')),
        ('scaler',RobustScaler())
    ]
)

preprocessor

In [31]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.fit_transform(X_test),columns=preprocessor.get_feature_names_out())

In [32]:
X_train.head()

Unnamed: 0,Sensor-1,Sensor-2,Sensor-3,Sensor-4,Sensor-5,Sensor-7,Sensor-8,Sensor-9,Sensor-10,Sensor-11,...,Sensor-581,Sensor-582,Sensor-583,Sensor-584,Sensor-585,Sensor-586,Sensor-587,Sensor-588,Sensor-589,Sensor-590
0,-0.490228,0.973745,0.937179,0.486398,-0.234628,-0.080098,-0.283615,0.397269,0.337366,-0.930644,...,0.00485,-6.04505,-1.672919,0.822134,0.937751,0.861105,2.285714,1.400199,1.401069,-0.0306
1,-0.432174,1.230817,-0.11031,-0.358297,-0.259068,0.346855,0.246892,-0.225623,-2.611645,-0.72086,...,0.0,0.0,0.515202,0.566394,0.18755,0.501629,0.0,-0.689201,-0.600458,-0.43797
2,-0.225355,-0.113328,0.36523,0.216181,2.226178,-0.059833,0.225262,0.370489,-0.111377,-0.484727,...,0.0,0.0,1.693938,1.433883,1.17125,1.308995,-0.126399,-0.076885,-0.121106,0.365598
3,-0.501421,0.929395,0.246086,-0.514139,-1.072738,0.418927,-0.446848,-0.019432,-0.46219,-0.070528,...,-0.00025,-10.65145,1.085025,-0.23736,-0.562651,-0.276701,0.414286,-0.085797,-0.057186,-0.097463
4,-0.183361,1.823869,-0.11031,-0.358297,-0.259068,0.346855,0.246892,-0.319543,-3.348898,-0.878198,...,0.0,0.0,-1.399404,3.123793,3.751004,3.125303,0.3,0.220409,0.257339,0.215013


In [33]:
X_test.head()

Unnamed: 0,Sensor-1,Sensor-2,Sensor-3,Sensor-4,Sensor-5,Sensor-7,Sensor-8,Sensor-9,Sensor-10,Sensor-11,...,Sensor-581,Sensor-582,Sensor-583,Sensor-584,Sensor-585,Sensor-586,Sensor-587,Sensor-588,Sensor-589,Sensor-590
0,0.764372,-1.266567,1.556561,0.696782,-0.123874,-0.701398,-1.786346,1.246428,1.725762,0.807455,...,-5.590717,-2.34865,1.621594,-0.268522,-0.048821,-0.330646,0.173528,-0.673185,-0.60172,-0.317214
1,-0.279234,-0.219975,1.272773,0.552952,0.673474,-0.53105,-1.3576,-0.048556,0.011777,1.202139,...,0.0,0.0,0.366239,-0.066876,-0.299078,-0.071831,-0.036535,-0.574092,-0.521582,-0.119022
2,-0.051429,0.302736,-0.143975,-0.67524,0.448847,0.578994,-0.233377,0.084524,-0.834474,1.192067,...,10.824244,2.978539,0.167933,-0.443304,-0.251652,-0.446416,0.579523,0.879723,1.215201,0.682604
3,0.08262,0.703889,0.588354,0.279325,-0.063035,0.016397,-0.29887,0.132804,0.714132,0.023537,...,0.0,0.0,0.0,0.75748,1.030848,0.734447,1.061886,0.017249,0.107676,-0.086338
4,-0.586819,-0.70222,0.065839,-0.663642,-1.372514,0.748845,-0.626333,-1.022697,-0.534431,0.023537,...,0.0,0.0,1.721318,-0.231401,-0.251652,-0.279559,1.569637,-0.551983,-0.415322,-0.461822


In [34]:
# Model Training
model = LogisticRegression()
model

In [35]:
model.fit(X_train,y_train)

In [36]:
y_pred = model.predict(X_test)
y_pred

array([ 1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1,
        1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1,
        1,  1, -1,  1], dtype=int64)

In [37]:
precision_score(y_pred,y_test)

1.0

In [38]:
accuracy_score(y_pred,y_test)

1.0

In [39]:
model.intercept_

array([1.75043981])

In [40]:
# Lets have a function for different models(return confusion matrix,accuracy score,precision score)
def evaluate_models(true,predicted):
    acc_score = accuracy_score(y_pred=predicted,y_true=true)
    pre_score = precision_score(y_pred=predicted,y_true=true)
    # conf_matr = confusion_matrix(y_pred=predicted,y_true=true)
    

    return acc_score, pre_score


In [41]:
# Train Multiple Models
## Model Evaluation
models = {
    'GaussianNaiveBayes'     : GaussianNB(),
    'RandomForestClassifier' : RandomForestClassifier(),
    'LogisticRegression'     : LogisticRegression(),
    'DecisionTreeClassifier' : DecisionTreeClassifier()
}

trained_models_list = []
model_list = []
accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    # Make Predictions
    y_pred = model.predict(X_test)

    accuracy,precision = evaluate_models(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    print("Model Training Performance: ")
    print("Accuracy: ",accuracy)
    print("Precision: ",precision)
    # print("R2 Score: ",r2_Score*100)

    accuracy_list.append(accuracy)
    print("="*40)
    print("\n")


GaussianNaiveBayes
Model Training Performance: 
Accuracy:  0.7894736842105263
Precision:  1.0


RandomForestClassifier
Model Training Performance: 
Accuracy:  1.0
Precision:  1.0


LogisticRegression
Model Training Performance: 
Accuracy:  1.0
Precision:  1.0


DecisionTreeClassifier
Model Training Performance: 
Accuracy:  0.8947368421052632
Precision:  1.0




In [20]:
model_list

['GaussianNaiveBayes',
 'RandomForestClassifier',
 'LogisticRegression',
 'DecisionTreeClassifier']

In [21]:
# Performing Hyperparameter Tuning on Random Forest 
params= {
    'n_estimators' : [100,200,300],
    'max_depth' : [3,5,10,None],
    'criterion' : ['gini','entrpoy']
}

In [22]:
cv = RandomizedSearchCV(RandomForestClassifier(),params,cv=5,verbose=3,scoring='accuracy')
cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=None, n_estimators=200;, score=1.000 total time=   0.4s
[CV 2/5] END criterion=gini, max_depth=None, n_estimators=200;, score=1.000 total time=   0.3s
[CV 3/5] END criterion=gini, max_depth=None, n_estimators=200;, score=1.000 total time=   0.3s
[CV 4/5] END criterion=gini, max_depth=None, n_estimators=200;, score=1.000 total time=   0.3s
[CV 5/5] END criterion=gini, max_depth=None, n_estimators=200;, score=1.000 total time=   0.3s
[CV 1/5] END criterion=gini, max_depth=None, n_estimators=100;, score=1.000 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=None, n_estimators=100;, score=1.000 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=None, n_estimators=100;, score=1.000 total time=   0.1s
[CV 4/5] END criterion=gini, max_depth=None, n_estimators=100;, score=1.000 total time=   0.1s
[CV 5/5] END criterion=gini, max_depth=None, n_estimators=100;, score=1.000 total ti

In [23]:
cv.best_params_

{'n_estimators': 200, 'max_depth': None, 'criterion': 'gini'}

In [24]:
cv.best_score_

1.0

In [25]:
y_pred = cv.predict(X_test)
y_pred

array([ 1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1,
        1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1,
        1,  1], dtype=int64)

In [26]:
accuracy_score(y_pred,y_test)

1.0

In [27]:
confusion_matrix(y_pred,y_test)

array([[18,  0],
       [ 0, 18]], dtype=int64)

In [28]:
precision_score(y_pred,y_test)

1.0