In [1]:
from pycaret.classification import *
from sklearn.model_selection import train_test_split

import pandas as pd
import os

In [2]:
ROOT_DIR = "../data"
RANDOM_STATE = 200

train_data = pd.read_csv(os.path.join(ROOT_DIR, "pca_train.csv"))
train_data.rename(columns={'12': 'target'}, inplace=True)

df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

normal_ratio = 0.5

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)

## under sampling
df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)

train_data = df_concat

# df_train, df_val = train_test_split(
#     df_concat,
#     test_size=0.3,
#     stratify=df_concat["target"],
#     random_state=RANDOM_STATE,
# )

In [3]:
setup_clf = setup(data=train_data, target="target", train_size=0.7, session_id=333, fold_shuffle=True)

Unnamed: 0,Description,Value
0,Session id,333
1,Target,target
2,Target type,Binary
3,Target mapping,"AbNormal: 0, Normal: 1"
4,Original data shape,"(3525, 37)"
5,Transformed data shape,"(3525, 61)"
6,Transformed train set shape,"(2467, 61)"
7,Transformed test set shape,"(1058, 61)"
8,Numeric features,24
9,Categorical features,12


In [4]:
model = compare_models(sort='F1', n_select=10)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.6676,0.6248,0.6676,0.6445,0.6474,0.1849,0.1922,0.024
lda,Linear Discriminant Analysis,0.6595,0.6233,0.6595,0.6396,0.6443,0.1803,0.1845,0.023
gbc,Gradient Boosting Classifier,0.6583,0.6171,0.6583,0.6377,0.6421,0.1747,0.1795,0.33
ada,Ada Boost Classifier,0.6518,0.6139,0.6518,0.6361,0.6413,0.1769,0.179,0.084
rf,Random Forest Classifier,0.6429,0.6133,0.6429,0.6303,0.6347,0.1649,0.1665,0.102
lightgbm,Light Gradient Boosting Machine,0.6389,0.6052,0.6389,0.6238,0.6289,0.1499,0.1516,0.294
et,Extra Trees Classifier,0.6336,0.6049,0.6336,0.6256,0.6286,0.1557,0.1566,0.076
dt,Decision Tree Classifier,0.619,0.5734,0.619,0.6209,0.6197,0.1466,0.1467,0.033
knn,K Neighbors Classifier,0.6348,0.5752,0.6348,0.6078,0.6129,0.107,0.1119,0.026
lr,Logistic Regression,0.6632,0.5713,0.6632,0.6072,0.5858,0.0654,0.0858,0.232


In [5]:
tuned_model = [tune_model(i) for i in model[:9]]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6883,0.66,0.6883,0.6692,0.6698,0.2404,0.2494
1,0.6883,0.6073,0.6883,0.6647,0.6518,0.1989,0.2224
2,0.6721,0.6506,0.6721,0.6402,0.6355,0.1543,0.1707
3,0.6761,0.6499,0.6761,0.6496,0.6489,0.1848,0.197
4,0.6883,0.6096,0.6883,0.6654,0.6632,0.218,0.2314
5,0.6478,0.5518,0.6478,0.6146,0.6194,0.1164,0.1236
6,0.668,0.6469,0.668,0.6366,0.6351,0.1528,0.1661
7,0.6829,0.6325,0.6829,0.6587,0.6564,0.2041,0.2176
8,0.6707,0.615,0.6707,0.6508,0.6547,0.2033,0.2087
9,0.7073,0.6791,0.7073,0.6896,0.6828,0.2653,0.2829


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6599,0.6148,0.6599,0.5973,0.5603,0.0301,0.0542
1,0.6599,0.5613,0.6599,0.5867,0.5483,0.0152,0.0335
2,0.6599,0.5363,0.6599,0.5924,0.5685,0.0312,0.0503
3,0.6883,0.5903,0.6883,0.7874,0.5798,0.0798,0.2039
4,0.6721,0.6339,0.6721,0.6491,0.558,0.032,0.0818
5,0.6518,0.5329,0.6518,0.4988,0.5341,-0.0236,-0.0554
6,0.6761,0.5403,0.6761,0.653,0.5784,0.0628,0.1169
7,0.6667,0.6105,0.6667,0.4444,0.5333,0.0,0.0
8,0.6707,0.5808,0.6707,0.6396,0.5626,0.0395,0.0864
9,0.6707,0.5591,0.6707,0.6481,0.5563,0.0319,0.0815


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6842,0.6483,0.6842,0.6661,0.6683,0.2379,0.2447
1,0.6842,0.6028,0.6842,0.6613,0.6584,0.2127,0.2264
2,0.6599,0.6214,0.6599,0.6324,0.636,0.1556,0.1632
3,0.6802,0.6342,0.6802,0.6575,0.6587,0.2085,0.2179
4,0.6397,0.5667,0.6397,0.6216,0.6277,0.1429,0.1449
5,0.6194,0.5408,0.6194,0.5936,0.6018,0.0799,0.0818
6,0.668,0.6518,0.668,0.6451,0.6489,0.1867,0.1932
7,0.6707,0.6186,0.6707,0.6508,0.6547,0.2033,0.2087
8,0.6626,0.6147,0.6626,0.6432,0.648,0.1889,0.193
9,0.7033,0.6878,0.7033,0.685,0.6815,0.2626,0.2765


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6802,0.6508,0.6802,0.6567,0.6551,0.2052,0.2175
1,0.6802,0.605,0.6802,0.6531,0.6428,0.1781,0.1991
2,0.6721,0.6402,0.6721,0.6391,0.6327,0.1483,0.1663
3,0.668,0.6272,0.668,0.6379,0.6377,0.1586,0.1707
4,0.668,0.5717,0.668,0.6422,0.6447,0.1757,0.1842
5,0.6478,0.5334,0.6478,0.6146,0.6194,0.1164,0.1236
6,0.664,0.6367,0.664,0.6289,0.6265,0.1334,0.1476
7,0.6911,0.6172,0.6911,0.6676,0.658,0.2083,0.2292
8,0.6667,0.6119,0.6667,0.6439,0.6475,0.1854,0.1919
9,0.7114,0.689,0.7114,0.695,0.684,0.268,0.29


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.664,0.6189,0.664,0.6379,0.6399,0.1704,0.1793
1,0.664,0.6139,0.664,0.6365,0.6376,0.165,0.1749
2,0.6761,0.5983,0.6761,0.6446,0.6358,0.1558,0.176
3,0.6761,0.6267,0.6761,0.6464,0.6414,0.1676,0.1844
4,0.668,0.5543,0.668,0.6407,0.6424,0.1701,0.1797
5,0.6478,0.5706,0.6478,0.6206,0.6264,0.1342,0.1393
6,0.6842,0.6341,0.6842,0.6608,0.6599,0.2105,0.2225
7,0.687,0.6088,0.687,0.6635,0.6596,0.2116,0.2267
8,0.6667,0.6061,0.6667,0.6486,0.6531,0.2013,0.2052
9,0.7114,0.6578,0.7114,0.695,0.684,0.268,0.29


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.664,0.6495,0.664,0.4409,0.5299,0.0,0.0
1,0.664,0.6024,0.664,0.4409,0.5299,0.0,0.0
2,0.668,0.6268,0.668,0.4462,0.5351,0.0,0.0
3,0.668,0.6337,0.668,0.4462,0.5351,0.0,0.0
4,0.668,0.5782,0.668,0.4462,0.5351,0.0,0.0
5,0.668,0.5297,0.668,0.4462,0.5351,0.0,0.0
6,0.668,0.6564,0.668,0.4462,0.5351,0.0,0.0
7,0.6667,0.6312,0.6667,0.4444,0.5333,0.0,0.0
8,0.6667,0.6005,0.6667,0.4444,0.5333,0.0,0.0
9,0.6667,0.6804,0.6667,0.4444,0.5333,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6802,0.6473,0.6802,0.6567,0.6551,0.2052,0.2175
1,0.6802,0.6008,0.6802,0.6531,0.6428,0.1781,0.1991
2,0.664,0.636,0.664,0.6304,0.6293,0.1394,0.1525
3,0.6761,0.6295,0.6761,0.6496,0.6489,0.1848,0.197
4,0.668,0.5672,0.668,0.6422,0.6447,0.1757,0.1842
5,0.6316,0.5365,0.6316,0.5992,0.6069,0.0883,0.0922
6,0.668,0.6546,0.668,0.6352,0.6324,0.1468,0.1615
7,0.7033,0.6218,0.7033,0.6843,0.6773,0.2526,0.2705
8,0.6667,0.6036,0.6667,0.6439,0.6475,0.1854,0.1919
9,0.7154,0.6894,0.7154,0.7001,0.6895,0.2808,0.3023


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.668,0.6163,0.668,0.6435,0.6453,0.1831,0.1919
1,0.6397,0.6099,0.6397,0.6303,0.6342,0.1703,0.1709
2,0.6842,0.6118,0.6842,0.6565,0.6449,0.1769,0.1998
3,0.6883,0.6364,0.6883,0.6654,0.6632,0.218,0.2314
4,0.6559,0.5792,0.6559,0.6283,0.6328,0.1484,0.1551
5,0.6275,0.5551,0.6275,0.598,0.606,0.0875,0.0905
6,0.6802,0.6354,0.6802,0.6552,0.6544,0.1977,0.2098
7,0.6748,0.6015,0.6748,0.6484,0.6475,0.1837,0.1958
8,0.6585,0.5992,0.6585,0.6431,0.648,0.1923,0.1946
9,0.7073,0.6516,0.7073,0.6896,0.6828,0.2653,0.2829


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6437,0.5765,0.6437,0.5603,0.556,0.0065,0.0097
1,0.6437,0.5933,0.6437,0.5761,0.5704,0.0281,0.0375
2,0.6599,0.5414,0.6599,0.5997,0.5787,0.046,0.067
3,0.6721,0.6107,0.6721,0.632,0.5865,0.0694,0.1087
4,0.6721,0.6031,0.6721,0.632,0.5865,0.0694,0.1087
5,0.6356,0.4981,0.6356,0.5254,0.5433,-0.03,-0.0458
6,0.6721,0.5615,0.6721,0.632,0.6005,0.0905,0.1242
7,0.687,0.6555,0.687,0.6757,0.6101,0.1217,0.1813
8,0.6667,0.6023,0.6667,0.6178,0.5772,0.0538,0.0869
9,0.6667,0.5585,0.6667,0.6178,0.5772,0.0538,0.0869


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [6]:
blended_hard = blend_models(estimator_list=tuned_model,
                       fold=10,
                       method="hard",
                       optimize="F1")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6761,0.0,0.6761,0.6522,0.6518,0.1978,0.2088
1,0.6721,0.0,0.6721,0.6415,0.6337,0.1573,0.1759
2,0.6802,0.0,0.6802,0.6502,0.6389,0.1634,0.1859
3,0.6842,0.0,0.6842,0.6576,0.6503,0.1885,0.2073
4,0.664,0.0,0.664,0.6349,0.6369,0.1571,0.1667
5,0.6397,0.0,0.6397,0.6022,0.6081,0.0899,0.0963
6,0.6721,0.0,0.6721,0.6414,0.6382,0.1602,0.1751
7,0.6911,0.0,0.6911,0.6676,0.658,0.2083,0.2292
8,0.6707,0.0,0.6707,0.6479,0.6508,0.1927,0.2001
9,0.7073,0.0,0.7073,0.6897,0.6784,0.2552,0.2775


In [7]:
final_model = finalize_model(blended_hard)
evaluate_model(final_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [8]:
final_model

In [9]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "pca_test.csv"))

train_columns = train_data.columns
#test_data = test_data[train_columns]

df_test_x = test_data#.drop(columns=['target'])

In [10]:
test_pred = predict_model(final_model, data=test_data)
test_pred



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,prediction_label
0,Dam dispenser #2,AJX75334501,3J1XF767-1,AJX75334501,3J1XF767-1,OK,Fill1 dispenser #2,AJX75334501,3J1XF767-1,Fill2 dispenser #2,...,0.455273,0.269325,-1.238153,0.492180,6.959485,-5.926402,2.009640,-1.824390,-2.626029,AbNormal
1,Dam dispenser #2,AJX75334501,4B1XD472-2,AJX75334501,4B1XD472-2,OK,Fill1 dispenser #2,AJX75334501,4B1XD472-2,Fill2 dispenser #2,...,9.639068,20.259314,3.928456,-4.542729,-22.891991,-21.994287,0.685102,10.901917,14.508316,Normal
2,Dam dispenser #1,AJX75334501,3H1XE355-1,AJX75334501,3H1XE355-1,OK,Fill1 dispenser #1,AJX75334501,3H1XE355-1,Fill2 dispenser #1,...,-0.000310,-0.845370,-2.121011,1.438319,12.259931,-8.430806,4.413903,-2.896132,-13.369293,AbNormal
3,Dam dispenser #2,AJX75334501,3L1XA128-1,AJX75334501,3L1XA128-1,OK,Fill1 dispenser #2,AJX75334501,3L1XA128-1,Fill2 dispenser #2,...,7.138623,-20.968945,-10.638637,5.445040,19.410360,18.703112,-12.371858,2.363786,23.805340,Normal
4,Dam dispenser #1,AJX75334501,4A1XA639-1,AJX75334501,4A1XA639-1,OK,Fill1 dispenser #1,AJX75334501,4A1XA639-1,Fill2 dispenser #1,...,1.432299,-10.076963,-4.993634,1.617880,9.122725,8.559978,-4.423807,-1.227007,2.612746,AbNormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,Dam dispenser #2,AJX75334501,3K1XB597-1,AJX75334501,3K1XB597-1,OK,Fill1 dispenser #2,AJX75334501,3K1XB597-1,Fill2 dispenser #2,...,4.206943,-23.325636,-22.888144,6.355010,23.373249,23.745441,-1.180442,-9.862477,-8.717693,AbNormal
17357,Dam dispenser #2,AJX75334501,4A1XB974-1,AJX75334501,4A1XB974-1,OK,Fill1 dispenser #2,AJX75334501,4A1XB974-1,Fill2 dispenser #2,...,-0.705675,9.064880,5.279408,-1.777022,-8.853498,-7.986886,4.092596,1.211678,-2.399325,Normal
17358,Dam dispenser #1,AJX75334501,3L1XA998-1,AJX75334501,3L1XA998-1,OK,Fill1 dispenser #1,AJX75334501,3L1XA998-1,Fill2 dispenser #1,...,11.462799,-2.212937,-18.890753,2.500339,-0.450027,2.360564,0.850209,-1.942946,-4.352603,AbNormal
17359,Dam dispenser #1,AJX75334501,3F1XC376-1,AJX75334501,3F1XC376-1,OK,Fill1 dispenser #1,AJX75334501,3F1XC376-1,Fill2 dispenser #1,...,2.285775,0.859113,-2.722195,-0.715221,-22.827568,21.756470,-1.748279,-0.944664,-2.326972,AbNormal


In [11]:
df_sub = pd.read_csv("../data/submission.csv")
df_sub["target"] = test_pred["prediction_label"]
df_sub.to_csv("../data/automl/submission-14.csv", index=False)