# Execute this block if you're running this notebook in Colab

In [None]:
!gdown 1iOQziZqU0v7eGaHnCZmdKEOHJP5idc_y
!mkdir /root/.kaggle/
!mv ./kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!pip install -q kaggle
!kaggle competitions download -c tabular-playground-series-aug-2022
!unzip tabular-playground-series-aug-2022.zip

Downloading...
From: https://drive.google.com/uc?id=1iOQziZqU0v7eGaHnCZmdKEOHJP5idc_y
To: /content/kaggle.json
100% 64.0/64.0 [00:00<00:00, 81.9kB/s]
Downloading tabular-playground-series-aug-2022.zip to /content
  0% 0.00/2.27M [00:00<?, ?B/s]
100% 2.27M/2.27M [00:00<00:00, 36.2MB/s]
Archive:  tabular-playground-series-aug-2022.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Start from here if you're running this notebook in other environments

In [None]:
!pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.5.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.0/290.0 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.5.2


In [None]:
from tensorflow.keras.layers import Input, BatchNormalization, Dense, Dropout, concatenate
from tensorflow.nn import sigmoid
from tensorflow.keras.activations import swish
from tensorflow.keras.models import Model
import tensorflow as tf
from sklearn.linear_model import HuberRegressor
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from feature_engine.encoding import WoEEncoder
import pandas as pd
import numpy as np

# Show up to 999 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 999)

In [None]:
# Read the three competition CSV files from the current working directory.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# [ref] https://www.kaggle.com/code/purist1024/principled-3-vs-2-cv-splitting-on-product-code
# Maps each fold name to [product codes used for training, product codes used for validation].
folds_dict = {
    f"Fold {fold_no}": [list(fit_codes), list(holdout_codes)]
    for fold_no, (fit_codes, holdout_codes) in enumerate(
        [
            ("CDE", "AB"),
            ("BDE", "AC"),
            ("BCE", "AD"),
            ("BCD", "AE"),
            ("ADE", "BC"),
            ("ACE", "BD"),
            ("ACD", "BE"),
            ("ABE", "CD"),
            ("ABD", "CE"),
            ("ABC", "DE"),
        ],
        start=1,
    )
}

In [None]:
def preprocessing(df_train, df_test):
    """
    Impute missing measurements and build the engineered features.

    All steps except the final encoding run on the row-wise concatenation of
    `df_train` and `df_test`:

    1. Add `m3_missing` / `m5_missing` indicators and `area = attribute_2 * attribute_3`.
    2. Select imputation targets: `measurement_17` (hand-picked predictors per
       product code) plus the 10 columns among `measurement_3`..`measurement_16`
       whose three largest absolute correlations with other measurement columns
       sum highest. For each of those 10, the 4 most correlated measurement
       columns within each product code are used as predictors.
    3. Per product code and target, fit a `HuberRegressor` on the complete rows
       and fill rows where the target is missing but all predictors are present.
    4. Fill everything still missing in `loading` / `measurement_*` with a
       per-product-code `KNNImputer`.
    5. Add `measurement_avg`, the row mean of `measurement_3`..`measurement_16`.
    6. Weight-of-evidence encode `attribute_0`, fitted on the training rows only.

    Parameters
    ----------
        `df_train`: training frame; must contain `failure` in addition to the feature columns
        `df_test`: test frame with the same feature columns

    Returns
    -------
        `df_train, df_test`: the processed training and test frames (same row order as the inputs)

    Notes
    -----
        Every value of `product_code` must be one of `A`..`I`: the hand-picked
        predictor table for `measurement_17` has no other keys.
    """
    n_train = df_train.shape[0]
    n_top_targets = 10   # number of correlation-ranked columns imputed by regression
    n_predictors = 4     # predictors used per (target, product code)

    data = pd.concat([df_train, df_test])

    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']

    # Correlations are restricted to measurement columns so that neither the
    # target (`failure`) nor derived columns (`area`) can be selected as predictors.
    measurement_cols = [f for f in data.columns if f.startswith('measurement')]
    product_codes = data['product_code'].unique()

    full_fill_dict = {
        'measurement_17': {
            'A': ['measurement_5','measurement_6','measurement_8'],
            'B': ['measurement_4','measurement_5','measurement_7'],
            'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
            'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
            'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
            'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
            'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
            'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
            'I': ['measurement_3','measurement_7','measurement_8']
        }
    }

    # Rank measurement_3..16 by the sum of their three strongest absolute correlations.
    # The matrix is computed once instead of once per candidate column.
    corr_all = data[measurement_cols].corr().abs()
    ranking = []
    for x in range(3, 17):
        target = f'measurement_{x}'
        strongest = corr_all[target].drop(labels=target).sort_values(ascending=False)
        ranking.append((target, np.round(strongest.iloc[:3].sum(), 3)))
    c = pd.DataFrame(ranking, columns=['Selected columns', 'correlation total'])
    c = c.sort_values(by='correlation total', ascending=False).reset_index(drop=True)

    # One correlation matrix per product code, shared by every target.
    corr_by_code = {
        code: data.loc[data['product_code'] == code, measurement_cols].corr().abs()
        for code in product_codes
    }
    for target in c['Selected columns'].head(n_top_targets):
        full_fill_dict[target] = {
            code: (corr_by_code[code][target]
                   .drop(labels=target)
                   .sort_values(ascending=False)
                   .iloc[:n_predictors]
                   .index.tolist())
            for code in product_codes
        }

    feature = [f for f in data.columns if f.startswith('measurement') or f == 'loading']

    for code in product_codes:
        in_code = data['product_code'] == code
        for measurement_col, predictors_by_code in full_fill_dict.items():
            column = predictors_by_code[code]
            # Re-read the product's rows each time: earlier targets may already be imputed.
            tmp_train = data.loc[in_code, column + [measurement_col]].dropna(how='any')
            fillable = in_code & data[column].notnull().all(axis=1) & data[measurement_col].isnull()

            n_filled = 0
            # Skip when there is nothing to fit on or nothing to fill;
            # scikit-learn rejects empty inputs.
            if fillable.any() and not tmp_train.empty:
                # max_iter raised from the default: the optimizer otherwise stops
                # at the iteration limit on this unscaled data.
                model = HuberRegressor(epsilon=1.9, max_iter=1000)
                model.fit(tmp_train[column], tmp_train[measurement_col])
                data.loc[fillable, measurement_col] = model.predict(data.loc[fillable, column])
                n_filled = int(fillable.sum())
            print(f'{measurement_col} : {n_filled}')

        # Whatever is still missing in loading / measurement columns is filled by KNN.
        model1 = KNNImputer(n_neighbors=5)
        data.loc[in_code, feature] = model1.fit_transform(data.loc[in_code, feature])

    data['measurement_avg'] = data[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)

    df_train = data.iloc[:n_train, :]
    df_test = data.iloc[n_train:, :]

    # The encoder is fitted on the training rows only, since only they carry `failure`.
    woe_encoder = WoEEncoder(variables=['attribute_0'])
    woe_encoder.fit(df_train, df_train['failure'])
    df_train = woe_encoder.transform(df_train)
    df_test = woe_encoder.transform(df_test)

    return df_train, df_test

In [None]:
# Run imputation and feature engineering on the raw train/test frames.
df_train, df_test = preprocessing(df_train=train, df_test=test)

Columns selected by correlation sum of the 3 first rows : 


Unnamed: 0,Selected columns,correlation total
0,measurement_8,0.454
1,measurement_11,0.395
2,measurement_5,0.386
3,measurement_6,0.365
4,measurement_7,0.336
5,measurement_4,0.331
6,measurement_15,0.301
7,measurement_10,0.3
8,measurement_16,0.252
9,measurement_14,0.225



-------- Product code A ----------

filled by linear model :
measurement_17 : 386
measurement_8 : 167
measurement_11 : 225
measurement_5 : 113


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_6 : 146
measurement_7 : 153


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_4 : 79
measurement_15 : 273
measurement_10 : 209
measurement_16 : 293
measurement_14 : 237

2281 filled by linear model 
1568 filled by KNN 

-------- Product code B ----------

filled by linear model :
measurement_17 : 418
measurement_8 : 165
measurement_11 : 220
measurement_5 : 83
measurement_6 : 106
measurement_7 : 176
measurement_4 : 80
measurement_15 : 294
measurement_10 : 197
measurement_16 : 358
measurement_14 : 330

2427 filled by linear model 
1548 filled by KNN 

-------- Product code C ----------

filled by linear model :
measurement_17 : 391
measurement_8 : 211
measurement_11 : 231
measurement_5 : 141


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_6 : 150
measurement_7 : 140
measurement_4 : 110
measurement_15 : 319
measurement_10 : 262
measurement_16 : 343
measurement_14 : 340

2638 filled by linear model 
1706 filled by KNN 

-------- Product code D ----------

filled by linear model :
measurement_17 : 398
measurement_8 : 146
measurement_11 : 265
measurement_5 : 87
measurement_6 : 118
measurement_7 : 146
measurement_4 : 88
measurement_15 : 313
measurement_10 : 174
measurement_16 : 322
measurement_14 : 316

2373 filled by linear model 
1600 filled by KNN 

-------- Product code E ----------

filled by linear model :
measurement_17 : 429


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_8 : 171
measurement_11 : 244
measurement_5 : 116


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_6 : 127
measurement_7 : 185
measurement_4 : 105
measurement_15 : 315
measurement_10 : 193
measurement_16 : 316
measurement_14 : 297

2498 filled by linear model 
1634 filled by KNN 

-------- Product code F ----------

filled by linear model :
measurement_17 : 420
measurement_8 : 194
measurement_11 : 226
measurement_5 : 90


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_6 : 137
measurement_7 : 147
measurement_4 : 91
measurement_15 : 333
measurement_10 : 186
measurement_16 : 356
measurement_14 : 348

2528 filled by linear model 
1545 filled by KNN 

-------- Product code G ----------

filled by linear model :
measurement_17 : 373


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_8 : 188
measurement_11 : 221
measurement_5 : 104
measurement_6 : 146
measurement_7 : 145
measurement_4 : 93


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_15 : 299
measurement_10 : 226
measurement_16 : 343
measurement_14 : 268

2406 filled by linear model 
1518 filled by KNN 

-------- Product code H ----------

filled by linear model :
measurement_17 : 361


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_8 : 147
measurement_11 : 205


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_5 : 112
measurement_6 : 121
measurement_7 : 158
measurement_4 : 75
measurement_15 : 299
measurement_10 : 217
measurement_16 : 340
measurement_14 : 283

2318 filled by linear model 
1565 filled by KNN 

-------- Product code I ----------

filled by linear model :
measurement_17 : 377
measurement_8 : 192
measurement_11 : 209


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_5 : 119


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_6 : 132
measurement_7 : 136


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


measurement_4 : 89
measurement_15 : 350
measurement_10 : 246
measurement_16 : 294
measurement_14 : 283

2427 filled by linear model 
1402 filled by KNN 


In [None]:
# Columns fed to the neural network, in input order.
features = [
    'loading',
    'attribute_0',
    'measurement_17',
    *[f'measurement_{idx}' for idx in range(3)],
    'area',
    'm3_missing',
    'm5_missing',
    'measurement_avg',
]

In [None]:
# Rate passed to the Dropout layer of the network.
DROPOUT_RATE = 0.2
# Width of the model input: one unit per selected feature column.
NUM_FEATURE = len(features)

In [None]:
# [ref] https://www.kaggle.com/competitions/tabular-playground-series-aug-2022/discussion/349385
def get_stacked_dense(x, cur_dim, stack_depth, activation):
    """
    Chain `stack_depth` dense layers, each one unit narrower than the previous.

    Parameters
    ----------
        `x`: tensor the first dense layer is applied to
        `cur_dim`: number of units of the first dense layer
        `stack_depth`: how many dense layers to chain
        `activation`: activation function of every dense layer

    Returns
    -------
        `x`: output tensor of the last dense layer (the input itself when `stack_depth` is 0)
    """
    for shrink in range(stack_depth):
        units = cur_dim - shrink
        x = Dense(units, activation=activation)(x)
    return x

def get_triple_stacked(x_0, start_dim):
    """
    Build a three-level binary tree of swish dense layers on top of `x_0`.

    A dense layer of `start_dim` units is split twice into two narrower dense
    layers; each of the four resulting nodes feeds two leaves, and every leaf
    is a stack of two dense layers followed by batch normalization.

    Parameters
    ----------
        `x_0`: tensor the tree is built on
        `start_dim`: number of units of the root dense layer

    Returns
    -------
        tuple of the 8 batch-normalized leaf outputs, left-most leaf first
    """
    def _leaf(parent, first_units):
        # Two stacked swish dense layers, then batch normalization.
        stacked = get_stacked_dense(parent, first_units, 2, swish)
        return BatchNormalization()(stacked)

    root = Dense(start_dim, swish)(x_0)

    # Left half of the tree.
    left = Dense(start_dim - 1, swish)(root)
    left_a = Dense(start_dim - 2, swish)(left)
    leaf_0 = _leaf(left_a, start_dim - 3)
    leaf_1 = _leaf(left_a, start_dim - 4)
    left_b = Dense(start_dim - 3, swish)(left)
    leaf_2 = _leaf(left_b, start_dim - 4)
    leaf_3 = _leaf(left_b, start_dim - 5)

    # Right half of the tree.
    right = Dense(start_dim - 2, swish)(root)
    right_a = Dense(start_dim - 3, swish)(right)
    leaf_4 = _leaf(right_a, start_dim - 4)
    leaf_5 = _leaf(right_a, start_dim - 5)
    right_b = Dense(start_dim - 4, swish)(right)
    leaf_6 = _leaf(right_b, start_dim - 5)
    leaf_7 = _leaf(right_b, start_dim - 6)

    return leaf_0, leaf_1, leaf_2, leaf_3, leaf_4, leaf_5, leaf_6, leaf_7

def get_model(feature_num):
    """
    Assemble the network: a shared stem, two dense trees, and a sigmoid head.

    Parameters
    ----------
        `feature_num`: number of input features (width of the input layer)

    Returns
    -------
        `model`: the assembled, uncompiled Keras model with a single sigmoid output
    """
    model_input = Input(shape=(feature_num,))

    # Shared stem: dense -> batch norm -> dropout -> dense.
    hidden = Dense(20, swish)(model_input)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(DROPOUT_RATE)(hidden)
    stem = Dense(20, swish)(hidden)

    # Two trees of different widths on the same stem; all 16 leaves are concatenated.
    leaf_outputs = [
        *get_triple_stacked(stem, 19),
        *get_triple_stacked(stem, 18),
    ]
    merged = concatenate(leaf_outputs)

    head = Dense(20, swish)(merged)
    probability = Dense(1, sigmoid)(head)
    return Model(model_input, probability)

In [None]:
# Stop training once val_loss has not improved for 12 epochs and roll back to the best weights.
ES = tf.keras.callbacks.EarlyStopping(
            monitor="val_loss", patience=12, mode="min", restore_best_weights=True, verbose=2)
# Halve the learning rate after 5 epochs without val_loss improvement.
# `restore_best_weights` is an EarlyStopping option, not a ReduceLROnPlateau one, so it is not passed here.
LR = tf.keras.callbacks.ReduceLROnPlateau(
            monitor="val_loss", factor=0.5, patience=5, mode="min", min_lr=1e-12, verbose=2)

In [None]:
# Train one model per product-code fold and average the test predictions over all folds.
test_predictions = np.zeros((df_test.shape[0], 1))
total_folds = len(folds_dict)
x_test = df_test[features].values  # identical for every fold, so extracted once
for i, (fold, (train_codes, val_codes)) in enumerate(folds_dict.items()):

    print(fold)

    # Release the Keras global state left behind by the previous fold's model;
    # without this, memory use grows with every model built in the loop.
    tf.keras.backend.clear_session()

    # Split by product code: rows of the training codes vs. rows of the held-out codes.
    train_rows = df_train['product_code'].isin(train_codes)
    val_rows = df_train['product_code'].isin(val_codes)
    x_train, y_train = df_train.loc[train_rows, features].values, df_train.loc[train_rows, 'failure'].values
    x_val, y_val = df_train.loc[val_rows, features].values, df_train.loc[val_rows, 'failure'].values

    model = get_model(NUM_FEATURE)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2), loss='binary_crossentropy', metrics=['binary_accuracy'])
    history = model.fit(x_train, y_train,
                        batch_size=32,
                        epochs=300,
                        validation_data=(x_val, y_val),
                        callbacks=[ES, LR],
                        verbose=1)
    history_df = pd.DataFrame(history.history)
    print(f"Min val loss: {history_df['val_loss'].min()}")
    model.save(f"model{i}.h5")
    y_pred = model.predict(x_test)
    # Each fold contributes an equal share to the final prediction.
    test_predictions += y_pred / total_folds

Fold 1
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.004999999888241291.
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 13: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 21: ReduceLROnPlateau reducing learning rate to 0.0012499999720603228.
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 29: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 35: ReduceLROnPlateau reducing learning rate to 0.0003124999930150807.
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 40: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 41/300
Epoch 42

In [None]:
# Pair every test id with its fold-averaged prediction and write the submission file.
sub = pd.DataFrame({'id': df_test['id']}).assign(failure=test_predictions.reshape(-1))
sub.to_csv('TF_NN_cross.csv', index=False)

In [None]:
# Upload TF_NN_cross.csv to the competition through the Kaggle CLI.
!kaggle competitions submit -c tabular-playground-series-aug-2022 -f TF_NN_cross.csv -m "Tensorflow NN cross"

100% 520k/520k [00:03<00:00, 160kB/s]
Successfully submitted to Tabular Playground Series - Aug 2022

In [None]:
# Bundle every .h5 file in the working directory into models.zip.
!zip models.zip ./*.h5

  adding: model0.h5 (deflated 71%)
  adding: model1.h5 (deflated 71%)
  adding: model2.h5 (deflated 71%)
  adding: model3.h5 (deflated 71%)
  adding: model4.h5 (deflated 71%)
  adding: model5.h5 (deflated 71%)
  adding: model6.h5 (deflated 71%)
  adding: model7.h5 (deflated 71%)
  adding: model8.h5 (deflated 71%)
  adding: model9.h5 (deflated 71%)
