# Execute this block if you're running this notebook in Colab

In [1]:
# NOTE(review): this downloads a kaggle.json API token from a shared Google
# Drive file. Anyone with the link can use that token; prefer uploading your
# own kaggle.json (or setting KAGGLE_USERNAME / KAGGLE_KEY) instead.
!gdown 1iOQziZqU0v7eGaHnCZmdKEOHJP5idc_y
# -p: do not fail when the directory already exists (cell can be re-run).
!mkdir -p /root/.kaggle/
!mv ./kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
# %pip installs into the environment of the running kernel.
%pip install -q kaggle
!kaggle competitions download -c tabular-playground-series-aug-2022
# -o: overwrite already-extracted files instead of prompting on a re-run.
!unzip -o tabular-playground-series-aug-2022.zip

Downloading...
From: https://drive.google.com/uc?id=1iOQziZqU0v7eGaHnCZmdKEOHJP5idc_y
To: /content/kaggle.json
100% 64.0/64.0 [00:00<00:00, 95.6kB/s]
Downloading tabular-playground-series-aug-2022.zip to /content
  0% 0.00/2.27M [00:00<?, ?B/s]
100% 2.27M/2.27M [00:00<00:00, 57.5MB/s]
Archive:  tabular-playground-series-aug-2022.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Start from here if you're running this notebook in other environments

In [3]:
# Pinned to the version this notebook was last executed with, so that the
# WoEEncoder behaviour stays reproducible.
%pip install feature_engine==1.5.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.5.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.0/290.0 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.5.2


In [4]:
from tensorflow.keras.layers import Input, BatchNormalization, Dense, Dropout, concatenate
from tensorflow.nn import sigmoid
from tensorflow.keras.activations import swish
from tensorflow.keras.models import Model
import tensorflow as tf
from sklearn.linear_model import HuberRegressor
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from feature_engine.encoding import WoEEncoder
import pandas as pd
import numpy as np

## Loading datasets

In [5]:
# Read the competition files extracted into the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

## Data pre-processing

In [20]:
def preprocessing(df_train, df_test):
    """
    Impute missing values and build the engineered features for both splits.

    The two frames are stacked and processed together, one `product_code` at a
    time:

    1. flag rows where `measurement_3` / `measurement_5` are missing and add
       `area = attribute_2 * attribute_3`;
    2. fill a missing measurement with a `HuberRegressor` fitted on the rows of
       the same product code, using a fixed predictor list for
       `measurement_17` and, for the 10 best-correlated columns among
       `measurement_3` .. `measurement_16`, their 4 most correlated columns;
    3. fill whatever is still missing in `loading` and the measurement columns
       with a 3-nearest-neighbour imputer;
    4. add `measurement_avg`, the row mean of `measurement_3` .. `measurement_16`;
    5. encode `attribute_0` with a `WoEEncoder` fitted on the training rows.

    Parameters
    ----------
        `df_train`: the raw training frame; must contain the `failure` column
        `df_test`: the raw test frame with the same feature columns

    Returns
    -------
        `df_train, df_test`: the processed training and test frames
    """
    n_train = df_train.shape[0]
    data = pd.concat([df_train, df_test])

    # The missing-value flags must be taken before anything is imputed.
    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']

    # Predictor columns per product code for every column to be regressed.
    # Only measurement_17 is hard-coded; a product code that is not listed
    # here raises KeyError in the imputation loop below.
    full_fill_dict = {
        'measurement_17': {
            'A': ['measurement_5', 'measurement_6', 'measurement_8'],
            'B': ['measurement_4', 'measurement_5', 'measurement_7'],
            'C': ['measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
            'D': ['measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],
            'E': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_8'],
            'F': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_7'],
            'G': ['measurement_4', 'measurement_6', 'measurement_8', 'measurement_9'],
            'H': ['measurement_4', 'measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
            'I': ['measurement_3', 'measurement_7', 'measurement_8'],
        }
    }

    # Columns excluded from the correlation analysis.
    non_measurement_cols = (
        [c for c in df_test.columns if 'measurement' not in c]
        + ['loading', 'm3_missing', 'm5_missing']
    )

    # Rank measurement_3..16 by the summed absolute correlation with their
    # three strongest partners (position 0 after sorting is the column itself).
    # The matrix does not change inside the loop, so it is computed once.
    abs_corr = np.absolute(data.drop(non_measurement_cols, axis=1).corr())
    candidates = [f'measurement_{x}' for x in range(3, 17)]
    totals = [
        np.round(np.sum(abs_corr[name].sort_values(ascending=False).iloc[1:4]), 3)
        for name in candidates
    ]
    ranking = pd.DataFrame({'Selected columns': candidates, 'correlation total': totals})
    ranking = ranking.sort_values(by='correlation total', ascending=False).reset_index(drop=True)

    # For the 10 top-ranked columns pick, per product code, the 4 columns with
    # the highest absolute correlation. One matrix per product code suffices.
    codes = data.product_code.unique()
    abs_corr_by_code = {
        code: np.absolute(data[data.product_code == code].drop(non_measurement_cols, axis=1).corr())
        for code in codes
    }
    for measurement_col in ranking['Selected columns'].iloc[:10]:
        full_fill_dict[measurement_col] = {
            code: abs_corr_by_code[code][measurement_col]
            .sort_values(ascending=False).iloc[1:5].index.tolist()
            for code in codes
        }

    feature = [f for f in data.columns if f.startswith('measurement') or f == 'loading']

    for code in codes:
        code_mask = data.product_code == code

        # Regression pass. `data` is updated in place, so values filled for
        # one column are visible when later columns are processed.
        for measurement_col, predictors_by_code in full_fill_dict.items():
            column = predictors_by_code[code]
            tmp = data[code_mask]
            tmp_train = tmp[column + [measurement_col]].dropna(how='any')
            # Rows of this product code whose target is missing while all
            # predictors are present.
            fill_mask = (
                code_mask
                & data[column].notnull().all(axis=1)
                & data[measurement_col].isnull()
            )
            # Nothing to fit on or nothing to fill: leave it to the KNN pass
            # (scikit-learn estimators raise on zero-row input).
            if tmp_train.empty or not fill_mask.any():
                continue

            # NOTE(review): lbfgs convergence warnings are emitted with the
            # default max_iter; it is left unchanged because a different
            # setting would alter the imputed values - confirm before tuning.
            model = HuberRegressor(epsilon=1.9)
            model.fit(tmp_train[column], tmp_train[measurement_col])
            data.loc[fill_mask, measurement_col] = model.predict(data.loc[fill_mask, column])

        # KNN pass: fill every value still missing in the numeric
        # loading/measurement columns of this product code.
        knn_imputer = KNNImputer(n_neighbors=3)
        data.loc[code_mask, feature] = knn_imputer.fit_transform(data.loc[code_mask, feature])

    data['measurement_avg'] = data[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)

    # Undo the stacking: the first n_train rows are the training rows.
    df_train = data.iloc[:n_train, :]
    df_test = data.iloc[n_train:, :]

    woe_encoder = WoEEncoder(variables=['attribute_0'])
    woe_encoder.fit(df_train, df_train['failure'])
    df_train = woe_encoder.transform(df_train)
    df_test = woe_encoder.transform(df_test)

    return df_train, df_test

In [21]:
# Only the processed test frame is needed for inference. The result is indexed
# rather than unpacked into `_`, which IPython reserves for the last output.
df_test = preprocessing(train, test)[1]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

## Loading the models and downloading the weights. ([Link to the model](https://drive.google.com/file/d/10EaRD58y6gF8YA688_sPbVz06xOMk7YH/view?usp=sharing))

In [8]:
# Fetch the archive holding the saved weight files (model0.h5 .. model9.h5).
!gdown 10EaRD58y6gF8YA688_sPbVz06xOMk7YH
# -o: overwrite already-extracted files instead of prompting on a re-run.
!unzip -o models.zip

Downloading...
From: https://drive.google.com/uc?id=10EaRD58y6gF8YA688_sPbVz06xOMk7YH
To: /content/models.zip
  0% 0.00/2.23M [00:00<?, ?B/s]100% 2.23M/2.23M [00:00<00:00, 151MB/s]
Archive:  models.zip
  inflating: model0.h5               
  inflating: model1.h5               
  inflating: model2.h5               
  inflating: model3.h5               
  inflating: model4.h5               
  inflating: model5.h5               
  inflating: model6.h5               
  inflating: model7.h5               
  inflating: model8.h5               
  inflating: model9.h5               


In [9]:
# Columns of the processed test frame that are passed to the networks.
features = [
    'loading',
    'attribute_0',
    'measurement_17',
    'measurement_0',
    'measurement_1',
    'measurement_2',
    'area',
    'm3_missing',
    'm5_missing',
    'measurement_avg',
]
NUM_FEATURE = len(features)  # width of the model input layer
DROPOUT_RATE = 0.2           # rate of the Dropout layer in get_model
NUM_MODEL = 10               # number of weight files averaged at inference

In [10]:
# [ref] https://www.kaggle.com/competitions/tabular-playground-series-aug-2022/discussion/349385
def get_stacked_dense(x, cur_dim, stack_depth, activation):
    """
    Chain `stack_depth` Dense layers whose widths shrink by one at each step.

    Parameters
    ----------
        `x`: the tensor the first Dense layer is applied to
        `cur_dim`: the width of the first Dense layer
        `stack_depth`: how many Dense layers to chain
        `activation`: the activation passed to every Dense layer

    Returns
    -------
        `x`: the output of the last Dense layer (the input itself when
        `stack_depth` is not positive)
    """
    # Widths run cur_dim, cur_dim-1, ..., cur_dim-stack_depth+1.
    for width in range(cur_dim, cur_dim - stack_depth, -1):
        layer = Dense(width, activation)
        x = layer(x)
    return x

def get_triple_stacked(x_0, start_dim):
    """
    Grow a binary tree of swish Dense layers from `x_0` and return its leaves.

    A root layer of width `start_dim` splits into two branches, each branch
    splits into two sub-branches, and every sub-branch ends in two heads.
    A head is two stacked Dense layers followed by batch normalization.

    Parameters
    ----------
        `x_0`: the tensor the root layer is applied to
        `start_dim`: the width of the root layer; deeper layers are narrower

    Returns
    -------
        a tuple with the eight head outputs, in the order they are built
    """
    def head(parent, first_dim):
        # Dense(first_dim) -> Dense(first_dim - 1) -> BatchNormalization
        stacked = get_stacked_dense(parent, first_dim, 2, swish)
        return BatchNormalization()(stacked)

    # NOTE(review): the layers are created in the same order as before,
    # presumably because the saved weights are loaded positionally - confirm
    # before reordering anything here.
    root = Dense(start_dim, swish)(x_0)

    # First branch.
    branch_a = Dense(start_dim - 1, swish)(root)
    sub_a0 = Dense(start_dim - 2, swish)(branch_a)
    head_0 = head(sub_a0, start_dim - 3)
    head_1 = head(sub_a0, start_dim - 4)
    sub_a1 = Dense(start_dim - 3, swish)(branch_a)
    head_2 = head(sub_a1, start_dim - 4)
    head_3 = head(sub_a1, start_dim - 5)

    # Second branch.
    branch_b = Dense(start_dim - 2, swish)(root)
    sub_b0 = Dense(start_dim - 3, swish)(branch_b)
    head_4 = head(sub_b0, start_dim - 4)
    head_5 = head(sub_b0, start_dim - 5)
    sub_b1 = Dense(start_dim - 4, swish)(branch_b)
    head_6 = head(sub_b1, start_dim - 5)
    head_7 = head(sub_b1, start_dim - 6)

    return head_0, head_1, head_2, head_3, head_4, head_5, head_6, head_7

def get_model(feature_num):
    """
    Assemble the network: a shared stem, two layer trees, and a sigmoid output.

    Parameters
    ----------
        `feature_num`: the number of input features per sample

    Returns
    -------
        `model`: the assembled (unweighted) Keras model
    """
    inputs = Input(shape=(feature_num,))

    # Shared stem: Dense -> BatchNormalization -> Dropout -> Dense.
    stem = Dense(20, swish)(inputs)
    stem = BatchNormalization()(stem)
    stem = Dropout(DROPOUT_RATE)(stem)
    stem = Dense(20, swish)(stem)

    # Two trees of eight heads each, rooted at widths 19 and 18; all sixteen
    # head outputs are concatenated in construction order.
    heads = []
    for root_dim in (19, 18):
        heads.extend(get_triple_stacked(stem, root_dim))
    merged = concatenate(heads)

    hidden = Dense(20, swish)(merged)
    output = Dense(1, sigmoid)(hidden)
    return Model(inputs, output)

In [12]:
# Average the predictions of the NUM_MODEL saved networks: each network
# contributes 1/NUM_MODEL of its output to the running total.
test_inputs = df_test[features].values
test_predictions = np.zeros((len(df_test), 1))
for i in range(NUM_MODEL):
    model = get_model(NUM_FEATURE)
    model.load_weights(f"model{i}.h5")
    y_pred = model.predict(test_inputs)
    test_predictions += y_pred / NUM_MODEL



## Writing the result into a .csv file

In [13]:
# One row per test id with its averaged predicted failure value.
failure_scores = test_predictions.ravel()
submission = pd.DataFrame({'id': df_test['id'], 'failure': failure_scores})
submission.to_csv('submission_final.csv', index=False)

# Run the following cell if you want to submit the result to Kaggle

In [14]:
# Upload submission_final.csv to the competition with the message
# "Final submission". Presumably needs the Kaggle CLI and an API token to be
# configured (as done in the Colab setup cell) - verify in other environments.
!kaggle competitions submit -c tabular-playground-series-aug-2022 -f submission_final.csv -m "Final submission"

100% 520k/520k [00:03<00:00, 158kB/s]
Successfully submitted to Tabular Playground Series - Aug 2022