**Compared to the first draft, we add batch normalization layers, remove highly correlated features, and apply PCA dimensionality reduction.**

In [1]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/test_features.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/sample_submission.csv
/kaggle/input/lish-moa/train_targets_scored.csv


# Import Section

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt #used for basic plotting
import seaborn as sns #used for advanced plotting

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA

from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.losses import BinaryCrossentropy
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping

# Constants Section

In [3]:
# Switches selecting which notebook stages are executed.
RUN_KERAS, INVESTIGATE_BEST_MODEL, MAKE_PREDICTION = True, False, True

# Central configuration:
#   'definition' is the dict handed to define_model,
#   'fitting'    is unpacked into the Keras fit() call,
#   'PCA'        bounds the number of principal components that are kept.
param = {
    'definition': {
        'n_inputs': None,   # needs to be updated before use
        'n_outputs': None,  # needs to be updated before use (key name matches define_model)
        'optimizer': 'adam',
        'loss': 'binary_crossentropy',
        'activation': 'relu',
        'dropout': 0.5,
    },
    'fitting': {
        'batch_size': 200,
        'epochs': 200,
        'callbacks': [],
        'validation_split': 0.2,
    },
    'PCA': {
        # Used as min(this value, number of features) when the PCA is built.
        'min_n_components': 400,
    },
}

# Useful functions

In [4]:
def preprocess_data(data):
    """Return a cleaned copy of ``data``; the input frame is left untouched.

    Each step is applied only when the corresponding column is present:
      * ``sig_id``  becomes the index,
      * ``cp_type`` becomes 1 where the value equals "trt_cp", 0 otherwise,
      * ``cp_dose`` becomes 1 where the value equals "D1", 0 otherwise,
      * ``cp_time`` is floor-divided by 24 and then decremented by 1.
    """
    frame = data.copy()

    if "sig_id" in frame.columns:
        frame = frame.set_index("sig_id")

    # (column, value that maps to 1) pairs for the two binary encodings.
    for column, positive_value in (("cp_type", "trt_cp"), ("cp_dose", "D1")):
        if column in frame.columns:
            frame[column] = frame[column].eq(positive_value).map(int)

    if "cp_time" in frame.columns:
        frame["cp_time"] = frame["cp_time"] // 24 - 1

    return frame

def read_data(file_path):
    """Read the CSV at ``file_path`` and return it after ``preprocess_data``."""
    raw_frame = pd.read_csv(file_path)
    return preprocess_data(raw_frame)

def define_model(param):
    """Build and compile the fully connected Keras network.

    The network stacks four hidden stages, each being
    Dense -> BatchNormalization -> Dropout, with widths
    int(1.5 * n_inputs), int(n_inputs), int(2 * n_outputs) and
    int(1.5 * n_outputs), followed by a sigmoid Dense layer of width n_outputs.

    Parameters
    ----------
    param : dict
        Must provide the keys 'n_inputs', 'n_outputs', 'activation',
        'dropout', 'optimizer' and 'loss'.

    Returns
    -------
    The compiled ``Sequential`` model.

    Raises
    ------
    AssertionError
        If 'n_inputs' or 'n_outputs' is still None.
    """
    # Identity comparison is the correct way to test for None.
    assert param['n_inputs'] is not None and param['n_outputs'] is not None, "Data Shape is None"

    n_inputs, n_outputs = param['n_inputs'], param['n_outputs']
    hidden_widths = (
        int(1.5 * n_inputs),   # input layer
        int(n_inputs),         # first hidden layer
        int(2 * n_outputs),    # second hidden layer
        int(1.5 * n_outputs),  # third hidden layer
    )

    clf = Sequential()
    for depth, width in enumerate(hidden_widths):
        # Only the very first Dense layer declares the input dimensionality.
        first_layer_kwargs = {'input_dim': n_inputs} if depth == 0 else {}
        clf.add(Dense(width, activation=param['activation'], **first_layer_kwargs))
        clf.add(BatchNormalization())
        clf.add(Dropout(param['dropout']))

    #output layer
    clf.add(Dense(n_outputs, activation='sigmoid'))

    clf.compile(optimizer=param['optimizer'], loss=param['loss'])

    return clf


# Loading & Preparing data

### Loading

In [5]:
# Training features and scored targets; X and Y are the working aliases
# that the following cells transform and split.
train_features = read_data("/kaggle/input/lish-moa/train_features.csv")
train_targets_scored = read_data("/kaggle/input/lish-moa/train_targets_scored.csv")
X, Y = train_features, train_targets_scored

# Non-scored targets are loaded alongside the scored ones.
train_targets_non_scored = read_data("/kaggle/input/lish-moa/train_targets_nonscored.csv")

# Features of the samples to predict for the submission.
test_features = read_data("/kaggle/input/lish-moa/test_features.csv")

### removing highly correlated features

In [6]:
# Correlation filter: a column is flagged for removal when its correlation with
# any column positioned before it is >= threshold.
# NOTE(review): only positive correlations are tested (no abs()), so strongly
# negatively correlated pairs are kept - confirm this is intended.
threshold, columns_to_remove = 0.8, []
corr = X.corr()
corr_values = corr.values
flagged = set()  # O(1) membership test; columns_to_remove keeps discovery order
for i in range(corr_values.shape[0]):
    # Scan the part of column i below the diagonal in one vectorised comparison
    # (NaN entries compare False and are therefore never flagged).
    hits = np.flatnonzero(corr_values[i + 1:, i] >= threshold) + i + 1
    for name in corr.index[hits]:
        if name not in flagged:
            flagged.add(name)
            columns_to_remove.append(name)

#keep only poorly correlated features
# The resulting column order comes from set iteration; the prediction cell
# rebuilds this same expression for test_features, so keep the two in step.
X = X[list(set(X.columns)-set(columns_to_remove))]

### Dimensionality reduction with PCA

In [7]:
# Use the configured component count, or the number of remaining features
# when fewer than that are left.
n_components = min(param['PCA']['min_n_components'], X.shape[1])
pca = PCA(svd_solver='auto', n_components=n_components)
X = pca.fit_transform(X)

# Modeling Section

In [8]:
if(RUN_KERAS):
    #split data (70% for fitting, 30% held out for the evaluation below)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)

    #update param dict with the actual input/output widths
    param['definition']['n_inputs'], param['definition']['n_outputs'] = X_train.shape[1], y_train.shape[1]

    #define model
    clf = define_model(param['definition'])

    #train model
    # Build the fit() arguments locally instead of appending to the shared
    # param['fitting']['callbacks'] list: appending there would stack one more
    # EarlyStopping callback every time this cell is re-run.
    early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
    fit_kwargs = dict(param['fitting'])
    fit_kwargs['callbacks'] = list(param['fitting']['callbacks']) + [early_stopping]
    clf.fit(X_train, y_train, **fit_kwargs)

    #predict with test features
    y_predict = clf.predict(X_test)

    #evaluate predictions on the held-out split
    bce = BinaryCrossentropy()
    loss = bce(y_test, y_predict).numpy()
    print("logarithmic loss between test data and real data is = {}".format(loss))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 00075: early stopping
logari

In [9]:
if(INVESTIGATE_BEST_MODEL):
    # Requires the training cell to have run first: it provides X_train/y_train
    # and fills 'n_inputs'/'n_outputs' in param['definition'].
    model_definition = dict(param['definition'])

    def build_model():
        """Zero-argument model factory handed to KerasClassifier."""
        return define_model(model_definition)

    def neg_binary_crossentropy(estimator, X_val, y_val):
        """Scorer for GridSearchCV: negated binary cross-entropy (higher is better)."""
        y_prob = estimator.predict_proba(X_val)
        y_true = np.asarray(y_val, dtype='float32')
        return -float(BinaryCrossentropy()(y_true, y_prob).numpy())

    # create model
    # The factory takes no arguments, so no dict-valued parameter has to be
    # passed through the wrapper.
    model = KerasClassifier(build_fn=build_model, verbose=0)

    # define the grid search parameters
    batch_size = [100, 200]
    epochs = [50, 100]
    param_grid = dict(batch_size=batch_size, epochs=epochs)
    print(param_grid)
    # NOTE(review): an explicit scorer is supplied because the wrapper's default
    # score() appears to need an accuracy metric, which define_model does not
    # compile - confirm against the installed Keras version.
    grid = GridSearchCV(estimator=model, param_grid=param_grid,
                        scoring=neg_binary_crossentropy, n_jobs=-1, cv=3)
    grid_result = grid.fit(X_train, y_train)

    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    # The loop variable must not be called `param`: that would overwrite the
    # notebook-wide configuration dict.
    for mean, stdev, combination in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, combination))

# Prediction Section

In [10]:
if(MAKE_PREDICTION):
    # Feed the test features to the PCA in exactly the column order it was
    # fitted on. Recent scikit-learn versions record that order in
    # `feature_names_in_`; when it is missing, fall back to rebuilding the same
    # set expression that was used to select the training columns.
    fitted_names = getattr(pca, 'feature_names_in_', None)
    if fitted_names is not None:
        kept_columns = list(fitted_names)
    else:
        kept_columns = list(set(test_features.columns)-set(columns_to_remove))

    test_features_ = pca.transform(test_features[kept_columns])
    predictions = clf.predict(test_features_)

    # One row per test sample, one column per scored target; sig_id is moved
    # back from the index into a regular column for the CSV.
    submission_sample = pd.DataFrame(data=predictions,
                                     index=test_features.index,
                                     columns=train_targets_scored.columns).reset_index(drop=False)
    submission_sample.to_csv('submission.csv', index=False)
    # NOTE(review): inside an `if` block this expression is presumably not
    # rendered as cell output - confirm whether a preview was intended.
    submission_sample.head()

=> No major improvement from PCA so far.