In [27]:
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import sys
import PIL
import cv2
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

# # keras
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import GaussianNoise
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# plt
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 10, 5

%config InlineBackend.figure_format = 'svg' 
%matplotlib inline

In [2]:
# Record the interpreter and library versions this notebook was run with.
python_version = sys.version.split('\n')[0]  # first line only: drops the compiler/build details
print('Python       :', python_version)
print('Numpy        :', np.__version__)
print('Tensorflow   :', tf.__version__)

In [3]:
RANDOM_SEED = 42

# Defining the constant is not enough: seed every RNG the notebook relies on
# (Python, NumPy, TensorFlow) so weight initialisation, dropout and shuffling
# are repeatable across Restart & Run All.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [4]:
# Location of the training data and the raw frame read from it.
DATA_DIR_TRAIN = '../input/merck-train/'
train = pd.read_csv(os.path.join(DATA_DIR_TRAIN, 'ACT4_competition_training.csv'))

In [5]:
# Peek at 10 random rows; seeded so the same rows are shown on every run.
train.sample(10, random_state=RANDOM_SEED)

In [6]:
# (n_rows, n_columns) of the raw training frame, before any columns are dropped
train.shape

In [7]:
# Find feature columns (everything after the first two columns) that hold a
# single distinct non-null value; they cannot help the model.
# nunique() ignores NaN, exactly like the value_counts() it replaces.
feature_nunique = train.iloc[:, 2:].nunique()
delete_columns = feature_nunique[feature_nunique == 1].index.to_list()
print(len(delete_columns))

In [8]:
# Drop the constant-value columns collected in delete_columns; `train` itself is left untouched.
df = train.drop(columns=delete_columns)

In [18]:
# (n_rows, n_columns) of the frame after the constant-value columns were dropped
df.shape

In [9]:
def visualize_distributions(titles_values_dict):
    """Draw one 20-bin histogram per entry on a grid of at most 3 columns.

    Args:
        titles_values_dict: Mapping of plot title -> array-like of numeric
            values to histogram.

    Returns:
        None. The figure is shown with ``plt.show()``. An empty mapping is a
        no-op (previously it raised ZeroDivisionError).
    """
    if not titles_values_dict:
        return

    columns = min(3, len(titles_values_dict))
    rows = (len(titles_values_dict) - 1) // columns + 1  # ceil division
    fig = plt.figure(figsize=(columns * 6, rows * 4))

    for i, (title, values) in enumerate(titles_values_dict.items()):
        hist, bins = np.histogram(values, bins=20)
        bin_width = bins[1] - bins[0]
        # Bars are centre-aligned by default, so place them on the bin centres
        # rather than the left edges; otherwise every bar sits half a bin too
        # far to the left.
        bin_centers = (bins[:-1] + bins[1:]) / 2
        ax = fig.add_subplot(rows, columns, i + 1)
        ax.bar(bin_centers, hist, width=bin_width * 0.7)
        ax.set_xlabel(title)
        ax.set_ylabel("Distribution")
        ax.set_title(title)
    plt.show()

# Histogram of the target column.
visualize_distributions({'Activity': df['Act']})


In [10]:
# Target value against row position, to spot any ordering/trend in the file.
plt.scatter(x=df.index, y=df['Act'], s=1)
plt.xlabel('Row index')
plt.ylabel('Act')
plt.title('Act by row index')
plt.show()

In [11]:
# rescale features: log(1 + x) on every column after the first two.
# log1p is the numerically accurate form of log(x + 1), and applying it to the
# whole block at once avoids a Python-level loop over the columns.
# NOTE: this overwrites df, so re-running this cell alone applies the
# transform a second time; re-run the cell that builds df first.
feature_columns = df.columns[2:]
df[feature_columns] = np.log1p(df[feature_columns])

In [23]:
# Feature matrix: everything except the molecule identifier and the target.
X = df.drop(columns=['MOLECULE', 'Act'])
y = df['Act']     # target value

# Hold out 15% of the rows; seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [32]:
def neural_network(input_shape=(128,)):
    """Build the (uncompiled) fully connected regression network.

    The network is four ReLU hidden layers (4000, 2000, 1000, 1000 units),
    each with L2 kernel regularisation and followed by dropout, then a single
    linear output unit.

    Args:
        input_shape: Shape of one input sample, e.g. ``(n_features,)``.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    weight_decay = 0.0001
    # (units, dropout rate) for each hidden block, in order.
    hidden_blocks = ((4000, 0.25), (2000, 0.25), (1000, 0.25), (1000, 0.10))

    model = models.Sequential()
    for position, (units, dropout_rate) in enumerate(hidden_blocks):
        # Only the first layer defines the model input; passing input_shape to
        # later layers is redundant and misleading.
        first_layer_kwargs = {'input_shape': input_shape} if position == 0 else {}
        model.add(Dense(units, activation='relu',
                        kernel_regularizer=l2(weight_decay),
                        **first_layer_kwargs))
        model.add(Dropout(dropout_rate))

    # Linear output for regression.
    model.add(Dense(1, activation=None, use_bias=True, kernel_regularizer=l2(weight_decay)))

    return model

In [28]:
# `lr` is a deprecated alias for `learning_rate` in tf.keras optimizers.
optimizer = SGD(learning_rate=0.05, momentum=0.9, clipnorm=1.0)
# Stop once validation MAPE has not improved for 50 epochs and roll back to the
# best weights seen. mode='min' states explicitly that lower is better.
earlystop = EarlyStopping(monitor='val_MAPE', mode='min', patience=50, restore_best_weights=True)

In [43]:
# Number of feature columns fed to the network.
input_dim = len(X_train.columns)

In [45]:
# Instantiate the network for the actual feature count and print its layer table.
model = neural_network(input_shape=(input_dim,))
model.summary()

In [46]:
# MAPE is both the training loss and the reported metric.
model.compile(
    optimizer=optimizer,
    loss='MAPE',
    metrics=['MAPE'],
)

In [47]:
# Train on log-transformed targets; predictions must be exponentiated afterwards.
# NOTE(review): the held-out test split doubles as the early-stopping
# validation set, so test scores reported later are selected on that same
# data. Consider a separate validation split.
history = model.fit(
    x=X_train,
    y=np.log(y_train),
    validation_data=(X_test, np.log(y_test)),
    epochs=500,  # or less if EarlyStopping brings to stop learning
    batch_size=128,
    callbacks=[earlystop],
    verbose=0,
)

In [105]:
# A bare History object only shows its repr; display the last recorded epochs instead.
pd.DataFrame(history.history).tail()

In [48]:
# Training curves. The line labels only appear if legend() is called.
fig, ax = plt.subplots()
ax.plot(history.history['MAPE'], label='train')
ax.plot(history.history['val_MAPE'], label='test')
ax.set_title('Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('MAPE')
ax.legend()
plt.show()

In [49]:
# The model was fit on log targets, so exponentiate to return to the original scale.
test_predict = np.exp(model.predict(X_test))
# NOTE: `r2_score` is sklearn's only while the later cell that redefines
# `r2_score` has not been executed in this kernel.
print(f"TEST R2-score: {r2_score(y_test, test_predict[:, 0])}")

In [51]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Both inputs are flattened to 1-D first, so an (n, 1) prediction column
    cannot silently broadcast against an (n,) target into an (n, n) matrix.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predictions with the same number of elements.

    Returns:
        mean(|y_pred - y_true| / |y_true|). Zeros in ``y_true`` produce
        inf/nan, as with plain NumPy division.

    Raises:
        ValueError: If the inputs do not hold the same number of elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )
    return np.mean(np.abs((y_pred - y_true) / y_true))
# Report test-set MAPE as a percentage with two decimals.
print(f"TEST mape: {mape(y_test, test_predict[:, 0]) * 100:0.2f}%")

In [104]:
# according to the formula provided
# NOTE: this rebinds the name `r2_score` imported from sklearn.metrics at the
# top of the notebook; cells run after this one get this version.
def r2_score(y_true, y_pred):
    """Squared correlation between targets and predictions.

    Computes ``cov(y_true, y_pred)**2 / (var(y_true) * var(y_pred))`` from
    centred sums. Both inputs are flattened to 1-D first, so an (n, 1)
    prediction column cannot silently broadcast against an (n,) target.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predictions with the same number of elements.

    Returns:
        The score as a float, or NaN when either input has zero variance
        (the ratio is undefined).

    Raises:
        ValueError: If the inputs do not hold the same number of elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )

    true_centered = y_true - np.mean(y_true)
    pred_centered = y_pred - np.mean(y_pred)

    a = np.sum(true_centered * pred_centered)
    b = np.sum(true_centered ** 2)
    c = np.sum(pred_centered ** 2)

    denominator = b * c
    if denominator == 0:
        # Constant input: avoid the 0/0 warning and return NaN explicitly.
        return float('nan')
    return a * a / denominator
              
# Report the score from the r2_score defined in this cell.
print(f"TEST r2_score: {r2_score(y_test, test_predict[:, 0])}")