# Merck Molecular Activity Challenge

In [1]:
#importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.impute import SimpleImputer

#Importing Concrete Compressive Strength Dataset
dataset1 = pd.read_csv('ACT2_competition_training.csv',sep=",")
dataset2 = pd.read_csv('ACT4_competition_training.csv',sep=",")
dataset = pd.concat([dataset1, dataset2])
data = pd.DataFrame(dataset) 
print(data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # This is added back by InteractiveShellApp.init_path()


         Act  D_10  D_1000  D_10000  D_10001  D_10002  D_10003  D_10004  \
0     6.7153   0.0       0        0        0        0      0.0        0   
1     6.4912   0.0       0        0        0        0      0.0        0   
2     5.8528   0.0       0        0        0        0      0.0        0   
3     6.3854   0.0       0        0        0        0      0.0        0   
4     5.8941   0.0       0        0        0        0      0.0        0   
...      ...   ...     ...      ...      ...      ...      ...      ...   
1810  6.6956   NaN       0        0        0        0      NaN        0   
1811  7.0471   NaN       0        0        0        0      NaN        0   
1812  7.2048   NaN       0        0        0        0      NaN        0   
1813  6.7425   NaN       0        0        0        0      NaN        0   
1814  6.7157   NaN       0        0        0        0      NaN        0   

      D_10005  D_10006  ...  D_999  D_9992  D_9993  D_9994  D_9995  D_9996  \
0           0        

In [2]:
# Check if dataset has null values
result = data.isna()
print("-> Contains Missing values                 ",end='')
print(result.values.any())
print("-> Total Number of Missing values:         ",end='')
print(result.sum().sum())
print("-> Number of Missing values by column:     ",end='')
print(result.sum())

-> Contains Missing values                 True
-> Total Number of Missing values:         7442881
-> Number of Missing values by column:     Act            0
D_10        1815
D_1000         0
D_10000        0
D_10001        0
            ... 
D_9996         0
D_9997         0
D_9998         0
D_9999         0
MOLECULE       0
Length: 6315, dtype: int64


In [3]:
# Filling Missing values with mean
data = data.fillna(data.mean())

In [4]:
# Check if dataset has null values
result = data.isna()
print("-> Contains Missing values                 ",end='')
print(result.values.any())
print("-> Total Number of Missing values:         ",end='')
print(result.sum().sum())
print("-> Number of Missing values by column:     ",end='')
print(result.sum())

-> Contains Missing values                 False
-> Total Number of Missing values:         0
-> Number of Missing values by column:     Act         0
D_10        0
D_1000      0
D_10000     0
D_10001     0
           ..
D_9996      0
D_9997      0
D_9998      0
D_9999      0
MOLECULE    0
Length: 6315, dtype: int64


In [5]:
# Load the actual data, ignoring first column and using second column as targets.
X = data.iloc[:, 2:].values
Y = data.iloc[:, 1].values
print(data.shape)
print(X.shape)
print(Y.shape)

(10531, 6315)
(10531, 6313)
(10531,)


In [6]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
categorical_features = (data.select_dtypes(include=['object']).columns.values)
print(categorical_features)
X[:,-1] = labelencoder.fit_transform(data['MOLECULE'])
print(X)

['MOLECULE']
[[0 0 0 ... 0 0 914]
 [0 0 0 ... 0 0 926]
 [0 0 0 ... 0 0 942]
 ...
 [0 0 0 ... 0 0 10474]
 [0 0 0 ... 0 0 10475]
 [0 0 0 ... 0 0 10476]]


In [7]:
from sklearn.feature_selection import mutual_info_regression
mutual_information = mutual_info_regression(X, Y, discrete_features='auto', n_neighbors=3, copy=True, random_state=None)
print(mutual_information)
count = 0;
for i in range(0,len(mutual_information)):
    if(mutual_information[i]>0):
        print(count)
        X[:,count] = X[:,i]
        count+=1
X = X[:,:count]
print(X.shape)
print(len(mutual_information))

KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state = 0)

In [None]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Model Accuracy
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Defining kernel for GaussianProcessRegressor
kernel = DotProduct() + WhiteKernel()

names = ['SVR', 'DecisionTreeRegressor', 'RandomForestRegressor', 'AdaBoostRegressor','GaussianProcessRegressor','LinearRegression','MLPRegressior']
models = [SVR(),
          DecisionTreeRegressor(),
          RandomForestRegressor(),
          AdaBoostRegressor(),
          GaussianProcessRegressor(kernel=kernel),
          LinearRegression(),
          MLPRegressor()]

param_distributions = {
    'SVR': {'C': [0.01, 0.1, 1, 10], 'gamma':[0.01, 0.1, 1]},
    'DecisionTreeRegressor': {'max_depth': [10, 20]},
    'RandomForestRegressor': {'max_depth': [1, 10, 20, 40],'n_estimators': [16, 32, 100],'bootstrap': [True, False],},
    'AdaBoostRegressor': {'n_estimators': [16, 32, 100],'learning_rate' : [0.2,0.4,1]},
    'GaussianProcessRegressor': {'normalize_y': ['True','False']},
    'LinearRegression': {},
    'MLPRegressior': {'hidden_layer_sizes': [(100,),(200,)],'activation':['tanh', 'relu'], 'max_iter':[200,300]},
}

accuracy= np.zeros(7)

for counter, model in enumerate(models):
    np.random.seed(0);
    gridcv = GridSearchCV(model, param_distributions[names[counter]], n_jobs=2, cv=3)
    gridcv.fit(X_train, Y_train)
    Y_pred  = gridcv.best_estimator_.predict(X_test)
    accuracy[counter] = round(mean_squared_error(Y_test, Y_pred), 3)
    print("MSE for " + names[counter] + ": "+str(accuracy[counter]))

In [None]:
import pickle
filename = "mereck.sav"
pickle.dump(gridcv.best_estimator_, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

In [None]:
#Comparision graph between all models
import seaborn as sns
y_pos = np.arange(len(names))
heights = [accuracy[0],accuracy[1],accuracy[2],accuracy[3],accuracy[4],accuracy[5],accuracy[6]]

fig, ax=plt.subplots(1,1,figsize=(12,6))

plt.xticks(rotation='90')
sns.barplot(x=names, y=heights)
plt.ylabel('MSE')
plt.title('SGEMM GPU kernel performance Dataset model')