In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import r2_score, classification_report, roc_auc_score, f1_score, mean_absolute_error, mean_squared_error
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, LeaveOneOut, train_test_split
from numpy import random

In [91]:
import re
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfTransformer as tfidfvec
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestRegressor

In [45]:
X = np.hstack((np.arange(20).reshape(-1, 1), np.arange(20).reshape(-1, 1)))

In [89]:
from sklearn.linear_model import Ridge, ElasticNet

In [94]:
model_dict = {"Ridge": [{'alpha' : [0.01, 0.1, 1, 10, 100]}, Ridge()],
              
              "RF" : [{'n_estimators' : [200],
             'min_samples_split':[2, 5, 8],
             'min_samples_leaf': [2, 5, 8],
             'max_features' : ["sqrt", "log2"],
             'max_samples' : [0.5, 0.8, 1]
             }, RandomForestRegressor()]
             }

In [103]:
def find_best_model(X, y, metric, model, params_dict):

    (X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.2, shuffle = True, random_state=22)
    
    scaler = preprocessing.StandardScaler()
    X_train_scaler = scaler.fit_transform(X_train)
    X_test_scaler = scaler.transform(X_test)
    
    scores_test = []
    
    optimizer = GridSearchCV(model, param_grid = params_dict, 
                                 cv = 4, n_jobs = 3, scoring = "neg_mean_absolute_error", refit=True)
    
    optimizer.fit(X_train_scaler, y_train)
    #print(model.name, optimizer.best_score_)

    metric_score = metric(y_test, optimizer.best_estimator_.predict(X_test_scaler))
    r2_metric_score = r2_score(y_test, optimizer.best_estimator_.predict(X_test_scaler))
    
    return metric_score, r2_metric_score    

### QM7 

In [54]:
import os

In [68]:
def matrix_reader(folder):
    files = os.listdir(os.path.join('dataset',folder))
    for file in files:
        if file.startswith("matrix"):
            X = pd.read_csv(os.path.join('dataset',folder, file))
            yield X

In [115]:
def y_reader(folder):
    y = pd.read_csv(os.path.join('dataset', folder, 'targets.csv'), header=None)
    for i in range(y.shape[1]):
        if y.iloc[:, i].dtype==np.float64:
            yield y.iloc[:, i]
        else:
            continue

In [74]:
for X in matrix_reader('QM7'):
    print(X)

      C_0s*  C_1d*  C_1s*  C_1t*  C_2d*  C_2s*  C_2t*  C_3d*  C_3s*  C_4s*  \
0       1.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1       0.0    0.0    2.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2       0.0    2.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3       0.0    0.0    0.0    2.0    0.0    0.0    0.0    0.0    0.0    0.0   
4       0.0    0.0    1.0    0.0    0.0    1.0    0.0    0.0    0.0    0.0   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
6829    0.0    0.0    2.0    0.0    1.0    1.0    1.0    1.0    0.0    0.0   
6830    0.0    0.0    2.0    0.0    2.0    1.0    0.0    0.0    1.0    0.0   
6831    0.0    0.0    1.0    0.0    2.0    2.0    1.0    0.0    0.0    0.0   
6832    0.0    1.0    1.0    0.0    0.0    3.0    0.0    1.0    0.0    0.0   
6833    0.0    1.0    2.0    0.0    0.0    1.0    0.0    1.0    1.0    0.0   

      ...  N_2d*  N_2s*  N_3s*  O_1d*  O_1s*  O_2s*  S_1d*  S_2

In [127]:
folders = ['QM7', 'QM8', 'QM9', 'ESOL', 'FreeSolv', 'Lipophilicity']
metrics_to_test = [mean_absolute_error, mean_absolute_error, mean_absolute_error,
           mean_squared_error, mean_squared_error, mean_squared_error]

In [104]:
stats = []
for y in y_reader('QM7'):
    for X in matrix_reader('QM7'):
        metric, r2_metric_score = find_best_model(X, y, metric=mean_absolute_error, model=model_dict['Ridge'][1], 
                                           params_dict=model_dict['Ridge'][0])
        stats.append([metric, r2_metric_score])

In [129]:
stats = []

for folder, metric_for_task in zip(folders[-3:], metrics_to_test[-3:]):
    print(folder)
    for i, y in enumerate(y_reader(folder)): 
        stats.append([folder + "_" + f"task_{i}"])
        metrics = []
        for X in matrix_reader(folder):
            metric, r2_metric_score = find_best_model(X, y, metric=metric_for_task, model=model_dict['Ridge'][1], 
                                           params_dict=model_dict['Ridge'][0])
            metrics.append([len(X), metric, r2_metric_score])
        
        metrics = np.array(metrics)
        best_metric_arg = np.argmin(metrics[:, 1])
        
        stats[-1] += list(metrics[best_metric_arg])

ESOL
FreeSolv
Lipophilicity


In [130]:
stats

[['ESOL_task_0', 1128.0, 0.7857239471538578, 0.8297426280183919],
 ['FreeSolv_task_0', 642.0, 2.145667772617778, 0.8886470703730552],
 ['Lipophilicity_task_0', 4200.0, 0.8431889896239785, 0.4000400371342089]]

In [124]:
stats_pd = pd.DataFrame(stats, columns=['Dataset_task', 'Data Len', 'metric', 'r2_metric'])

In [131]:
stats_pd.iloc[-3:] = stats

In [134]:
stats_pd['task type'] = 'regression'

In [138]:
stats_pd['metric_type'] = None

In [144]:
for i in range(len(stats_pd)):
    if stats_pd.iloc[i][0].startswith('Q'):
        stats_pd.iloc[i, 5] = 'MAE'
    else:
        stats_pd.iloc[i, 5] = 'RMSE'

In [150]:
stats_pd['Data Len'] = stats_pd['Data Len'].astype(np.int)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  stats_pd['Data Len'] = stats_pd['Data Len'].astype(np.int)


In [153]:
stats_pd['metric'] = stats_pd['metric'].round(3)
stats_pd['r2_metric'] = stats_pd['r2_metric'].round(3)

In [154]:
stats_pd

Unnamed: 0,Dataset_task,Data Len,metric,r2_metric,task type,metric_type
0,QM7_task_0,6834,57.125,0.771,regression,MAE
1,QM8_task_0,21786,0.015,0.801,regression,MAE
2,QM8_task_1,21786,0.013,0.77,regression,MAE
3,QM8_task_2,21786,0.024,0.363,regression,MAE
4,QM8_task_3,21786,0.042,0.165,regression,MAE
5,QM8_task_4,21786,0.015,0.827,regression,MAE
6,QM8_task_5,21786,0.013,0.825,regression,MAE
7,QM8_task_6,21786,0.022,0.392,regression,MAE
8,QM8_task_7,21786,0.035,0.144,regression,MAE
9,QM8_task_8,21786,0.015,0.827,regression,MAE


In [155]:
stats_pd.to_csv('Result/regression_task_matrix.csv', index=False)

In [133]:
aggr_stats = {}
count = 0
for row in stats_pd.values:
    prefix = row[0].split('_')
    aggr_stats[prefix] += 

array([['QM7_task_0', 6834.0, 57.12466205051201, 0.7708046517671339],
       ['QM8_task_0', 21786.0, 0.014988540443121959, 0.800903686777822],
       ['QM8_task_1', 21786.0, 0.012575620091595864, 0.7698592950986939],
       ['QM8_task_2', 21786.0, 0.02386013425434605, 0.36328203298457773],
       ['QM8_task_3', 21786.0, 0.042289928303828286, 0.16489671419485263],
       ['QM8_task_4', 21786.0, 0.015080735182230348, 0.827310994520346],
       ['QM8_task_5', 21786.0, 0.012940021520120918, 0.8245154995261422],
       ['QM8_task_6', 21786.0, 0.021830596606850123, 0.3917910401280622],
       ['QM8_task_7', 21786.0, 0.034767586835059816, 0.14434154500460206],
       ['QM8_task_8', 21786.0, 0.015080735182230348, 0.827310994520346],
       ['QM8_task_9', 21786.0, 0.012940021520120918, 0.8245154995261422],
       ['QM8_task_10', 21786.0, 0.021830596606850123, 0.3917910401280622],
       ['QM8_task_11', 21786.0, 0.034767586835059816,
        0.14434154500460206],
       ['QM8_task_12', 21786.0, 

In [157]:
stats_pd['Dataset'] = [d_t.split("_")[0] for d_t in stats_pd.iloc[:, 0]]

In [164]:
stats_pd['r2_metric'][stats_pd['r2_metric'] <= 0] = 0
stats_pd['metric'][stats_pd['r2_metric'] <= 0] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stats_pd['r2_metric'][stats_pd['r2_metric'] <= 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stats_pd['metric'][stats_pd['r2_metric'] <= 0] = 0


In [165]:
stats_pd

Unnamed: 0,Dataset_task,Data Len,metric,r2_metric,task type,metric_type,Dataset
0,QM7_task_0,6834,57.125,0.771,regression,MAE,QM7
1,QM8_task_0,21786,0.015,0.801,regression,MAE,QM8
2,QM8_task_1,21786,0.013,0.77,regression,MAE,QM8
3,QM8_task_2,21786,0.024,0.363,regression,MAE,QM8
4,QM8_task_3,21786,0.042,0.165,regression,MAE,QM8
5,QM8_task_4,21786,0.015,0.827,regression,MAE,QM8
6,QM8_task_5,21786,0.013,0.825,regression,MAE,QM8
7,QM8_task_6,21786,0.022,0.392,regression,MAE,QM8
8,QM8_task_7,21786,0.035,0.144,regression,MAE,QM8
9,QM8_task_8,21786,0.015,0.827,regression,MAE,QM8


In [169]:
stats_pd.groupby(['Dataset'], observed=True).mean().to_csv('Result/regression_task_matrix_aggregate.csv')