In [1]:
%matplotlib inline
%load_ext autoreload

import pandas as pd
from processing_utils import *

### Data Pipeline Application

In [18]:
raw_dataset = pd.read_csv('./dataset/data.csv')
print(raw_dataset.head())
pruned_dataset = prune_dataset_lines(raw_dataset, remove_nan_lines=False, remove_nan_cols=True)
print("Raw dataset shape =", raw_dataset.shape, " Pruned dataset shape =", pruned_dataset.shape)

encoded_pruned_data = encode_smiles_column_of(pruned_dataset, 'count_encoding') # change to one_hot_encoding here
print("Encoded dataset shape =", encoded_pruned_data.shape)

X_train, y_train, X_val, y_val, X_test, y_test = return_required_data(
    encoded_pruned_data, 
    ['Energy_(kcal/mol)', 'Energy DG:kcal/mol)'], 
    normalize=True
)

print("X_train shape =", X_train.shape)
print("y_train shape =", y_train.shape)
print("X_val shape =", X_val.shape)
print("y_val shape =", y_val.shape)
print("X_test shape =", X_test.shape)
print("y_test shape =", y_test.shape)

                 Chiral_Molecular_SMILES  Energy_(kcal/mol)  \
0                 Fc1ccc(cc1)c1cn[nH]c1N       -51177.14356   
1                 Fc1ccc(cc1)c1cn[nH]c1N       -51177.14356   
2   O=C(c1ccccc1)O[C@H]1c2ccccc2C(=O)N1C       -73542.31526   
3   O=C(c1ccccc1)O[C@H]1c2ccccc2C(=O)N1C       -73542.31444   
4  O=C(c1ccccc1)O[C@@H]1c2ccccc2C(=O)N1C       -73542.31445   

   Zero_point_energy_(kcal/mol@0K)  Enthalpy_(kcal/mol@298K)  \
0                        93.711236                101.010221   
1                        93.716622                101.010607   
2                       150.283189                161.552872   
3                       150.296914                161.555511   
4                       150.299719                161.554575   

   Gibbs_energy_(kcal/mol@298K)  Energy DG:kcal/mol)       PMI1         PMI2  \
0                     72.646019         -51104.49754  15.116918  1138.689784   
1                     72.676293         -51104.46727  55.760384  2320.928165

### Linear Regression

In [20]:
from sklearn.linear_model import LinearRegression

X_train, y_train, X_val, y_val, X_test, y_test = return_required_data(
    encoded_pruned_data, 
    ['Energy_(kcal/mol)', 'Energy DG:kcal/mol)'], 
    normalize=True
)

X_train = np.vstack((X_train, X_val))
y_train = np.vstack((y_train, y_val))

LR_Energy_ = LinearRegression()
LR_Energy_ = LR_Energy_.fit(X_train, y_train[:, 0])

LR_EnergyDG = LinearRegression()
LR_EnergyDG = LR_EnergyDG.fit(X_train, y_train[:, 1])

print("mse(y_train_Energy_, y_pred_train_Energy_) =", mean_squared_error(y_train[:, 0], LR_Energy_.predict(X_train)))
print("mse(y_val_Energy_, y_pred_val_Energy_) =", mean_squared_error(y_val[:, 0], LR_Energy_.predict(X_val)))
print("mse(y_test_Energy_, y_pred_test_Energy_) =", mean_squared_error(y_test[:, 0], LR_Energy_.predict(X_test)))
print("==================================")

print("Test prediction mse =", mean_squared_error(y_test[:5, 0], LR_Energy_.predict(X_test[:5, :])))
print("Actual Real target values : \n", y_test[:5, 0])
print("Sample predicted vector : \n", LR_Energy_.predict(X_test[:5, :]))
print("==================================")

print("mse(y_train_EnergyDG, y_pred_train_EnergyDG) =", mean_squared_error(y_train[:, 1], LR_EnergyDG.predict(X_train)))
print("mse(y_val_EnergyDG, y_pred_val_EnergyDG) =", mean_squared_error(y_val[:, 1], LR_EnergyDG.predict(X_val)))
print("mse(y_test_Energy_, y_pred_test_Energy_) =", mean_squared_error(y_test[:, 1], LR_EnergyDG.predict(X_test)))
print("==================================")

print("Test prediction mse =", mean_squared_error(y_test[:5, 1], LR_EnergyDG.predict(X_test[:5, :])))
print("Actual Real target values : \n", y_test[:5, 1])
print("Sample predicted vector : \n", LR_EnergyDG.predict(X_test[:5, :]))

mse(y_train_Energy_, y_pred_train_Energy_) = 9.732707977533945e-05
mse(y_val_Energy_, y_pred_val_Energy_) = 0.00031667777941247365
mse(y_test_Energy_, y_pred_test_Energy_) = 0.006046406758231887
Test prediction mse = 0.0052624012713294544
Actual Real target values : 
 [-0.72683536 -1.74296008 -0.29714503 -0.20861087 -0.16368761]
Sample predicted vector : 
 [-0.76949048 -1.85523726 -0.40411846 -0.18920654 -0.17184475]
mse(y_train_EnergyDG, y_pred_train_EnergyDG) = 9.730094514767952e-05
mse(y_val_EnergyDG, y_pred_val_EnergyDG) = 0.0003166054307816974
mse(y_test_Energy_, y_pred_test_Energy_) = 0.006028496735773142
Test prediction mse = 0.005238001463340732
Actual Real target values : 
 [-0.72697726 -1.73961602 -0.29509866 -0.20820851 -0.16520833]
Sample predicted vector : 
 [-0.76949701 -1.85158427 -0.40188427 -0.18881718 -0.17333226]


In [25]:
print("Cross validation error LR_Energy_ =", cross_validation_of( LR_Energy_, X_train, y_train))
print("Cross Validation of EnergyGD =", cross_validation_of( LR_EnergyDG, X_train, y_train))

Cross validation error LR_Energy_ = 0.00011824140029793778
Cross Validation of EnergyGD = 0.00011824140029793778
