In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### Descriptor datasets

In [9]:
import pickle

# Path of the pickled dataset this cell loads.
dataset_path = "../data/dataset_X-descriptor_y-relposhydrogen_allcentral_1bondedhyd.pkl"

# Deserialize the file into `data`; the handle is closed when the block exits.
# Note: the other dataset cells assign to the same `data` name, so the cell
# executed last determines what the rest of the notebook sees.
with open(dataset_path, mode='rb') as fh:
    data = pickle.load(fh)

In [48]:
import pickle

# Path of the pickled dataset this cell loads.
dataset_path = "../data/dataset_X-descriptor_y-relposhydrogen_allcentral_1bondedhyd_max3atoms.pkl"

# Deserialize the file into `data` (replacing whatever an earlier cell stored there).
with open(dataset_path, mode='rb') as fh:
    data = pickle.load(fh)

In [25]:
import pickle

# Path of the pickled dataset this cell loads.
dataset_path = "../data/dataset_X-descriptor_y-relposhydrogen_allcentral_1bondedhyd_max3atoms_l2sorted.pkl"

# Deserialize the file into `data` (replacing whatever an earlier cell stored there).
with open(dataset_path, mode='rb') as fh:
    data = pickle.load(fh)

### No Descriptor datasets

In [42]:
import pickle

# Path of the pickled dataset this cell loads.
dataset_path = "../data/no_descriptor_approach_no_extras.pkl"

# Deserialize the file into `data` (replacing whatever an earlier cell stored there).
with open(dataset_path, mode='rb') as fh:
    data = pickle.load(fh)

In [58]:
import pickle

# Path of the pickled dataset this cell loads.
dataset_path = "../data/no_descriptor_approach_sorted.pkl"

# Deserialize the file into `data` (replacing whatever an earlier cell stored there).
with open(dataset_path, mode='rb') as fh:
    data = pickle.load(fh)

### Hybrid (Descriptor appended)

In [1]:
import pickle

# Path of the pickled dataset this cell loads.
dataset_path = "../data/no_descriptor_approach_sorted_and_descriptor.pkl"

# Deserialize the file into `data` (replacing whatever an earlier cell stored there).
with open(dataset_path, mode='rb') as fh:
    data = pickle.load(fh)

# Data loading

In [3]:
# Unpack the loaded object into the feature collection X and the target
# collection y; `data` must therefore hold exactly two items.
X, y = data

In [4]:
# Coerce both the features and the targets to NumPy arrays in one step.
X, y = (np.asarray(part) for part in (X, y))

In [5]:
# Dimensions of the feature array.
X.shape

(381646, 62)

In [6]:
# Dimensions of the target array.
y.shape

(381646, 3)

In [79]:
# Peek at the first two feature rows.
X[:2]

array([[ 8.        ,  6.        ,  1.        ,  0.461     , -0.519     ,
        -1.25      ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , 73.51669472, 33.57068578,  0.        ,  0.        ,
         0.        ,  0.        , 33.57068578, 36.8581052 ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ],
       [ 6.        ,  6.        ,  1.        ,  0.579     ,  0.336     ,
         1.25   

In [80]:
# Peek at the first two target rows.
y[:2]

array([[-0.794,  0.502, -0.139],
       [-0.925,  0.496, -0.341]])

In [7]:
# Hold out 10% of the rows as a test set; the fixed random_state keeps the
# split repeatable between runs.
split_parts = train_test_split(X, y, random_state=42, test_size=0.1)
X_train, X_test, y_train, y_test = split_parts

In [82]:
# Dimensions of the training feature array.
X_train.shape

(343481, 62)

In [83]:
# Dimensions of the test feature array.
X_test.shape

(38165, 62)

# Linear Regression (Ordinary Least Squares)

In [84]:
# Build the linear model and fit it on the training split in one expression
# (`fit` returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)
# Keep the fitted estimator as the cell's displayed value.
model

In [85]:
# Report the fitted intercept and coefficient arrays, one per line.
print(f"intercept: {model.intercept_}", f"slope: {model.coef_}", sep="\n")

intercept: [ 0.04416486 -0.08964643  0.0249412 ]
slope: [[-9.30071768e-03  7.24249156e-03 -8.76380291e-03 -6.41438707e-01
  -5.12499691e-03 -6.04584313e-03  1.38609570e-03 -8.41220915e-03
  -6.89457267e-01 -5.92822538e-03 -2.87082773e-03  1.12844912e-03
  -2.48712413e-02 -7.08658581e-01 -5.93226822e-03 -3.51547530e-03
   1.19742910e+00  1.86032079e-01  1.05346423e+00 -1.49098261e+00
  -1.43804826e+00  5.03686531e-01  8.39477552e-02  1.29513650e+00
   2.31543615e+00  2.62329941e+00  2.52046574e-04 -5.25610075e-04
  -5.06321042e-05  4.83516524e-04  5.35530479e-01 -2.24286682e+00
  -5.25610078e-04 -1.04834792e-05  2.79952386e-04 -1.81386359e-04
   5.63596323e-01 -3.59245102e-01 -5.06320995e-05  2.79952390e-04
  -3.30823137e-05 -4.34310492e-04  5.11722332e-01  9.11922031e-01
   4.83516521e-04 -1.81386359e-04 -4.34310493e-04 -4.95107496e-06
  -2.59852607e+00 -3.59515347e-01  5.35530479e-01  5.63596323e-01
   5.11722332e-01 -2.59852607e+00  2.51049294e-02 -1.20051779e+00
  -2.24286682e+00 -3

In [86]:
# Predict on the training split so the fit can be scored on the data it was trained on.
y_pred_train = model.predict(X_train)

In [87]:
# Training-set R² of the fitted linear model.
r2_train = r2_score(y_true=y_train, y_pred=y_pred_train)
print(f'R² score: {r2_train}')

R² score: 0.80829497431986


In [88]:
# Predict on the held-out test split.
y_pred = model.predict(X_test)

In [89]:
# --- Compute every test-set metric first ---------------------------------
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Normalized variants: each error is divided by the mean magnitude
# (absolute value / square) of the true targets.
normalized_mae = mae / np.abs(y_test).mean()
normalized_mse = mse / np.square(y_test).mean()

# --- Then report them, in the same order as they were computed ------------
print(f'R² score: {r2}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Normalized Mean Absolute Error (Normalized MAE): {normalized_mae}')
print(f'Normalized Mean Squared Error (Normalized MSE): {normalized_mse}')


R² score: 0.8092941193825959
Mean Absolute Error (MAE): 0.1568410366948355
Mean Squared Error (MSE): 0.071456071636799
Normalized Mean Absolute Error (Normalized MAE): 0.2969969534144954
Normalized Mean Squared Error (Normalized MSE): 0.19069493429736906


In [20]:
# First two rows of the test-set predictions.
y_pred[:2]

array([[-0.05132291, -0.0154987 , -0.93118411],
       [ 0.8268751 ,  0.70030951,  0.37714775]])

In [21]:
# First two rows of the true test targets, for comparison with y_pred[:2].
y_test[:2]

array([[-0.007, -0.064, -1.078],
       [ 0.751,  0.676,  0.382]])

# Ridge Regression

In [9]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge

# Base estimator: Ridge with alpha=0.1, wrapped so that a separate copy is
# fitted for every target column.
# NOTE(review): X is passed in without any scaling step in this notebook and
# the penalty is scale-sensitive — confirm this is intended.
ridge_base = Ridge(alpha=0.1)
ridge_model = MultiOutputRegressor(ridge_base).fit(X_train, y_train)

# Score the held-out predictions with R².
y_pred_ridge = ridge_model.predict(X_test)
r2_ridge = r2_score(y_test, y_pred_ridge)
print(f'Ridge Regression Multi-Output R² score: {r2_ridge}')

Ridge Regression Multi-Output R² score: 0.8095698325398889


# Lasso Regression

In [10]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Lasso

# Base estimator: Lasso with alpha=0.01, wrapped so that a separate copy is
# fitted for every target column.
# NOTE(review): X is passed in without any scaling step in this notebook and
# the L1 penalty is scale-sensitive — confirm this is intended.
lasso_base = Lasso(alpha=0.01)
lasso_model = MultiOutputRegressor(lasso_base).fit(X_train, y_train)

# Score the held-out predictions with R².
y_pred_lasso = lasso_model.predict(X_test)
r2_lasso = r2_score(y_test, y_pred_lasso)
print(f'Lasso Regression Multi-Output R² score: {r2_lasso}')

Lasso Regression Multi-Output R² score: 0.80355153496684


# Polynomial Regression
- <span style="color:red">Takes too long to run; the cells below have no recorded results.</span>

### Order 2

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion: the transformer is fitted on the training split and the
# same fitted transformer is then applied to the test split.
poly2 = PolynomialFeatures(degree=2)
X_train_poly2, X_test_poly2 = poly2.fit_transform(X_train), poly2.transform(X_test)

# Linear model on the expanded features (`fit` returns the estimator itself).
model_poly2 = LinearRegression().fit(X_train_poly2, y_train)

# Held-out predictions and their R².
y_pred_poly2 = model_poly2.predict(X_test_poly2)
r2_poly2 = r2_score(y_test, y_pred_poly2)
print(f'Polynomial Regression Order 2 R² score: {r2_poly2}')

### Order 3

In [None]:
# Import locally so this cell does not depend on the "Order 2" cell having
# been run first (PolynomialFeatures is not imported anywhere earlier in the
# notebook, so skipping that cell would otherwise raise a NameError here).
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Polynomial regression of order 3
# NOTE(review): with the default include_bias=True, a degree-3 expansion of d
# input features produces C(d + 3, 3) columns (43,680 for d = 62), so the dense
# expanded training matrix may not fit in memory — confirm before running this
# on the full dataset.
poly3 = PolynomialFeatures(degree=3)
X_train_poly3 = poly3.fit_transform(X_train)
# Reuse the transformer fitted on the training split for the test split.
X_test_poly3 = poly3.transform(X_test)

# Fit the model
model_poly3 = LinearRegression()
model_poly3.fit(X_train_poly3, y_train)

# Predict on the test set
y_pred_poly3 = model_poly3.predict(X_test_poly3)

# Compute R² score
r2_poly3 = r2_score(y_test, y_pred_poly3)
print(f'Polynomial Regression Order 3 R² score: {r2_poly3}')

# Leaderboard (R² score)
- no descriptor, no extras
    - linear regression: 0.807
- no descriptor, sorted
    - linear regression: 0.808
- no descriptor, sorted, with descriptor appended at the end
    - linear regression: 0.809
    - advanced neural network: 0.862