# PINL

### Predicting molecular boiling points using ML

# data Preparation

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
df = pd.read_csv("chem_data.csv")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.dropna()
df

Unnamed: 0,BP,Subset,D002,D003,D004,D005,D006,D007,D008,D009,...,D769,D770,D771,D772,D773,D774,D775,D776,D777,D001
0,-19.1,Train,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1.00000,-0.315465,0.000000,-0.959000,0
1,301.0,Train,0,0,0,1,0,0,0,0,...,0,0,0,0,0,3.90689,-0.020333,0.545455,2.886720,0
2,83.0,Train,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.00000,-0.668418,0.000000,0.734612,0
3,218.0,Train,0,0,0,1,0,0,0,0,...,0,0,0,0,0,3.90689,1.490470,0.500000,1.061310,0
4,250.0,Train,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4.08746,-0.195345,0.000000,-2.210160,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5426,159.7,Blind,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.00000,-0.960253,0.000000,3.911450,0
5427,85.0,Blind,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3.00000,-0.920782,0.000000,2.351650,0
5428,170.0,Blind,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.00000,-0.520919,0.000000,0.557290,0
5429,162.0,Blind,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3.32193,-0.864227,0.000000,2.446070,0


## Correlation

In [None]:
(df.drop(['Subset'], axis=1)).corr()['BP']

## Graphs

In [None]:
names = {
    "D004" : "Molecular Weight" ,
    "D368" : "LogP (octanol water partition coefficient)" ,
    "D294" : "TPSA (Topological Polar Surface Area)" ,
    "D555" : "Number of Hydrogen Bond Donors" ,
    "D556" : "Number of Hydrogen Bond Acceptors" ,
    "D557" : "Number of Rotatable Bonds" ,
    "D553" : "Number of Ring Atoms" ,
    "D559" : "Number of Aromatic Rings" ,
    "D370" : "AlogP" ,
    "D560" : "Formal Charge" ,
    "D373" : "Fraction Csp3"
}

for i in names.keys():
    plt.scatter(df[i], df['BP'])
    plt.xlabel(names[i])
    plt.ylabel("Boiling Point")
    plt.title("Scatter Plot")
    plt.show()

## Splitting Dataset

In [3]:
train = df.loc[df['Subset'] == 'Train']
train = train.drop(["Subset"], axis=1)
val = df.loc[df['Subset'] == 'Validation']
val = val.drop(['Subset'], axis=1)
test = df.loc[df['Subset'] == 'Blind']
test = test.drop(["Subset"], axis=1)

In [4]:
X_train = train.drop(['BP'], axis=1)
y_train = train['BP']
X_val = val.drop(['BP'], axis=1)
y_val = val['BP']
X_test = test.drop(['BP'], axis=1)
y_test = test['BP']

## Scaling Data

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)
X_test_scaled = scaler.fit_transform(X_test)

# Linear Regression (TurboGlitch)

In [6]:
import pandas as pd
import seaborn as sns

cor = df.drop(columns=["Subset"])
cor.corr()["BP"]


from sklearn.linear_model import LinearRegression

predictors = ["D132", "D439", "D440", "D410", "D409", "D441", "D442", "D443", "D228", "D229","D256", "D254", "D163",  "D138", "D140", "D143", "D151", "D136", "D005", "D024", "D201", "D203", "D589", "D590", "D591", "D592", "D593", "D594", "D595", "D204", "D172", "D168", "D127", "D012", "D018", "D212", "D225", "D294", "D555", "D553", "D559", "D370", "D560", "D590", "D582", "D583", "D584", "D585", "D586"]
target = ["BP"]

reg = LinearRegression()
reg.fit(X_train[predictors], y_train)
predictions = reg.predict(X_test[predictors])
X_test["predictions"] = predictions
display(X_val)

df.describe()["BP"]

Unnamed: 0,D002,D003,D004,D005,D006,D007,D008,D009,D010,D011,...,D769,D770,D771,D772,D773,D774,D775,D776,D777,D001
4074,0,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,5.16993,-0.983974,1.000000,5.550630,0
4075,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,3.70044,-0.245218,0.600000,2.241880,0
4076,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2.80735,-0.534734,0.000000,-0.249608,0
4077,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.00000,-0.179649,0.000000,2.226400,0
4078,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.00000,0.582944,0.000000,-0.241238,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.00000,-0.960253,0.000000,3.911450,0
5157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3.58496,-0.964984,0.000000,4.057920,0
5158,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.00000,-0.534734,0.000000,3.451330,0
5159,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3.45943,-0.534734,0.000000,-0.020581,0


Unnamed: 0,BP
count,5431.0
mean,188.938182
std,85.016591
min,-88.6
25%,133.0
50%,189.3
75%,245.0
max,548.0


In [7]:
lr_mae = mean_absolute_error(y_test, X_test["predictions"])
lr_mse = mean_squared_error(y_test, X_test["predictions"])
lr_r2 = r2_score(y_test, X_test["predictions"])
X_test.drop(columns=["predictions"], inplace=True)

In [None]:
pd.set_option('display.min_rows', None)
cor.corr()["BP"]

# Other models (Mujtaba-4T4)

### Ridge Regression

In [8]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=10)
ridge.fit(X_train_scaled, y_train)

In [9]:
ridge_pred = ridge.predict(X_test_scaled)
ridge_mae = mean_absolute_error(y_test, ridge_pred)
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_r2 = r2_score(y_test, ridge_pred)

### Lasso Regression

In [10]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)

  model = cd_fast.enet_coordinate_descent(


In [11]:
lasso_pred = lasso.predict(X_test_scaled)
lasso_mae = mean_absolute_error(y_test, lasso_pred)
lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_r2 = r2_score(y_test, lasso_pred)

### ElasticNet Regression

In [12]:
from sklearn.linear_model import ElasticNet
enet = ElasticNet(alpha=0.1, l1_ratio=0.9)
enet.fit(X_train_scaled, y_train)

  model = cd_fast.enet_coordinate_descent(


In [13]:
enet_pred = enet.predict(X_test_scaled)
enet_mae = mean_absolute_error(y_test, enet_pred)
enet_mse = mean_squared_error(y_test, enet_pred)
enet_r2 = r2_score(y_test, enet_pred)

### K-Nearest Neighbors Regressor

In [14]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5, p=1 , weights='distance')
knn.fit(X_train_scaled, y_train)

In [15]:
knn_predictions = knn.predict(X_test_scaled)
knn_mae = mean_absolute_error(y_test, knn_predictions)
knn_mse = mean_squared_error(y_test, knn_predictions)
knn_r2 = r2_score(y_test, knn_predictions)

### Support Vector Regressor

In [16]:
from sklearn.svm import SVR
svr = SVR(kernel='linear', gamma='scale', C=1)
svr.fit(X_train_scaled, y_train)

In [17]:
svr_predictions = svr.predict(X_test_scaled)
svr_mae = mean_absolute_error(y_test, svr_predictions)
svr_mse = mean_squared_error(y_test, svr_predictions)
svr_r2 = r2_score(y_test, svr_predictions)

### Decision Tree Regressor


In [18]:
from sklearn.tree import DecisionTreeRegressor
DTR = DecisionTreeRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=2, random_state=42)
DTR.fit(X_train, y_train)

In [19]:
dtr_predictions = DTR.predict(X_test)
dtr_mae = mean_absolute_error(y_test, dtr_predictions)
dtr_mse = mean_squared_error(y_test, dtr_predictions)
dtr_r2 = r2_score(y_test, dtr_predictions)

### Random Forest Regressor

In [20]:
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(n_estimators=100, random_state=42)
RFR.fit(X_train, y_train)

In [21]:
rfr_predictions = RFR.predict(X_test)
rfr_mae = mean_absolute_error(y_test, rfr_predictions)
rfr_mse = mean_squared_error(y_test, rfr_predictions)
rfr_r2 = r2_score(y_test, rfr_predictions)

### Gradient Boosting Regressor

In [22]:
from sklearn.ensemble import GradientBoostingRegressor
GBR = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=3, random_state=42)
GBR.fit(X_train, y_train)

In [23]:
gbr_predictions = GBR.predict(X_test)
gbr_mae = mean_absolute_error(y_test, gbr_predictions)
gbr_mse = mean_squared_error(y_test, gbr_predictions)
gbr_r2 = r2_score(y_test, gbr_predictions)

### Hypertuning models

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

models = {
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "ElasticNet Regression": ElasticNet(),
    "K-Nearest Neighbors Regressor": KNeighborsRegressor(),
    "Support Vector Regressor": SVR(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
}

In [None]:
param_grids = {
    "Ridge Regression": {
        'alpha': [0.1, 1.0, 10.0]
    },
    "Lasso Regression": {
        'alpha': [0.01, 0.1, 1.0]
    },
    "ElasticNet Regression": {
        'alpha': [0.01, 0.1, 1.0],
        'l1_ratio': [0.1, 0.5, 0.9]
    },
    "K-Nearest Neighbors Regressor": {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'p': [1, 2]  # p=1: Manhattan, p=2: Euclidean
    },
    "Support Vector Regressor": {
        'kernel': ['linear', 'rbf'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto']
    },
    "Decision Tree Regressor": {
        'max_depth': [5, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 4]
    },
    "Random Forest Regressor": {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_split': [2, 5]
    },
    "Gradient Boosting Regressor": {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 10]
    }
}

In [None]:
best_models = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    grid = GridSearchCV(model, param_grids[name], cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid.fit(X_train, y_train)

    print(f"Best params for {name}: {grid.best_params_}")
    print(f"Best (neg MAE) score: {grid.best_score_}")
    best_models[name] = grid.best_estimator_

# Results

In [26]:
results = {
    "Model": [
        "Linear Regression", "Ridge", "Lasso", "ElasticNet",
        "KNN", "SVR", "Decision Tree", "Random Forest", "Gradient Boosting"
    ],
    "MAE": [
        lr_mae, ridge_mae, lasso_mae, enet_mae,
        knn_mae, svr_mae, dtr_mae, rfr_mae, gbr_mae
    ],
    "MSE": [
        lr_mse, ridge_mse, lasso_mse, enet_mse,
        knn_mse, svr_mse, dtr_mse, rfr_mse, gbr_mse
    ],
    "R²": [
        lr_r2, ridge_r2, lasso_r2, enet_r2,
        knn_r2, svr_r2, dtr_r2, rfr_r2, gbr_r2
    ]
}

results_df = pd.DataFrame(results)

In [27]:
results_df

Unnamed: 0,Model,MAE,MSE,R²
0,Linear Regression,25.600227,1213.775583,0.816506
1,Ridge,16.166587,519.434157,0.921474
2,Lasso,14.271958,430.534707,0.934913
3,ElasticNet,14.163653,429.01263,0.935143
4,KNN,19.644389,781.713758,0.881823
5,SVR,14.573589,441.363078,0.933276
6,Decision Tree,20.681794,923.344905,0.860412
7,Random Forest,14.599506,470.157744,0.928923
8,Gradient Boosting,14.791234,464.627743,0.929759
