In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors as rdkit_Descriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Load the cleaned dataset into a DataFrame. The path is relative, so the CSV
# is resolved against the current working directory. Later code reads the
# 'unsat_SMILE' and 'delta_H' columns from `data`.
file_path = 'gdb9_G4MP2_withdata_hydrogenation_clean.csv'
data = pd.read_csv(file_path)

def compute_all_descriptors(smiles):
    """Compute every descriptor in ``rdkit_Descriptors.descList`` for a molecule.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        list: One value per ``(name, function)`` entry of
        ``rdkit_Descriptors.descList``, in the order of that list.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None rather than
        # raising; fail here with the offending input instead of letting the
        # first descriptor function choke on None.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    return [descriptor_fn(mol) for _, descriptor_fn in rdkit_Descriptors.descList]
    
# Build the feature matrix: one row per entry of the unsat_SMILE column, one
# column per RDKit descriptor. compute_all_descriptors returns its values in
# rdkit_Descriptors.descList order, so the names from that same list label the
# columns and keep every feature traceable to its descriptor.
descriptor_names = [name for name, _ in rdkit_Descriptors.descList]
X = pd.DataFrame(
    [compute_all_descriptors(smiles) for smiles in data['unsat_SMILE']],
    columns=descriptor_names,
)

# Regression target, read from the delta_H column.
y = data['delta_H']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reduce the scaled features to a few principal components. The projection is
# fitted on the training rows and reused unchanged for the test rows.
# NOTE(review): PCA rejects NaN/inf input; some RDKit descriptors may produce
# such values for unusual molecules - confirm this dataset yields none.
n_components = 10  # You can adjust this value
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Fit a linear regression on the principal-component scores of the training set
model = LinearRegression().fit(X_train_pca, y_train)

# Predict the held-out rows and report the mean squared error
y_pred = model.predict(X_test_pca)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error on Test Set: {mse}')

# Parity plot: predicted against actual test-set values
plt.scatter(y_test, y_pred, alpha=0.5)

# Dashed y = x reference line spanning the observed range of actual values;
# points on this line are perfect predictions.
parity_limits = [y_test.min(), y_test.max()]
plt.plot(parity_limits, parity_limits, 'k--', lw=2)

plt.xlabel('Actual Hydrogenation Enthalpy')
plt.ylabel('Predicted Hydrogenation Enthalpy')
plt.title('Actual vs. Predicted Hydrogenation Enthalpy using Linear Regression')
plt.show()
