In [34]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.metrics import make_scorer, mean_squared_error, explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


In [35]:
def load_dataset(filepath):
    """Read the Excel workbook at ``filepath`` and return it as a DataFrame.

    The path is handed straight to ``pd.read_excel`` with default options.
    """
    return pd.read_excel(filepath)

In [36]:
def calculate_maccs_fingerprint(smiles):
    """Convert a SMILES string into its MACCS-keys fingerprint.

    Parameters
    ----------
    smiles : str
        SMILES representation of a single molecule. Missing or non-string
        values (e.g. NaN coming from an empty spreadsheet cell) are treated
        the same way as an unparsable SMILES.

    Returns
    -------
    pandas.Series
        One integer (0 or 1) per fingerprint bit. For invalid input a warning
        is printed and an all-zero vector of the same length is returned, so
        every row yields the same number of feature columns.
    """
    # RDKit's MACCS bit vector has 167 positions: keys 1-166 plus an unused
    # bit 0. The fallback must match that length, otherwise rows built from
    # invalid SMILES end up with NaN in the last feature column.
    n_bits = 167

    # Only strings are handed to RDKit; anything else counts as invalid.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        print(f"Invalid SMILES: {smiles}")
        return pd.Series([0] * n_bits)

    fingerprint = rdMolDescriptors.GetMACCSKeysFingerprint(mol)
    return pd.Series([int(bit) for bit in fingerprint.ToBitString()])


In [37]:
def prepare_data(df, smiles_column, training_column, target_column):
    """Build the feature matrix and target vector from the raw table.

    Each entry of ``df[smiles_column]`` is expanded into fingerprint columns
    by ``calculate_maccs_fingerprint``; those columns are appended to ``df``,
    then the SMILES and training-flag columns are removed.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is every remaining column except
        ``target_column`` and ``y`` is ``target_column`` itself.
    """
    # apply() with a Series-returning function yields one column per bit.
    fp_frame = df[smiles_column].apply(calculate_maccs_fingerprint)

    combined = pd.concat([df, fp_frame], axis=1).drop(
        columns=[smiles_column, training_column]
    )

    features = combined.drop(columns=[target_column])
    target = combined[target_column]
    return features, target

In [38]:
#Split into training and test set
def split_data(X, Y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions.

    Delegates to ``train_test_split`` and then converts the column labels of
    both feature partitions to strings.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    # Fingerprint columns carry integer labels; make every label a string.
    for part in (X_train, X_test):
        part.columns = part.columns.astype(str)
    return X_train, X_test, Y_train, Y_test

In [39]:
#Standard Scaler
def scale_data(X_train, X_test):
    """Standardise both partitions with a scaler fitted on the training set.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)`` as produced by ``StandardScaler``.
    """
    # Fit on the training partition only, then apply the same transform to both.
    fitted = StandardScaler().fit(X_train)
    return fitted.transform(X_train), fitted.transform(X_test)


In [40]:
def _default_models():
    """Return a fresh dict of unfitted regressors keyed by display name."""
    return {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0, solver='auto'),
        'Lasso Regression': Lasso(alpha=0.01, max_iter=1000),
        'Elastic Net': ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=1000),
        'Random Forest': RandomForestRegressor(n_estimators=1000, max_depth=10, min_samples_split=2, random_state=42),
        'SVR': SVR(kernel='rbf', C=1.0, epsilon=0.1),
        'K-Neighbors Regressor': KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto'),
        'XGBoost': xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42),
        'CatBoost': CatBoostRegressor(learning_rate=0.1, iterations=1000, depth=6, silent=True),
        'GradientBoost': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, subsample=0.8, random_state=42)
    }


def evaluate_models(X_train, X_test, Y_train, Y_test, models=None):
    """Fit each model on the training data and score it on the test data.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for prediction.
    Y_train, Y_test : array-like
        Matching target values.
    models : dict, optional
        Mapping of display name to an unfitted estimator exposing ``fit`` and
        ``predict``. The estimators are fitted in place. When omitted, the
        built-in set of regressors from ``_default_models`` is used.

    Returns
    -------
    dict
        ``{name: {'R2': ..., 'MSE': ..., 'RMSE': ...}}`` for every model.
        The same dict is also printed.
    """
    if models is None:
        models = _default_models()

    results = {}
    for name, model in models.items():
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)

        mse = mean_squared_error(Y_test, Y_pred)
        results[name] = {
            'R2': r2_score(Y_test, Y_pred),
            'MSE': mse,
            'RMSE': np.sqrt(mse),
        }
    print("Results from evaluate_models:", results)
    return results

In [41]:
"""ChildProcessErrordef cross_validate_models(X, Y, models, cv=10):
    results = {}
    for name, model in models.items():
        # Cross-validation for R2, MSE, RMSE
        r2 = cross_val_score(model, X, Y, cv=cv, scoring='r2')
        mse = cross_val_score(model, X, Y, cv=cv, scoring=make_scorer(mean_squared_error))
        rmse = np.sqrt(mse)
        
        # Store average scores
        results[name] = {
            'mean_R2': np.mean(r2),
            'mean_MSE': np.mean(mse),
            'mean_RMSE': np.mean(rmse)
        }
    
    return results"""

def cross_validate_models_loo(X, Y, models):
    """Score each model with leave-one-out cross-validation.

    Every sample is predicted once by a model trained on all other samples.
    The metrics are then computed over the pooled out-of-fold predictions.
    Scoring fold by fold does not work here: with a single held-out sample
    the explained variance is degenerate (the variance of one value is zero),
    and the square root of a one-sample squared error is the absolute error,
    so averaging it gives the MAE rather than the RMSE.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    Y : array-like
        Target values aligned with the rows of ``X``.
    models : dict
        Mapping of display name to an unfitted estimator.

    Returns
    -------
    dict
        ``{name: {'mean_MSE', 'mean_RMSE', 'mean_Explained_Variance'}}``.
        The key names are kept for compatibility with existing callers;
        'mean_MSE' is the mean squared error over all held-out samples,
        'mean_RMSE' its square root, and 'mean_Explained_Variance' the
        explained variance of the pooled predictions.
    """
    loo = LeaveOneOut()
    y_true = np.asarray(Y)

    results = {}
    for name, model in models.items():
        # One cross-validation pass per model: out-of-fold prediction for every sample.
        y_pred = cross_val_predict(model, X, Y, cv=loo)

        mse = mean_squared_error(y_true, y_pred)
        results[name] = {
            'mean_MSE': mse,
            'mean_RMSE': np.sqrt(mse),
            'mean_Explained_Variance': explained_variance_score(y_true, y_pred),
        }

    return results

In [42]:
def export_results_to_csv(results, filepath):
    """Write a ``{model: {metric: value}}`` mapping to ``filepath`` as CSV.

    Each outer key becomes a row label (written as the first CSV column) and
    each inner key becomes a column. A confirmation line is printed afterwards.
    """
    # orient='index' makes the outer keys the row labels; index=True keeps them in the file.
    pd.DataFrame.from_dict(results, orient='index').to_csv(filepath, index=True)
    print(f"Results exported to {filepath}")

In [43]:
def main(filepath, smiles_column, training_column, target_column,
         cv_output_filepath=None, test_output_filepath=None):
    """Run the full pipeline: load, featurise, split, scale, evaluate, report.

    Parameters
    ----------
    filepath : str
        Path of the Excel workbook passed to ``load_dataset``.
    smiles_column : str
        Column holding the SMILES strings.
    training_column : str
        Column that is dropped before modelling.
    target_column : str
        Column used as the regression target.
    cv_output_filepath : str, optional
        When given, the leave-one-out results are written there as CSV.
    test_output_filepath : str, optional
        When given, the hold-out test results are written there as CSV.
    """
    # Step 1: Load dataset
    df = load_dataset(filepath)

    # Step 2 & 3: Prepare the data (convert SMILES and separate X, Y)
    X, Y = prepare_data(df, smiles_column, training_column, target_column)

    # Step 4: Split data into train and test sets
    X_train, X_test, Y_train, Y_test = split_data(X, Y)

    # Step 5: Scale the data
    X_train_scaled, X_test_scaled = scale_data(X_train, X_test)

    # Step 6: Fit on the training partition and score on the held-out test partition
    test_results = evaluate_models(X_train_scaled, X_test_scaled, Y_train, Y_test)

    # Step 7: Leave-one-out cross-validation on the training partition.
    # NOTE: the scaler above was fitted on the whole training partition, so
    # each held-out sample has already contributed to the scaling statistics.
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0, solver='auto'),
        'Lasso Regression': Lasso(alpha=0.01, max_iter=1000),
        'Elastic Net': ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=1000),
        'Random Forest': RandomForestRegressor(n_estimators=1000, max_depth=10, min_samples_split=2, random_state=42),
        'SVR': SVR(kernel='rbf', C=1.0, epsilon=0.1),
        'K-Neighbors Regressor': KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto'),
        'XGBoost': xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42),
        'CatBoost': CatBoostRegressor(learning_rate=0.1, iterations=1000, depth=6, silent=True),
        'GradientBoost': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, subsample=0.8, random_state=42)
    }
    cv_results = cross_validate_models_loo(X_train_scaled, Y_train, models)

    # The cross-validation metric is explained variance, so it is labelled as such (not as R2).
    for model_name, metrics in cv_results.items():
        print(f"Model: {model_name}, Explained Variance: {metrics['mean_Explained_Variance']:.4f}, MSE: {metrics['mean_MSE']:.4f}, RMSE: {metrics['mean_RMSE']:.4f}")

    if cv_output_filepath is not None:
        export_results_to_csv(cv_results, cv_output_filepath)

    # Display hold-out test performance
    for model_name, metrics in test_results.items():
        print(f"Model: {model_name}, R2: {metrics['R2']:.4f}, MSE: {metrics['MSE']:.4f}, RMSE: {metrics['RMSE']:.4f}")

    if test_output_filepath is not None:
        export_results_to_csv(test_results, test_output_filepath)

In [None]:
if __name__ == "__main__":
    # Column names used by the pipeline.
    smiles_column = "SMILES"
    training_column = "Eq4-training"
    target_column = "E7"

    # Path to the Excel file with the dataset.
    filepath = r"C:\Users\matic\OneDrive\Desktop\Končne tabele t QSAR\CLEAN CLEAN\#testtes_clean.xlsx"

    main(
        filepath=filepath,
        smiles_column=smiles_column,
        training_column=training_column,
        target_column=target_column,
    )
    