<a href="https://colab.research.google.com/github/Gressling/notebooks/blob/main/Compare_CatBoost%2C_XGBoost%2C_and_LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# title: Compare CatBoost, XGBoost, and LightGBM (on a synthetic dataset of molecular properties)
# author: Gressling, T                               # license: MIT License
# code: github.com/gressling/notebooks               # activity: single example
# indices: [wiki.Q5591907]

In [None]:
import pandas as pd
import numpy as np

# Configuration: the seed makes the generated dataset reproducible, and the
# dataset dimensions live in one place so the feature matrix and the column
# names cannot drift apart.
SEED = 42
n_samples = 1000     # number of rows to generate
n_descriptors = 10   # number of molecular descriptor (feature) columns

np.random.seed(SEED)

# Generate synthetic data: uniform random descriptors in [0, 1)
X = np.random.rand(n_samples, n_descriptors)

# Target is the row-wise sum of all descriptors plus Gaussian noise (std 0.1).
# NOTE: the order of the two RNG calls (rand, then normal) determines the
# generated values, so keep it as is.
y = np.sum(X, axis=1) + np.random.normal(0, 0.1, n_samples)

# Create a DataFrame; column names are derived from the actual width of X
columns = [f'descriptor_{i+1}' for i in range(X.shape[1])]
data = pd.DataFrame(X, columns=columns)
data['target'] = y

# Save to CSV so the modelling cell can reload the exact same dataset
data.to_csv('molecular_properties.csv', index=False)

# Display the first few rows of the dataset
print(data.head())

   descriptor_1  descriptor_2  descriptor_3  descriptor_4  descriptor_5  \
0      0.374540      0.950714      0.731994      0.598658      0.156019   
1      0.020584      0.969910      0.832443      0.212339      0.181825   
2      0.611853      0.139494      0.292145      0.366362      0.456070   
3      0.607545      0.170524      0.065052      0.948886      0.965632   
4      0.122038      0.495177      0.034389      0.909320      0.258780   

   descriptor_6  descriptor_7  descriptor_8  descriptor_9  descriptor_10  \
0      0.155995      0.058084      0.866176      0.601115       0.708073   
1      0.183405      0.304242      0.524756      0.431945       0.291229   
2      0.785176      0.199674      0.514234      0.592415       0.046450   
3      0.808397      0.304614      0.097672      0.684233       0.440152   
4      0.662522      0.311711      0.520068      0.546710       0.184854   

     target  
0  5.052581  
1  3.840160  
2  4.042754  
3  4.975320  
4  4.156834  


In [None]:
!pip install catboost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb


def _fit_and_score(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : object
        Regressor exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.
    X_train, y_train
        Features and target used for fitting.
    X_test, y_test
        Held-out features and target used for evaluation.

    Returns
    -------
    tuple
        ``(preds, mse)``: the predictions for ``X_test`` and their mean
        squared error against ``y_test``.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return preds, mean_squared_error(y_test, preds)


# Load dataset (the CSV written by the data-generation cell)
data = pd.read_csv('molecular_properties.csv')
X = data.drop(columns=['target'])
y = data['target']

# Split dataset: 80% train / 20% test, fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# All three models use library defaults so the comparison is like-for-like.

# CatBoost (verbose=0 turns off its training log output)
cat_model = CatBoostRegressor(verbose=0)
cat_preds, cat_mse = _fit_and_score(cat_model, X_train, y_train, X_test, y_test)

# XGBoost
xgb_model = xgb.XGBRegressor()
xgb_preds, xgb_mse = _fit_and_score(xgb_model, X_train, y_train, X_test, y_test)

# LightGBM
lgb_model = lgb.LGBMRegressor()
lgb_preds, lgb_mse = _fit_and_score(lgb_model, X_train, y_train, X_test, y_test)


In [None]:
# Report the test-set mean squared error of each model, one per line.
print(
    f"CatBoost MSE: {cat_mse}",
    f"XGBoost MSE: {xgb_mse}",
    f"LightGBM MSE: {lgb_mse}",
    sep="\n",
)

CatBoost MSE: 0.03005122531829603
XGBoost MSE: 0.155008613162719
LightGBM MSE: 0.09856335735167443
