# Hyperparameter Tuning for Chemical Bonds Prediction

## 1. Data Loading and Preprocessing

In [1]:

import pandas as pd

# Load the training and test tables from the working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Impute missing AlogP values.
# The median is computed on the training set only and reused for the test set,
# so no statistic derived from test rows is used for imputation.
median_value = train_data['AlogP'].median()

# Assign the filled column back instead of calling
# `df['AlogP'].fillna(..., inplace=True)`: the chained in-place form operates on
# an intermediate Series, emits a FutureWarning on pandas 2.2 and does not
# modify the DataFrame at all once copy-on-write is enabled (pandas 3).
train_data['AlogP'] = train_data['AlogP'].fillna(median_value)
test_data['AlogP'] = test_data['AlogP'].fillna(median_value)


## 2. Feature Engineering

In [2]:

from sklearn.preprocessing import PolynomialFeatures

# Build the feature matrices.
# Identifier columns are removed from both tables; the two target columns
# (MLM, HLM) are additionally removed from the training table.
X = train_data.drop(['id', 'SMILES', 'MLM', 'HLM'], axis=1)
X_test = test_data.drop(['id', 'SMILES'], axis=1)

# Degree-2 expansion restricted to interaction terms (no squared terms) and
# without the constant bias column.
poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)

# Fit the expansion on the training features only, then apply the same fitted
# transformer to both tables so their columns line up.
poly.fit(X)
X_poly = poly.transform(X)
X_test_poly = poly.transform(X_test)


## 3. Hyperparameter Tuning

In [3]:

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor

import warnings

# Define target variables
y_mlm = train_data['MLM']
y_hlm = train_data['HLM']

# Reduced hyperparameters for demonstration: every grid holds a single
# candidate, so each search evaluates exactly one configuration.
param_dist_rf = {
    'n_estimators': [100],
    'max_depth': [10]
}

param_dist_gbr = {
    'n_estimators': [100],
    'learning_rate': [0.1]
}

param_dist_cat = {
    'iterations': [100],
    'learning_rate': [0.1],
    'depth': [10]
}


def run_param_search(estimator, param_distributions, X, y, label, n_iter=1):
    """Run a 3-fold randomized search and return ``(search, best_params)``.

    Parameters
    ----------
    estimator : regressor instance handed to ``RandomizedSearchCV``.
    param_distributions : dict mapping parameter name to a non-empty list of
        candidate values.
    X, y : training features and target passed to ``search.fit``.
    label : short name used in the warning message if the search fails.
    n_iter : number of sampled candidates (default 1, matching the
        single-candidate grids above).

    Returns
    -------
    tuple
        The ``RandomizedSearchCV`` object (unfitted if the search failed) and
        the selected parameters. If fitting raises, the parameters fall back
        to the first candidate of every grid entry and a warning reports the
        cause, so a failure is visible rather than silent.
    """
    search = RandomizedSearchCV(
        estimator,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        random_state=42,  # keeps candidate sampling reproducible if the grids grow
    )
    try:
        search.fit(X, y)
    except Exception as exc:
        # `Exception` (not a bare `except:`) lets KeyboardInterrupt/SystemExit
        # propagate, so a long-running search can still be interrupted.
        fallback_params = {
            name: candidates[0] for name, candidates in param_distributions.items()
        }
        warnings.warn(
            f"Hyperparameter search for {label} failed ({exc!r}); "
            f"falling back to {fallback_params}",
            RuntimeWarning,
        )
        return search, fallback_params
    return search, search.best_params_


# One search per (model family, target) pair.
rf_mlm_search, best_rf_mlm_params = run_param_search(
    RandomForestRegressor(random_state=42), param_dist_rf, X_poly, y_mlm, 'RandomForest/MLM')
rf_hlm_search, best_rf_hlm_params = run_param_search(
    RandomForestRegressor(random_state=42), param_dist_rf, X_poly, y_hlm, 'RandomForest/HLM')

gbr_mlm_search, best_gbr_mlm_params = run_param_search(
    GradientBoostingRegressor(random_state=42), param_dist_gbr, X_poly, y_mlm, 'GradientBoosting/MLM')
gbr_hlm_search, best_gbr_hlm_params = run_param_search(
    GradientBoostingRegressor(random_state=42), param_dist_gbr, X_poly, y_hlm, 'GradientBoosting/HLM')

cat_mlm_search, best_cat_mlm_params = run_param_search(
    CatBoostRegressor(random_state=42, verbose=0), param_dist_cat, X_poly, y_mlm, 'CatBoost/MLM')
cat_hlm_search, best_cat_hlm_params = run_param_search(
    CatBoostRegressor(random_state=42, verbose=0), param_dist_cat, X_poly, y_hlm, 'CatBoost/HLM')

# Display the selected parameters as the cell output.
best_rf_mlm_params, best_rf_hlm_params, best_gbr_mlm_params, best_gbr_hlm_params, best_cat_mlm_params, best_cat_hlm_params


({'n_estimators': 100, 'max_depth': 10},
 {'n_estimators': 100, 'max_depth': 10},
 {'n_estimators': 100, 'learning_rate': 0.1},
 {'n_estimators': 100, 'learning_rate': 0.1},
 {'learning_rate': 0.1, 'iterations': 100, 'depth': 10},
 {'learning_rate': 0.1, 'iterations': 100, 'depth': 10})

## 4. Model Training with Best Hyperparameters

In [4]:
from sklearn.ensemble import GradientBoostingRegressor
# Refit every model family on the full training matrix, once per target,
# using the hyperparameters selected by the searches. `fit` returns the
# estimator itself, so construction and fitting are chained.

# Random Forest
rf_mlm_best = RandomForestRegressor(random_state=42, **best_rf_mlm_params).fit(X_poly, y_mlm)
rf_hlm_best = RandomForestRegressor(random_state=42, **best_rf_hlm_params).fit(X_poly, y_hlm)

# Gradient Boosting
gbr_mlm_best = GradientBoostingRegressor(random_state=42, **best_gbr_mlm_params).fit(X_poly, y_mlm)
gbr_hlm_best = GradientBoostingRegressor(random_state=42, **best_gbr_hlm_params).fit(X_poly, y_hlm)

# CatBoost (verbose=0 silences the per-iteration training log)
cat_mlm_best = CatBoostRegressor(random_state=42, verbose=0, **best_cat_mlm_params).fit(X_poly, y_mlm)
cat_hlm_best = CatBoostRegressor(random_state=42, verbose=0, **best_cat_hlm_params).fit(X_poly, y_hlm)

# Show the last fitted model as the cell output.
cat_hlm_best


<catboost.core.CatBoostRegressor at 0x26d5e2b3a50>

## 5. Predictions and Submission Creation

In [5]:

# Score the test matrix with each fitted model, one prediction array per
# (model, target) pair.
rf_mlm_preds, rf_hlm_preds = (
    rf_mlm_best.predict(X_test_poly),
    rf_hlm_best.predict(X_test_poly),
)
gbr_mlm_preds, gbr_hlm_preds = (
    gbr_mlm_best.predict(X_test_poly),
    gbr_hlm_best.predict(X_test_poly),
)
cat_mlm_preds, cat_hlm_preds = (
    cat_mlm_best.predict(X_test_poly),
    cat_hlm_best.predict(X_test_poly),
)

# Ensemble: unweighted mean of the three models' predictions for each target.
final_pred_mlm = sum([rf_mlm_preds, gbr_mlm_preds, cat_mlm_preds]) / 3
final_pred_hlm = sum([rf_hlm_preds, gbr_hlm_preds, cat_hlm_preds]) / 3

# Assemble the submission table: the test ids followed by the two averaged
# prediction columns, in the order id, MLM, HLM.
submission = test_data[['id']].assign(
    MLM=final_pred_mlm,
    HLM=final_pred_hlm,
)

# Write the submission without the DataFrame index column.
submission.to_csv('submission.csv', index=False)
