# Random forest incremental learning + Online learning

Working on the entire ChEMBL dataset: incremental learning + chunk-wise CV.

`/home/jovyan/proj-liujing/Data_preparation/pIC50_chembl.csv`

`/home/jovyan/proj-liujing/Data_preparation/target_to_index_dictionary_unsorted.txt`

`/home/jovyan/proj-liujing/Data_preparation/test_df.csv`

`/home/jovyan/proj-liujing/Data_preparation/train_df.csv`

`/home/jovyan/proj-liujing/Data_preparation/val_df.csv`

In [20]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.utils import shuffle
from rdkit import Chem
from rdkit.Chem import AllChem
import joblib
import gc
from itertools import product
import os


In [2]:
def smiles_to_ecfp(smiles_list, radius=2, n_bits=2048):
    """Convert an iterable of SMILES strings to ECFP (Morgan) bit fingerprints.

    Parameters
    ----------
    smiles_list : iterable
        SMILES strings, one per molecule. Entries that are not ``str``
        (e.g. ``None`` or NaN read from a CSV) are treated as unparseable.
    radius : int, default 2
        Morgan fingerprint radius passed to RDKit.
    n_bits : int, default 2048
        Length of each fingerprint bit vector.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(smiles_list), n_bits)``. Rows for
        SMILES that could not be parsed are all zeros. An empty input yields
        an array of shape ``(0, n_bits)`` so it can still be stacked
        column-wise with other feature blocks.
    """
    # NOTE(review): newer RDKit releases appear to deprecate
    # GetMorganFingerprintAsBitVect in favour of rdFingerprintGenerator;
    # confirm against the installed RDKit version before migrating.
    ecfp_features = []
    for smi in smiles_list:
        # Only hand real strings to RDKit; anything else counts as a parse failure.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            ecfp_features.append(list(fp))
        else:
            # Parsing failed: fill the row with zeros.
            ecfp_features.append([0] * n_bits)

    if not ecfp_features:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D.
        return np.zeros((0, n_bits), dtype=int)
    return np.array(ecfp_features)

In [None]:
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import shuffle
import os

def train_rf_online(train_files, model_save_path="rf_model.pkl",
                    n_estimators_per_chunk=50, max_depth=10, ecfp_bits=2048,
                    n_targets=6143):
    """Grow a RandomForestRegressor chunk by chunk using ``warm_start``.

    For every file in ``train_files`` the forest's ``n_estimators`` is raised
    by ``n_estimators_per_chunk`` and ``fit`` is called on that chunk, after
    which the model is written to ``model_save_path``.

    Each chunk CSV is read positionally: column 0 holds SMILES, the last
    ``n_targets`` columns are the regression targets, and every column in
    between is used as an extra feature next to the ECFP bits.

    Parameters
    ----------
    train_files : sequence of str
        Paths of the chunk CSV files, processed in the given order.
    model_save_path : str, default "rf_model.pkl"
        Checkpoint location. If the file already exists it is loaded and
        training continues from it. All of ``train_files`` are still
        processed from the first chunk; there is no per-chunk resume.
    n_estimators_per_chunk : int, default 50
        Number of trees added for each chunk.
    max_depth : int, default 10
        Tree depth limit. Only used when a new model is created; a loaded
        checkpoint keeps its own setting.
    ecfp_bits : int, default 2048
        Fingerprint length passed to ``smiles_to_ecfp``.
    n_targets : int, default 6143
        Number of trailing target columns in each chunk.

    Returns
    -------
    str
        ``model_save_path``.

    Raises
    ------
    ValueError
        If ``n_targets`` is not positive, or a chunk has too few columns to
        hold a SMILES column plus ``n_targets`` targets.
    """
    if n_targets < 1:
        raise ValueError(f"n_targets must be a positive integer, got {n_targets}")

    # If a checkpoint exists, continue from it.
    if os.path.exists(model_save_path):
        print("Loading existing model...")
        # NOTE: joblib.load unpickles the file; only load trusted checkpoints.
        rf = joblib.load(model_save_path)
    else:
        print("Initializing new model...")
        # n_estimators starts at 0 and is raised before the first fit() call.
        rf = RandomForestRegressor(n_estimators=0, max_depth=max_depth, warm_start=True, n_jobs=-1, random_state=42)

    for i, f in enumerate(train_files):
        print(f"\nTraining on chunk {i+1}/{len(train_files)}...")
        chunk = pd.read_csv(f, encoding="utf-8-sig")

        n_cols = chunk.shape[1]
        if n_cols <= n_targets:
            # Without this check the target slice would swallow the SMILES column.
            raise ValueError(
                f"{f}: expected more than {n_targets} columns "
                f"(SMILES + features + targets), found {n_cols}"
            )
        target_start = n_cols - n_targets

        # Build X: ECFP bits from the SMILES column plus any extra feature columns.
        smiles_list = chunk.iloc[:, 0].tolist()
        ecfp_features = smiles_to_ecfp(smiles_list, n_bits=ecfp_bits)
        other_features = chunk.iloc[:, 1:target_start].values
        X_chunk = np.hstack((ecfp_features, other_features))

        y_chunk = chunk.iloc[:, target_start:].values

        X_chunk, y_chunk = shuffle(X_chunk, y_chunk, random_state=42)

        # Incrementally increase n_estimators; with warm_start the existing
        # trees are kept and only the added ones are fitted on this chunk.
        rf.n_estimators += n_estimators_per_chunk
        rf.fit(X_chunk, y_chunk)

        # Checkpoint after every chunk.
        joblib.dump(rf, model_save_path)
        print(f"Saved model after chunk {i+1}.")

        # Release the chunk's arrays before loading the next one.
        del X_chunk, y_chunk, chunk, smiles_list, other_features, ecfp_features
        gc.collect()

    print(f"\nFinal model saved at {model_save_path}")
    return model_save_path


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(model, test_file, ecfp_bits=2048, n_targets=6143):
    """Evaluate a fitted model on a test CSV and return its predictions.

    The test CSV is read positionally: column 0 holds SMILES, the last
    ``n_targets`` columns are the true targets, and every column in between
    is used as an extra feature next to the ECFP bits.

    Prints the per-task MSE averaged over tasks and the uniformly averaged
    R² score.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``predict(X)`` that returns one column per target.
    test_file : str
        Path of the test CSV.
    ecfp_bits : int, default 2048
        Fingerprint length passed to ``smiles_to_ecfp``.
    n_targets : int, default 6143
        Number of trailing target columns in the CSV.

    Returns
    -------
    pandas.DataFrame
        A ``"smiles"`` column followed by one prediction column per target,
        named ``"0"``, ``"1"``, ... in target order.

    Raises
    ------
    ValueError
        If ``n_targets`` is not positive, or the CSV has too few columns to
        hold a SMILES column plus ``n_targets`` targets.
    """
    if n_targets < 1:
        raise ValueError(f"n_targets must be a positive integer, got {n_targets}")

    test = pd.read_csv(test_file)

    n_cols = test.shape[1]
    if n_cols <= n_targets:
        # Without this check the target slice would swallow the SMILES column.
        raise ValueError(
            f"{test_file}: expected more than {n_targets} columns "
            f"(SMILES + features + targets), found {n_cols}"
        )
    target_start = n_cols - n_targets

    test_smiles = test.iloc[:, 0].tolist()
    test_ecfp = smiles_to_ecfp(test_smiles, n_bits=ecfp_bits)
    test_other_features = test.iloc[:, 1:target_start].values

    X_test = np.hstack((test_ecfp, test_other_features))
    y_test = test.iloc[:, target_start:].values

    y_test_pred = model.predict(X_test)

    # Calculating MSE (one value per task) and R² (averaged over tasks)
    mse = mean_squared_error(y_test, y_test_pred, multioutput='raw_values')
    r2 = r2_score(y_test, y_test_pred, multioutput='uniform_average')

    print(f"Test MSE (mean across tasks): {np.mean(mse):.4f}")
    print(f"Test R² Score: {r2:.4f}")

    predictions_df = pd.DataFrame(y_test_pred, columns=[f"{i}" for i in range(y_test_pred.shape[1])])
    predictions_df.insert(0, "smiles", test_smiles)
    return predictions_df


In [None]:
# Training chunks 1-8, listed in the order they are fed to the online trainer.
train_files = [
    f"../uncoverted_dataset_csv/train_chunks_csv/train_chunk{chunk_no}.csv"
    for chunk_no in range(1, 9)
]

# Hyper-parameter grid kept for reference; it is commented out and unused here.
# param_grid = {
#     "n_estimators": [50, 100, 150, 200],
#     "max_depth": [5, 10],
#     "min_samples_split": [2, 5]
# }

# Validation split location.
val_file = "/home/jovyan/proj-liujing/Data_preparation/val_df.csv"




In [6]:
# Train on every chunk in train_files, in order. The model is checkpointed to
# rf_model_online.pkl after each chunk and the checkpoint path is returned.
model_path = train_rf_online(train_files, model_save_path="rf_model_online.pkl")


Initializing new model...

Training on chunk 1/8...
Saved model after chunk 1.

Training on chunk 2/8...
Saved model after chunk 2.

Training on chunk 3/8...
Saved model after chunk 3.

Training on chunk 4/8...
Saved model after chunk 4.

Training on chunk 5/8...
Saved model after chunk 5.

Training on chunk 6/8...
Saved model after chunk 6.

Training on chunk 7/8...
Saved model after chunk 7.

Training on chunk 8/8...
Saved model after chunk 8.

Final model saved at rf_model_online.pkl


In [9]:
# Reload the trained forest from its checkpoint file.
# NOTE: joblib.load unpickles the file, so only load checkpoints you trust.
rf_model = joblib.load("rf_model_online.pkl")

In [10]:
# The two test CSVs, resolved relative to the notebook's working directory.
test1_file, test2_file = "test1.csv", "test2.csv"

In [11]:
# Score the model on the first test file; evaluate_model prints the MSE and R²
# and returns the per-molecule predictions.
predictions_df1 = evaluate_model(rf_model, test1_file)

Test MSE (mean across tasks): 0.0113
Test R² Score: 0.0200


In [12]:
# Preview the last five prediction rows for the first test file.
predictions_df1.tail()

Unnamed: 0,smiles,0,1,2,3,4,5,6,7,8,...,6133,6134,6135,6136,6137,6138,6139,6140,6141,6142
38711,CCc1cccn1S(=O)(=O)c1c(C)cc(C)cc1C,0.001966,0.000558,0.009789,0.03414,0.018141,0.005518,0.007094,0.000161,0.01614,...,4e-05,3e-06,1.3e-05,5.7e-05,3.8e-05,0.0,3.4e-05,1.8e-05,1.2e-05,1.1e-05
38712,CC(C)C[C@H](NC(=O)O[C@H]1CC[C@H](C(C)C)CC1)C(=...,0.001719,0.000462,0.009481,0.035622,0.017535,0.004843,0.007829,0.000143,0.014736,...,3.3e-05,5e-06,9e-06,5.4e-05,3.6e-05,0.0,3.4e-05,9e-06,1.1e-05,1.1e-05
38713,Cn1cnnc1CC1(c2cc(C3CC3)nc(N3Cc4c(cc(CNC5(C)CCC...,0.001911,0.000556,0.009396,0.033361,0.017251,0.004895,0.006177,0.000148,0.017894,...,3.4e-05,5e-06,6e-06,5.6e-05,4.1e-05,0.0,3.2e-05,1.4e-05,1.4e-05,1.1e-05
38714,O=C1Nc2ccc(Cl)cc2/C1=C/c1cccc(OCc2cn(Cc3ccccc3...,0.001984,0.000596,0.010031,0.03533,0.018694,0.005375,0.006834,0.000165,0.016535,...,3.2e-05,5e-06,1.4e-05,6.7e-05,4.3e-05,0.0,3.2e-05,1.9e-05,1.8e-05,1.1e-05
38715,CC(=O)N1CCC(=C2c3ccccc3CCc3ccccc32)CC1,0.001966,0.000558,0.009789,0.03414,0.018141,0.005518,0.007094,0.000161,0.01614,...,4e-05,3e-06,1.3e-05,5.7e-05,3.8e-05,0.0,3.4e-05,1.8e-05,1.2e-05,1.1e-05


In [13]:
# Save the first test file's predictions to CSV, without the row index.
predictions_df1.to_csv("rf_predictions1_ol.csv", index=False)

In [14]:
# Score the model on the second test file; evaluate_model prints the MSE and R²
# and returns the per-molecule predictions.
predictions_df2 = evaluate_model(rf_model, test2_file)

Test MSE (mean across tasks): 0.0112
Test R² Score: 0.0204


In [15]:
# Preview the last five prediction rows for the second test file.
predictions_df2.tail()

Unnamed: 0,smiles,0,1,2,3,4,5,6,7,8,...,6133,6134,6135,6136,6137,6138,6139,6140,6141,6142
38711,CCCCCNC(=O)NC1CCCCC1,0.001966,0.000558,0.009789,0.03414,0.018141,0.005518,0.007094,0.000161,0.01614,...,4e-05,3.496797e-06,1.3e-05,5.7e-05,3.8e-05,0.0,3.363516e-05,1.8e-05,1.2e-05,1.055654e-05
38712,Cc1c(C)n(Cc2ccc(-c3ccccc3C(=O)O)cc2)c2ccc(C(=O...,0.001682,0.001036,0.009474,0.033778,0.018786,0.004529,0.005978,0.000125,0.01339,...,2.4e-05,1.555382e-06,1.3e-05,5.7e-05,2.4e-05,0.0,3.318259e-05,1.7e-05,1.2e-05,1.055654e-05
38713,O=c1[nH]cc(Nc2ccccc2)cc1-c1ccccc1Br,0.000693,5.1e-05,0.017622,0.029669,0.017079,0.000712,0.0011,1.2e-05,0.001902,...,1.1e-05,2.901817e-07,1e-06,6e-06,0.000221,0.0,4.400543e-06,1e-06,0.0,2.922902e-06
38714,Cc1ccc(C(=O)Nc2cc(N(C)CCN(C)C)cc(C(F)(F)F)c2)c...,9.6e-05,1.9e-05,0.000746,0.003128,0.001247,0.00022,0.000283,6e-06,0.000835,...,0.0,0.0,0.0,3e-06,4e-06,0.0,1.997062e-07,0.0,4e-06,2.341383e-07
38715,CC(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)[C@H](O)CN...,0.00174,0.000451,0.00886,0.033496,0.016833,0.004592,0.007711,0.000132,0.015956,...,2.5e-05,2.868005e-06,8e-06,5e-05,3.2e-05,0.0,3.277523e-05,8e-06,1.1e-05,9.825253e-06


In [16]:
# Save the second test file's predictions to CSV, without the row index.
predictions_df2.to_csv("rf_predictions2.csv", index=False)

In [17]:
# Stack both prediction frames row-wise (test1 rows first) under a fresh 0..n-1 index.
predictions_df = pd.concat(
    [predictions_df1, predictions_df2],
    ignore_index=True,
)

In [18]:
# Preview the last five rows of the combined predictions.
predictions_df.tail()

Unnamed: 0,smiles,0,1,2,3,4,5,6,7,8,...,6133,6134,6135,6136,6137,6138,6139,6140,6141,6142
77427,CCCCCNC(=O)NC1CCCCC1,0.001966,0.000558,0.009789,0.03414,0.018141,0.005518,0.007094,0.000161,0.01614,...,4e-05,3.496797e-06,1.3e-05,5.7e-05,3.8e-05,0.0,3.363516e-05,1.8e-05,1.2e-05,1.055654e-05
77428,Cc1c(C)n(Cc2ccc(-c3ccccc3C(=O)O)cc2)c2ccc(C(=O...,0.001682,0.001036,0.009474,0.033778,0.018786,0.004529,0.005978,0.000125,0.01339,...,2.4e-05,1.555382e-06,1.3e-05,5.7e-05,2.4e-05,0.0,3.318259e-05,1.7e-05,1.2e-05,1.055654e-05
77429,O=c1[nH]cc(Nc2ccccc2)cc1-c1ccccc1Br,0.000693,5.1e-05,0.017622,0.029669,0.017079,0.000712,0.0011,1.2e-05,0.001902,...,1.1e-05,2.901817e-07,1e-06,6e-06,0.000221,0.0,4.400543e-06,1e-06,0.0,2.922902e-06
77430,Cc1ccc(C(=O)Nc2cc(N(C)CCN(C)C)cc(C(F)(F)F)c2)c...,9.6e-05,1.9e-05,0.000746,0.003128,0.001247,0.00022,0.000283,6e-06,0.000835,...,0.0,0.0,0.0,3e-06,4e-06,0.0,1.997062e-07,0.0,4e-06,2.341383e-07
77431,CC(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)[C@H](O)CN...,0.00174,0.000451,0.00886,0.033496,0.016833,0.004592,0.007711,0.000132,0.015956,...,2.5e-05,2.868005e-06,8e-06,5e-05,3.2e-05,0.0,3.277523e-05,8e-06,1.1e-05,9.825253e-06


In [19]:
# Save the combined predictions to CSV, without the row index.
predictions_df.to_csv("rf_predictions_online.csv", index=False)