In [None]:
# --- Import Libraries ---
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# --- RDKit for molecular fingerprint extraction ---
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_fp(smiles, n_bits=256, radius=2):
    """Convert a SMILES string into a Morgan fingerprint bit vector.

    Parameters
    ----------
    smiles : str or missing value
        SMILES string to encode. Missing values (None/NaN), non-string
        entries and blank strings are treated as "no molecule".
    n_bits : int, default 256
        Length of the returned bit vector.
    radius : int, default 2
        Radius passed to the Morgan fingerprint routine.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``n_bits``. All zeros when the input
        is missing, not a string, blank, or cannot be parsed by RDKit.
    """
    # Only real, non-blank strings are handed to RDKit; anything else
    # (NaN, None, numbers that slipped into an object column) would either
    # raise inside RDKit or carry no structure, so it maps to the zero vector.
    if not isinstance(smiles, str) or not smiles.strip():
        return np.zeros(n_bits, dtype=int)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None for SMILES it cannot parse.
        return np.zeros(n_bits, dtype=int)

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    # Cast so both branches return the same integer dtype.
    return np.asarray(fp).astype(int, copy=False)

# Quick sanity check: fingerprint a simple SMILES ('CCO') and show the vector shape.
ethanol_fp = smiles_to_fp('CCO')
print(ethanol_fp.shape)

In [None]:
# --- Load raw data ---
# The CSV is indexed by its 'item' column.
b2f_mlmodel_df = pd.read_csv('../ylab-ML-MODEL/data/batch2flow_updated.csv', index_col='item')
print("Columns:", b2f_mlmodel_df.columns)

# --- Identify SMILES columns: names ending with _B or matching known chemical column names ---
possible_smiles_cols = [col for col in b2f_mlmodel_df.columns if (
    col.endswith('_B') or col in ['reagent1', 'reagent2', 'reagent3', 'solvent', 'additive1', 'catalyst']
)]


def _holds_text(series):
    """True when the column stores strings (object dtype or a pandas string dtype)."""
    # Comparing `dtype == 'object'` alone misses pandas' dedicated string
    # dtypes, which would silently leave the SMILES column list empty.
    return pd.api.types.is_object_dtype(series.dtype) or pd.api.types.is_string_dtype(series.dtype)


# Keep only the candidates that actually contain text rather than numbers.
smiles_cols = [col for col in possible_smiles_cols if _holds_text(b2f_mlmodel_df[col])]
print("SMILES columns:", smiles_cols)

In [None]:
# --- Turn every SMILES column into a block of fingerprint-bit columns ---
n_bits = 256  # kept small to avoid feature explosion


def _fingerprint_frame(col):
    """Return a DataFrame with one `<col>_fp<i>` column per fingerprint bit."""
    # Missing entries are replaced by '' before being fingerprinted.
    bit_rows = b2f_mlmodel_df[col].fillna('').apply(smiles_to_fp, n_bits=n_bits)
    return pd.DataFrame(
        np.stack(bit_rows),
        index=b2f_mlmodel_df.index,
        columns=[f"{col}_fp{i}" for i in range(n_bits)],
    )


fp_feature_frames = [_fingerprint_frame(col) for col in smiles_cols]

# --- Swap the raw SMILES columns for their fingerprint columns ---
numeric_df = b2f_mlmodel_df.drop(columns=smiles_cols)
merged_df = pd.concat([numeric_df, *fp_feature_frames], axis=1)
merged_df.head()

In [None]:
# --- Clean missing columns, pivot to wide format for ML modeling ---
# Input: merged_df (raw numeric columns + fingerprint bits, one row per item).
# Output: df_full, one row per batch with `<var>_B` columns followed by `<var>_F` columns.

# Keep only numeric columns and F/B for pivot
numeric_cols = merged_df.select_dtypes(include=[np.number]).columns.tolist()
fb = merged_df[['F/B']]
numeric_merged_df = pd.concat([fb, merged_df[numeric_cols]], axis=1)

# Remove columns with too many missing values
# NOTE(review): missing_frac is a fraction between 0 and 1, so `> 1` never
# matches and cols_to_drop is always empty — confirm the intended threshold.
missing_frac = numeric_merged_df.isnull().mean()
cols_to_drop = missing_frac[missing_frac > 1].index.tolist()
df = numeric_merged_df.drop(columns=cols_to_drop).copy()

# Assign batch number, reshape to wide format (pivot)
# Index values are paired as (1, 2) -> batch 1, (3, 4) -> batch 2, ...
# NOTE(review): assumes an integer index where the two rows of a batch are
# consecutive and start at an odd value — TODO confirm against the CSV.
df['batch'] = ((df.index + 1) // 2).astype(int)
# Strips every '_B' / '_F' substring from the numeric column names; the
# resulting names must exist as columns of df for the pivot below to succeed.
flow_vars = [c.replace('_B', '').replace('_F', '') for c in numeric_cols if not c.startswith('exist_')]
df_wide = df.pivot(index='batch', columns='F/B', values=flow_vars)
# Flatten the (variable, F/B value) column MultiIndex into '<variable>_<F/B value>'.
df_wide.columns = [f"{var}_{fb}" for var, fb in df_wide.columns]
df_wide = df_wide.reset_index()

# Reorder columns for ML (all _B, then all _F)
f_cols = sorted([c for c in df_wide if c.endswith('_F')])
b_cols = sorted([c for c in df_wide if c.endswith('_B')])
df_reordered = df_wide[['batch'] + b_cols + f_cols].set_index('batch')
# Missing values become 0 here; later cells use "!= 0" as the existence test.
df_full = df_reordered.fillna(0)
df_full

In [None]:
# --- Add 'exist_' flag columns: 1 if value != 0, else 0 ---
all_features = b_cols + f_cols

# Compute every flag in one vectorised comparison and attach them with a
# single concat. Inserting thousands of columns one at a time fragments the
# DataFrame (pandas emits a PerformanceWarning) and is much slower.
exist_flags = (df_full[all_features] != 0).astype(int)
exist_flags.columns = [f"exist_{feat}" for feat in all_features]

# Drop any flags left over from a previous run of this cell so that
# re-running it does not create duplicate columns.
df_full = pd.concat(
    [df_full.drop(columns=exist_flags.columns, errors='ignore'), exist_flags],
    axis=1,
)

# --- Build interleaved columns for ML input: feature, exist_feature, feature, ... ---
new_cols = [name for feat in all_features for name in (feat, f"exist_{feat}")]
df_exist = df_full[new_cols].copy()
df_exist.head()

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# 1. Original target variables (the flow-side columns the model predicts)
target_vars = [
    'requiv2_F', 'requiv3_F', 'csolv_F', 'addequiv1_F', 'addequiv2_F',
    'catequiv1_F', 'catequiv2_F', 'pcequiv_F', 'temp_F', 'time_F'
]

# 2. Feature columns: batch-side values + batch-side exist flags + fingerprint bits
b_features = [col for col in df_full.columns if col.endswith('_B')]
exist_features = [col for col in df_exist.columns if col.startswith('exist_') and col.endswith('_B')]

# Every column whose name contains '_fp' is treated as a fingerprint feature.
# NOTE(review): this pattern may also match flow-side ('_F') fingerprint
# columns and their exist flags — confirm that using them as inputs is intended.
fp_features = [col for col in df_full.columns if '_fp' in col]
# dict.fromkeys removes duplicates while keeping first-seen order.
feature_cols = list(dict.fromkeys(b_features + exist_features + fp_features))

# 3. Target variables
f_features = target_vars

# 4. Build input/output arrays and split into train/test
X = df_exist[feature_cols].values
y = df_full[f_features].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5. Standardization (scalers are fitted on the training split only)
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
y_train_scaled = scaler_y.fit_transform(y_train)

# 6. Train MLP
mlp = MLPRegressor(
    hidden_layer_sizes=(128, 512, 256, 128),
    activation='relu',
    solver='adam',
    learning_rate_init=1e-3,
    max_iter=5000,
    early_stopping=False,
    tol=1e-6,
    random_state=42,
    verbose=True
)
mlp.fit(X_train_scaled, y_train_scaled)

# 7. Predict, undo the target scaling, and force negative predictions to 0
pred_scaled = mlp.predict(X_test_scaled)
y_pred = scaler_y.inverse_transform(pred_scaled)
y_pred = np.clip(y_pred, 0, None)

# 8. Assemble result DataFrame (only the original 10 targets)
# All three frames share the default RangeIndex, so rows line up on concat.
X_test_df = pd.DataFrame(X_test, columns=feature_cols)
y_test_df = pd.DataFrame(y_test, columns=f_features)
pred_df = pd.DataFrame(y_pred, columns=[f"predicted_{c}" for c in f_features])
combined_df = pd.concat([X_test_df, y_test_df, pred_df], axis=1)

# 9. Where the batch-side flag exist_<var>_B is 0, the prediction must be 0.
# The mask is taken from the input feature, not from the true target value:
# masking on the ground truth would leak test labels into the predictions.
for fcol in f_features:
    pred_col = f'predicted_{fcol}'
    base_name = fcol[:-len('_F')] if fcol.endswith('_F') else fcol
    flag_col = f'exist_{base_name}_B'
    if flag_col not in combined_df.columns:
        print(f"Skipping zero-mask for {fcol}: feature column {flag_col} not found")
        continue
    combined_df.loc[combined_df[flag_col] == 0, pred_col] = 0

# Predictions were already clipped at 0 in step 7 and masking only writes
# zeros, so no further clipping is needed here.
combined_df


In [None]:
# 11. Only plot the 10 original variables           
import matplotlib.pyplot as plt

# Grid layout: up to 5 panels per row, one panel per target variable.
n_plots = len(f_features)
n_cols = 5
n_rows = max(1, (n_plots + n_cols - 1) // n_cols)

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 5*n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, f_features):
    real = combined_df[col]
    predicted = combined_df[f"predicted_{col}"]
    # x: Real, y: Predicted
    ax.scatter(real, predicted, alpha=0.6, edgecolor='k')
    # Diagonal spanning the joint range of real and predicted values.
    min_val = min(real.min(), predicted.min())
    max_val = max(real.max(), predicted.max())
    ax.plot([min_val, max_val], [min_val, max_val], 'r--', label='Ideal: y = x')
    ax.set_xlabel('Real Value')
    ax.set_ylabel('Predicted Value')
    ax.set_title(col)
    ax.legend()
    ax.grid(True)

# Hide grid slots that have no target assigned to them.
for ax in flat_axes[n_plots:]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()
