In [None]:
import glob
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.metrics import mean_squared_error, r2_score

from data_module import DataProcessor
from equations import JouybanAcreeModel
from groups import ja_groups
from mainv3 import SystemDesign


# Set up initial configurations for plots
# Global matplotlib typography, set once per parameter group.
plt.rc('font', size=12)                      # Default font size
plt.rc('axes', labelsize=14, titlesize=16)   # Axis labels / subplot titles
plt.rc('xtick', labelsize=12)                # X-axis tick labels
plt.rc('ytick', labelsize=12)                # Y-axis tick labels
plt.rc('legend', fontsize=12)                # Legend text
plt.rc('figure', titlesize=18)               # Figure title

In [None]:
# Module-level Jouyban-Acree model instance; calculate_metrics (defined in a
# later cell) calls its predict() method.
ja_model = JouybanAcreeModel()

In [None]:
def calculate_metrics(weight_fractions: pd.Series, solubility_g_g: pd.Series, solvent_1_pure, solvent_2_pure, temperature, J0, J1, J2):
    """Score a Jouyban-Acree prediction against measured solubilities.

    The module-level ``ja_model`` predicts solubility at each entry of
    ``weight_fractions``; the prediction is then compared with
    ``solubility_g_g``.

    Args:
        weight_fractions: Solvent weight fractions at which to predict.
        solubility_g_g: Measured solubilities, aligned with ``weight_fractions``.
        solvent_1_pure, solvent_2_pure, temperature: Forwarded unchanged to
            ``ja_model.predict``.
        J0, J1, J2: Model coefficients forwarded unchanged to
            ``ja_model.predict``.

    Returns:
        A 4-tuple ``(rmse, r2, mape, log_mape)`` where ``mape`` is expressed
        in percent and ``log_mape`` is ``np.log(mape)``.
    """
    predicted_solubility = ja_model.predict(
        weight_fractions,
        solvent_1_pure,
        solvent_2_pure,
        temperature,
        J0, J1, J2
    )

    # Calculate error metrics. All three compare measured vs. predicted
    # solubility; the RMSE previously used the weight fractions (the x-axis)
    # as the "true" values, which made it meaningless.
    rmse = np.sqrt(mean_squared_error(solubility_g_g, predicted_solubility))
    r2 = r2_score(solubility_g_g, predicted_solubility)
    mape = np.mean(np.abs((solubility_g_g - predicted_solubility) / solubility_g_g)) * 100

    return rmse, r2, mape, np.log(mape)

In [None]:
def AppendResults(system_load: SystemDesign):
    """Attach model predictions and their fit metrics to a system's raw data.

    The system's raw data is joined (by index) with the output of
    ``predict_model``; predicted columns whose names collide with raw columns
    receive a ``_pred`` suffix. For every resulting row, the predicted
    ``J0_pred``/``J1_pred``/``J2_pred`` are scored with ``calculate_metrics``
    against the ``ja_groups`` entry selected by the row's ``group_index``.
    The metrics are then merged back on ``group_index``; metric columns whose
    names already exist in the frame also receive the ``_pred`` suffix.

    Args:
        system_load: A loaded ``SystemDesign``.

    Returns:
        The merged DataFrame with one metrics record joined per matching
        ``group_index``.
    """
    features, _targets = system_load.get_data_split_df()
    predictions = system_load.predict_model(features)

    merged = system_load.dataprocess.raw_data.merge(
        predictions,
        left_index=True,
        right_index=True,
        suffixes=('', '_pred'),
    )

    metric_rows = []
    for _, record in merged.iterrows():
        group_id = int(record['group_index'])
        measurements = ja_groups[group_id]

        rmse, r2, mape, logmape = calculate_metrics(
            measurements['solvent_1_weight_fraction'],
            measurements['solubility_g_g'],
            record['solvent_1_pure'],
            record['solvent_2_pure'],
            record['temperature'],
            record['J0_pred'],
            record['J1_pred'],
            record['J2_pred'],
        )

        metric_rows.append({
            "group_index": group_id,
            "rmse": rmse,
            "r2": r2,
            "mape": mape,
            "logmape": logmape,
        })

    # NOTE(review): this is a key-based merge, so repeated group_index values
    # would multiply rows — presumably group_index is unique per row; confirm.
    return merged.merge(
        pd.DataFrame(metric_rows),
        on='group_index',
        suffixes=('', '_pred'),
    )

In [None]:
# Directory (relative to this notebook) that holds the saved systems.
models_dir = '../../output/models'
# glob returns files in arbitrary order; sort so every run loads (and later
# lists) the models in the same sequence.
model_files = sorted(glob.glob(f'{models_dir}/*.pkl'))

# Maps model name (file name without directory or '.pkl') to its loaded
# system and the per-row results produced by AppendResults.
models = {}
for model_file in model_files:
    # NOTE(review): .pkl files are presumably pickles — only load files from
    # a trusted source.
    system: SystemDesign = SystemDesign.load(model_file)
    # Extract model name from file path. Path.stem works with whatever
    # separator the platform uses; splitting on '\\' only worked on Windows.
    model_name = Path(model_file).stem

    models[model_name] = {
        'system': system,
        'results': AppendResults(system)
    }

In [None]:
# Build one summary record per loaded model: how far the predicted-fit
# (log-)MAPE is from the reference (log-)MAPE stored alongside it.
summary_rows = []
for model_key, payload in models.items():
    frame = payload['results']

    log_gap = frame['logmape'] - frame['logmape_pred']
    abs_gap = (frame['mape'] - frame['mape_pred']).abs()
    row_count = len(frame['mape'])

    summary_rows.append({
        'model': model_key,
        'log_diff_mean': log_gap.abs().sum() / len(frame['logmape']),
        'log_diff_std': np.std(log_gap),
        'diff_mean': abs_gap.sum() / row_count,
        'n': row_count,
    })

models_df = pd.DataFrame(summary_rows)

# Split the model name into its components
name_parts = models_df['model'].str.extract(r'(\w+)_system(?:_([a-z]+))?_(\d+)_(\d+)_features')
models_df[['model_type', 'combination', 'extra', 'features']] = name_parts

# The combination tag is optional in the name; represent "absent" as ''
models_df['combination'] = models_df['combination'].fillna('')

# Optional filters, currently disabled:
# models_df = models_df[models_df['features'] != 500]
# models_df = models_df[models_df['extra'] != 3]

# Numeric name components become ints, then rank models best-first
models_df = (
    models_df
    .astype({'extra': int, 'features': int})
    .sort_values('log_diff_mean')
)

# Display the dataframe with the new columns
models_df

In [None]:
import seaborn as sns

In [None]:
import matplotlib.colors as mcolors

# Colour-blind-friendly palette shared by the plots in the following cells.
palette = sns.color_palette("colorblind", 10)

# Show the hex codes of the first three palette entries, one per line.
print(*(mcolors.to_hex(swatch) for swatch in palette[:3]), sep='\n')

In [None]:
import matplotlib.gridspec as gridspec

# Descriptor count used to select the NN and VAE models shown in the bottom row.
SINGLE_PANEL_FEATURE_COUNT = 10


def _style_delta_mape_axis(ax, title):
    """Apply the axis labels, dashed y-grid and title shared by every panel."""
    ax.set_xlabel('Number of Experimental Data Points')
    ax.set_ylabel('Δ MAPE (%)')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    ax.set_title(title)


# Layout: one wide panel on top (XGB), two half-width panels below (NN, VAE).
fig = plt.figure(figsize=(16*1.3/1.5, 9*1.3/1.5))
gs = gridspec.GridSpec(2, 2, height_ratios=[1, 1], width_ratios=[1, 1])
ax1 = fig.add_subplot(gs[0, :])
ax2 = fig.add_subplot(gs[1, 0])
ax3 = fig.add_subplot(gs[1, 1])

# Plot 1: XGB model comparison (top), one bar colour per descriptor count
sns.barplot(
    data=models_df[models_df['model_type'] == 'xgb'],
    x="extra",
    y="diff_mean",
    hue="features",
    palette='colorblind',
    errorbar=None,
    ax=ax1,
)
_style_delta_mape_axis(ax1, 'XGB Model Performance')
ax1.legend(title="Molecular Descriptor Count")

# Plots 2 and 3: NN (bottom left) and VAE (bottom right), restricted to a
# single descriptor count so one colour suffices.
for panel_ax, panel_model_type, panel_title in (
    (ax2, 'nn', 'Neural Network Model Performance'),
    (ax3, 'vae', 'Variational Autoencoder Model Performance'),
):
    sns.barplot(
        data=models_df[
            (models_df['model_type'] == panel_model_type)
            & (models_df['features'] == SINGLE_PANEL_FEATURE_COUNT)
        ],
        x="extra",
        y="diff_mean",
        color=palette[0],
        errorbar=None,
        ax=panel_ax,
    )
    _style_delta_mape_axis(panel_ax, panel_title)

plt.tight_layout()
plt.show()

In [None]:
def viewGraph(systems : list, n, empirical_label='Empirical Data', x_label='Weight Fraction Solvent 1'):
    """Plot each system's predicted Jouyban-Acree curve over the measured data.

    For every entry in ``systems`` the saved system is loaded, its predictions
    are joined to its raw data by index, and the curve implied by the
    predicted ``J0``/``J1``/``J2`` of row ``n`` is drawn over 101 evenly
    spaced weight fractions in [0, 1]. The measured points of the matching
    ``ja_groups`` entry are drawn as a scatter on the same axes.

    Args:
        systems: Non-empty list of dicts with keys ``'filename'`` (path to the
            saved system without the ``.pkl`` extension), ``'label'``,
            ``'color'`` and ``'dash'`` (line style).
        n: Positional (``iloc``) row index into each system's merged results.
        empirical_label: Legend label for the measured data points.
        x_label: Label for the x-axis.

    Raises:
        ValueError: If ``systems`` is empty. Previously this surfaced as an
            unbound-variable error after an empty figure had been created.
    """
    if not systems:
        raise ValueError('viewGraph needs at least one system to plot')

    ja_model = JouybanAcreeModel()
    x_values = np.linspace(0, 1, 101)

    fig, ax = plt.subplots(figsize=(16*1.3/3, 9*1.3/3))

    results_df = None
    for system in systems:
        system_load = SystemDesign.load(system['filename'] + '.pkl')

        x, y = system_load.get_data_split_df()
        y_pred = system_load.predict_model(x)

        results_df = system_load.dataprocess.raw_data.merge(
            y_pred,
            left_index=True,
            right_index=True,
            suffixes=('','_pred')
        )

        JA_fit_pred = ja_model.predict(
            x_values,
            results_df['solvent_1_pure'].iloc[n],
            results_df['solvent_2_pure'].iloc[n],
            results_df['temperature'].iloc[n],
            results_df['J0_pred'].iloc[n],
            results_df['J1_pred'].iloc[n],
            results_df['J2_pred'].iloc[n],
        )

        ax.plot(x_values, JA_fit_pred, label=system['label'], color=system['color'], linestyle=system['dash'])

    # The measured data is looked up through the LAST system's row n.
    # NOTE(review): assumes row n refers to the same experimental group in
    # every system — confirm the saved systems share row order.
    group_index = int(results_df.iloc[n]['group_index'])
    group = ja_groups[group_index]

    ax.scatter(group['solvent_1_weight_fraction'], group['solubility_g_g'], color=palette[7], label=empirical_label)

    ax.set_xlabel(x_label)
    ax.set_ylabel('Solubility (g/g)')
    ax.legend()
    ax.grid(True)
    plt.show()

In [None]:
# Keep only models whose parsed combination tag is exactly 's'.
# NOTE(review): the NN/VAE files plotted later carry an 'ss' tag, which this
# filter excludes — confirm 's' (not 'ss') is the intended selection.
ss_df = models_df.loc[models_df['combination'].eq('s')]

In [None]:
# Three XGB models with the smallest mean MAPE difference
ss_df.loc[ss_df['model_type'].eq('xgb')].sort_values('diff_mean').head(3)

In [None]:
# Three VAE models with the smallest mean MAPE difference
ss_df.loc[ss_df['model_type'].eq('vae')].sort_values('diff_mean').head(3)

In [None]:
# Three NN models with the smallest mean MAPE difference
ss_df.loc[ss_df['model_type'].eq('nn')].sort_values('diff_mean').head(3)

In [None]:
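# Row positions (the `n` argument of viewGraph) for systems of interest.
# NOTE: the commented calls below pass a single model name; viewGraph now
# takes a list of system dicts, so update them before re-enabling.
# 387 for the artesunate system
# viewGraph('xgb_system_1_3_10_features', 387)
# 51 for the sofosbuvir system
# viewGraph('xgb_system_1_3_10_features', 51)
# 2 for the iminodibenzyl system
# viewGraph('xgb_system_1_3_10_features', 2)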

In [None]:
# One entry per model to overlay in viewGraph: legend label, saved-system
# path (without '.pkl'), line colour and line style.
# NOTE(review): the XGB file uses the 's' tag while NN/VAE use 'ss' — confirm
# this mix is intentional.
systems = [
    dict(zip(('label', 'filename', 'color', 'dash'), spec))
    for spec in (
        ('NN JA Model', '../../output/models/nn_system_ss_0_10_features', palette[1], '--'),
        ('VAE JA Model', '../../output/models/vae_system_ss_0_10_features', palette[2], '-'),
        ('XGB JA Model', '../../output/models/xgb_system_s_0_10_features', palette[3], '-.'),
    )
]

In [None]:
# (row position, legend label for measured data, x-axis label) per figure
for row_position, data_label, axis_label in (
    (51, 'Sofosbuvir Exp. Data', 'Acetone — Water Weight Fraction'),
    (387, 'Artesunate Exp. Data', 'Propane-1,2-diol — Water Weight Fraction'),
):
    viewGraph(systems, row_position, empirical_label=data_label, x_label=axis_label)