In [None]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.metrics import mean_squared_error, r2_score

from data_module import DataProcessor
from equations import JouybanAcreeModel
from groups import ja_groups
from mainv3 import SystemDesign


# Notebook-wide matplotlib typography: every figure below inherits these sizes
for rc_key, rc_size in (
    ('font.size', 12),          # Default font size
    ('axes.labelsize', 14),     # Axis labels
    ('axes.titlesize', 16),     # Subplot titles
    ('xtick.labelsize', 12),    # X-axis tick labels
    ('ytick.labelsize', 12),    # Y-axis tick labels
    ('legend.fontsize', 12),    # Legend text
    ('figure.titlesize', 18),   # Figure title
):
    plt.rcParams[rc_key] = rc_size

In [None]:
# Shared model instance: calculate_metrics() below calls ja_model.predict() on it
ja_model = JouybanAcreeModel()

In [None]:
def calculate_metrics(weight_fractions: pd.Series, solubility_g_g: pd.Series, solvent_1_pure, solvent_2_pure, temperature, J0, J1, J2):
    """Score the model's predicted solubilities against measured ones.

    The module-level ``ja_model`` predicts one solubility per entry of
    ``weight_fractions``; all three metrics then compare those predictions
    with ``solubility_g_g``.

    Args:
        weight_fractions: Solvent-1 weight fractions at which to predict.
        solubility_g_g: Measured solubilities, aligned element-by-element
            with ``weight_fractions``.
        solvent_1_pure: Forwarded unchanged to ``ja_model.predict``.
        solvent_2_pure: Forwarded unchanged to ``ja_model.predict``.
        temperature: Forwarded unchanged to ``ja_model.predict``.
        J0, J1, J2: Model coefficients forwarded to ``ja_model.predict``.

    Returns:
        tuple: ``(rmse, r2, mape)``; ``mape`` is expressed in percent.
    """
    predicted_solubility = ja_model.predict(
        weight_fractions,
        solvent_1_pure,
        solvent_2_pure,
        temperature,
        J0, J1, J2
    )

    # RMSE must compare prediction with the measured solubility (the target),
    # not with the weight fractions that were fed into the model.
    rmse = np.sqrt(mean_squared_error(solubility_g_g, predicted_solubility))
    r2 = r2_score(solubility_g_g, predicted_solubility)
    # Relative error: for Series/array inputs a zero in solubility_g_g makes
    # this metric non-finite.
    mape = np.mean(np.abs((solubility_g_g - predicted_solubility) / solubility_g_g)) * 100

    return rmse, r2, mape
            

In [None]:
def paired_t_test(results_df, other_model_results_df, alpha=0.025):
    """Run a one-sided paired t-test on per-group log-MAPE of two models.

    The two result tables are inner-joined on ``group_index`` and the test
    checks whether model 1 (``results_df``) has lower ``logmape`` than
    model 2 (``other_model_results_df``). A summary is printed.

    Args:
        results_df: Per-group results of model 1. Needs ``group_index`` and
            ``mape``; ``logmape`` is derived as ``np.log(mape)`` when absent.
        other_model_results_df: Per-group results of model 2, same layout.
        alpha: Significance threshold used in the printed verdict.

    Returns:
        tuple: ``(t_statistic, p_value, mape_difference)`` where
        ``mape_difference`` is the per-group ``mape`` of model 1 minus
        that of model 2.

    Raises:
        ValueError: If the two tables share no ``group_index`` value.
        KeyError: If ``mape`` is missing from either table.
    """
    def _with_logmape(frame):
        # Tables produced by Comparison() carry 'mape' but no 'logmape';
        # derive it the same way the rest of this notebook does (np.log of mape).
        if 'logmape' not in frame.columns and 'mape' in frame.columns:
            return frame.assign(logmape=np.log(frame['mape']))
        return frame

    # Merge the two dataframes on the group index
    merged_df = pd.merge(_with_logmape(results_df), _with_logmape(other_model_results_df),
                         on='group_index', suffixes=('_model1', '_model2'), how='inner')

    total_cases = len(merged_df)
    if total_cases == 0:
        # Without pairs the test is undefined and the percentage below would divide by zero
        raise ValueError("No common group_index values between the two result sets; cannot run a paired t-test.")

    logmape_1 = merged_df['logmape_model1']
    logmape_2 = merged_df['logmape_model2']

    # alternative='less': H1 is "model 1 has the lower logmape"
    t_statistic, p_value = stats.ttest_rel(logmape_1, logmape_2, alternative='less')

    print("\nPaired t-test results:")
    print(f"t-statistic: {t_statistic:.4f}")
    print(f"p-value: {p_value:.4f}")

    if p_value < alpha:
        print(f"There is a statistically significant difference with model 1 having lower logmape values (p < {alpha}).")
    else:
        print(f"There is no statistically significant evidence that model 1 has lower logmape values (p >= {alpha}).")

    diff_mean = logmape_1.mean() - logmape_2.mean()
    num_better = int((logmape_1 < logmape_2).sum())
    percentage_better = (num_better / total_cases) * 100

    print("\nAdditional statistics:")
    print(f"Mean difference in logmape: {diff_mean:.4f}")
    print(f"Cases where model 1 performs better: {num_better} out of {total_cases} ({percentage_better:.1f}%)")

    return t_statistic, p_value, merged_df['mape_model1'] - merged_df['mape_model2']

In [None]:
def Comparison(system_load: SystemDesign):
    """Build a per-row table of fit metrics for a loaded system.

    The system's predictions are joined (by index) onto its raw data; for
    each resulting row the matching entry of ``ja_groups`` is scored with
    ``calculate_metrics`` using that row's ``J0_pred``/``J1_pred``/``J2_pred``.

    Args:
        system_load: Loaded ``SystemDesign`` providing the data split, the
            prediction method and ``dataprocess.raw_data``.

    Returns:
        pd.DataFrame: One row per merged row, with columns ``group_index``,
        ``rmse``, ``r2`` and ``mape``.
    """
    features, _targets = system_load.get_data_split_df()
    predictions = system_load.predict_model(features)

    # Columns of `predictions` that clash with raw_data get the '_pred' suffix
    merged = system_load.dataprocess.raw_data.merge(
        predictions, left_index=True, right_index=True, suffixes=('', '_pred')
    )

    records = []
    for _, row in merged.iterrows():
        group_id = int(row['group_index'])
        measurements = ja_groups[group_id]

        rmse, r2, mape = calculate_metrics(
            measurements['solvent_1_weight_fraction'],
            measurements['solubility_g_g'],
            row['solvent_1_pure'],
            row['solvent_2_pure'],
            row['temperature'],
            row['J0_pred'],
            row['J1_pred'],
            row['J2_pred'],
        )
        records.append({"group_index": group_id, "rmse": rmse, "r2": r2, "mape": mape})

    return pd.DataFrame(records)

In [None]:
models_dir = '../../output/models'
# Sorted so models are loaded and displayed in a reproducible order
# (glob.glob returns files in arbitrary, OS-dependent order).
model_files = sorted(glob.glob(f'{models_dir}/*.pkl'))

# model name -> per-group comparison table, MAE of each J coefficient, loaded system
results = {}
for model_file in model_files:
    # NOTE(review): .pkl files are presumably unpickled here — only load files you trust
    system: SystemDesign = SystemDesign.load(model_file)

    # Model name = file name without directory or extension. os.path understands
    # the platform's separator(s); splitting on '\\' only worked for Windows paths
    # and left the whole path as the name everywhere else.
    model_name = os.path.splitext(os.path.basename(model_file))[0]

    # Get comparison results and coefficient errors for this model
    model_comparison_df = Comparison(system)
    mae_J0, mae_J1, mae_J2 = system.get_predictions_and_metrics()

    # Store results in dictionary with model name as key
    results[model_name] = {
        'comparison': model_comparison_df,
        'mae_J0': mae_J0,
        'mae_J1': mae_J1,
        'mae_J2': mae_J2,
        'system': system
    }

    display(model_comparison_df)
    

In [None]:
# One summary row per model: each metric averaged over that model's comparison table
summary_rows = [
    {
        'model_name': name,
        'logmape': float(np.log(entry['comparison']['mape']).mean()),
        'rmse': entry['comparison']['rmse'].mean(),
        'r2': entry['comparison']['r2'].mean(),
        'mape': entry['comparison']['mape'].mean(),
    }
    for name, entry in results.items()
]

# Rank models from lowest to highest mean log-MAPE
comparison_df = (
    pd.DataFrame(summary_rows)
    .sort_values(by='logmape', ascending=True)
    .reset_index(drop=True)
)

# Split model_name on '_' into at most five parts; the second part is discarded
name_parts = ['model', 'system', 'type', 'extra_points', 'features']
comparison_df[name_parts] = comparison_df['model_name'].str.split('_', expand=True, n=4)
comparison_df = comparison_df.drop(columns=['system'])
# Remove the "_features" text from the last part
comparison_df['features'] = comparison_df['features'].str.replace('_features', '')

display(comparison_df)


### Models with mean R² ≥ 0.95

In [None]:
# Keep only the models whose mean r2 is at least 0.95
meets_r2_threshold = comparison_df['r2'] >= 0.95
better_models = comparison_df[meets_r2_threshold]
better_models

In [None]:
def viewGraph(filename, n):
    """Plot fitted and predicted model curves for one row of a saved system.

    Loads the system from ``filename``, joins its predictions onto its raw
    data, and for the ``n``-th row (positional) draws:

    * the curve from the row's ``J0``/``J1``/``J2`` (legend: "Empirical"),
    * the curve from ``J0_pred``/``J1_pred``/``J2_pred`` (legend: "NN"),
    * the measured points of the row's ``ja_groups`` entry.

    Args:
        filename: Path passed to ``SystemDesign.load``.
        n: Positional row index into the merged results table.
    """
    system_load = SystemDesign.load(filename)

    features, _ = system_load.get_data_split_df()
    predictions = system_load.predict_model(features)

    results_df = system_load.dataprocess.raw_data.merge(
        predictions, left_index=True, right_index=True, suffixes=('', '_pred')
    )

    def value_at(column):
        # Value of `column` in the n-th (positional) row of the merged table
        return results_df[column].iloc[n]

    group = ja_groups[int(results_df.iloc[n]['group_index'])]

    model = JouybanAcreeModel()
    x_values = np.linspace(0, 1, 101)

    # Arguments shared by both curves, in the order predict() receives them
    conditions = (
        value_at('solvent_1_pure'),
        value_at('solvent_2_pure'),
        value_at('temperature'),
    )

    curve_empirical = model.predict(
        x_values, *conditions, value_at('J0'), value_at('J1'), value_at('J2')
    )
    curve_predicted = model.predict(
        x_values, *conditions, value_at('J0_pred'), value_at('J1_pred'), value_at('J2_pred')
    )

    # Draw both curves, then overlay the experimental points
    _, ax = plt.subplots(figsize=(16*1.3/3, 9*1.3/3))
    ax.plot(x_values, curve_empirical, label='Empirical', color='blue')
    ax.plot(x_values, curve_predicted, label='NN', color='red')
    ax.scatter(
        group['solvent_1_weight_fraction'],
        group['solubility_g_g'],
        color='gray',
        label='Experimental Data',
    )
    ax.set_xlabel('Solvent 1 Weight Fraction')
    ax.set_ylabel('Solubility (g/g)')
    ax.legend()
    ax.grid(True)
    plt.show()

In [None]:
# group 377 for artesunate

In [None]:
# Plot row 377 for the system stored in this .pkl file; viewGraph uses its
# second argument as a positional row index into the merged results table.
viewGraph("../../output/models/xgb_system_ss_0_10_features.pkl",377)