In [None]:
from main import SystemDesign
import pickle


In [None]:
# Baseline experiment: the system is described by 'solvent_1' only and the
# targets are the three columns J0, J1 and J2 of the fit-results CSV.
baseline_design = {
    'system_columns': ['solvent_1'],
    'raw_data_path': 'curve_fit_results_x_is_7.csv',
    'extra_fitted_points': 3,
    'target_columns': ['J0', 'J1', 'J2'],
}
system = SystemDesign(**baseline_design)

# Training options, kept together so they can be read (and tweaked) at a glance.
baseline_training = {
    'feature_selection_method': 'random_forest',
    'n_features': 10,
    'keep_prefixes': ['solvent_1_pure', 'solvent_2_pure', 'system', 'solubility_', 'temperature'],
    'epochs': 1000,
    'batch_size': 32,
    'verbose': 1,
}
system.train_model(**baseline_training)

# Evaluation step; its return value (if any) is not used here.
system.evaluate_model()

# Keep predictions, ground truth and the error metric around for later cells.
predictions, actuals, mae = system.get_predictions_and_metrics()

In [None]:
from groups import ja_groups

In [None]:
# Fetch the feature frame (x) and target frame (y) from the trained system,
# then predict the targets for every row of x.
# NOTE(review): which split get_data_split_df() returns (train/test/all) is not
# visible from this notebook — confirm against SystemDesign.
x,y = system.get_data_split_df()
y_pred = system.predict_model(x)

In [None]:
from data_module import DataProcessor

In [None]:
# Load a second set of fit results (x_is_3) to compare against; the second
# value returned by CreateDataProcessor is not needed in this notebook.
# Fix: removed the stray quote after the file name that made this cell a SyntaxError.
otherDataProcessor,_ = DataProcessor.CreateDataProcessor("curve_fit_results_x_is_3.csv")

In [None]:
# Columns taken from the system's own raw data and from the reference fits.
observed_cols = ['group_index', 'temperature', 'solvent_1_pure', 'solvent_2_pure', 'J0', 'J1', 'J2']
reference_cols = ['group_index', 'J0', 'J1', 'J2']

# Stage 1: attach the model predictions row-by-row (index join); predicted
# columns that clash with the raw ones receive the '_pred' suffix.
observed_with_pred = system.dataprocess.raw_data[observed_cols].merge(
    y_pred,
    left_index=True,
    right_index=True,
    suffixes=('', '_pred'),
)

# Stage 2: attach the reference coefficients per group ('_JA5' suffix) and
# drop rows that became exact duplicates through the join.
results_df = observed_with_pred.merge(
    otherDataProcessor.raw_data[reference_cols],
    on='group_index',
    suffixes=('', '_JA5'),
).drop_duplicates()

In [None]:
# Display the combined table (raw values, '_pred' predictions, '_JA5' reference fits)
# as the cell output for a visual sanity check.
results_df

In [None]:
import matplotlib.pyplot as plt

# Global text sizes for every figure drawn below, configured one rc group at a time.
plt.rc('font', size=12)                      # default font size
plt.rc('axes', labelsize=14, titlesize=16)   # axis labels / subplot titles
plt.rc('xtick', labelsize=12)                # x-axis tick labels
plt.rc('ytick', labelsize=12)                # y-axis tick labels
plt.rc('legend', fontsize=12)                # legend text
plt.rc('figure', titlesize=18)               # figure title

In [None]:
from equations import JouybanAcreeModel
import numpy as np

In [None]:
# Row cursor into results_df. The plotting cell increments it before use, so
# starting at -1 makes the first execution of that cell show row 0.
n = -1

In [None]:
# Step the cursor: every execution of this cell shows the next row of results_df.
n += 1
group_index = int(results_df.iloc[n]['group_index'])
group = ja_groups[group_index]

ja_model = JouybanAcreeModel()
x_values = np.linspace(0, 1, 101)


def _at_cursor(column):
    """Return the value of `column` in the results_df row selected by `n`."""
    return results_df[column].iloc[n]


# Inputs shared by both curves: composition grid plus three values read from the selected row.
curve_inputs = (
    x_values,
    _at_cursor('solvent_1_pure'),
    _at_cursor('solvent_2_pure'),
    _at_cursor('temperature'),
)

# Curve from the reference ('_JA5') coefficients.
JA_fit_real = ja_model.predict(
    *curve_inputs,
    *[_at_cursor(name) for name in ('J0_JA5', 'J1_JA5', 'J2_JA5')],
)

# Curve from the coefficients predicted by the trained model.
JA_fit_NN = ja_model.predict(
    *curve_inputs,
    *[_at_cursor(name) for name in ('J0_pred', 'J1_pred', 'J2_pred')],
)

# Draw both curves and overlay the measured points of the selected group.
fig, ax = plt.subplots(figsize=(16*1.3/3, 9*1.3/3))
ax.plot(x_values, JA_fit_real, label='Empirical', color='blue')
ax.plot(x_values, JA_fit_NN, label='NN', color='red')
ax.scatter(
    group['solvent_1_weight_fraction'],
    group['solubility_g_g'],
    color='gray',
    label='Experimental Data',
)
ax.set_xlabel('Solvent 1 Weight Fraction')
ax.set_ylabel('Solubility (g/g)')
ax.legend()
ax.grid(True)
plt.show()




In [None]:
import pandas as pd
import scipy.stats as stats
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm

In [None]:
# Per-group MAPE of the solubility curve built from the model-predicted coefficients.
#
# Fix: the original loop read every parameter with `.iloc[n]`, i.e. from the row
# left selected by the plotting cell, so all groups were scored with the same
# coefficients. Each group is now scored with its own row of results_df.

# This cell replaces results_df with the metrics table at the end, so a second
# run would no longer find the prediction columns; fail with a clear message.
required_cols = {
    'group_index', 'solvent_1_pure', 'solvent_2_pure', 'temperature',
    'J0_pred', 'J1_pred', 'J2_pred',
}
if not required_cols.issubset(results_df.columns):
    raise RuntimeError(
        "results_df does not contain the merged prediction columns "
        "(this cell overwrites it); re-run the cell that builds results_df first."
    )

# One parameter row per group, in ascending group order.
# NOTE(review): if a group has several rows, the first one is used — confirm that
# all rows of a group carry the same predicted coefficients.
group_params = results_df.drop_duplicates(subset='group_index').sort_values('group_index')

results = []

for _, params in tqdm(group_params.iterrows(), total=len(group_params), desc="Processing groups"):
    # Same lookup as the plotting cell: group_index addresses ja_groups.
    gn = int(params['group_index'])
    chosen_df = ja_groups[gn]

    predicted_solubility = ja_model.predict(
        chosen_df['solvent_1_weight_fraction'],
        params['solvent_1_pure'],
        params['solvent_2_pure'],
        params['temperature'],
        params['J0_pred'],
        params['J1_pred'],
        params['J2_pred'],
    )

    # Mean absolute percentage error against the measured solubility of this group.
    mape = np.mean(np.abs((chosen_df['solubility_g_g'] - predicted_solubility) / chosen_df['solubility_g_g'])) * 100

    results.append({
        'group_index': gn,
        'mape': mape,
        # np.inf whenever mape is not strictly positive (this also covers NaN,
        # for which the comparison is False).
        'logmape': np.log10(mape) if mape > 0 else np.inf,
    })

# Later cells expect the metrics table under the name results_df.
results_df = pd.DataFrame(results)
    

In [None]:
# Summary statistics of the per-group metrics.
# NOTE(review): describe() summarizes columns independently of row order, so the
# preceding sort looks redundant — confirm before removing it.
results_df.sort_values(by='logmape', ascending=True).describe()


In [None]:
# Pair this notebook's per-group metrics with the reference data on group_index.
# Columns present on both sides are disambiguated with '_model1' (this notebook)
# and '_model2' (reference data).
# NOTE(review): paired_t_test reads 'logmape_model1/2' and 'mape_model1/2', which
# only exist if otherDataProcessor.raw_data also has 'logmape' and 'mape' — confirm.
merged_df = results_df[['group_index','logmape','mape']].merge(
    otherDataProcessor.raw_data,
    on='group_index',
    suffixes=('_model1', '_model2')
)

In [None]:
def paired_t_test(merged_df, verbose=True, alpha=0.025):
    """Run a one-sided paired t-test on the logmape values of two models.

    The test uses ``alternative='less'``, i.e. it checks whether the mean of
    ``logmape_model1`` is lower than the mean of ``logmape_model2``.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must contain the columns ``logmape_model1``, ``logmape_model2``,
        ``mape_model1`` and ``mape_model2``, one row per paired case.
        The frame is not modified.
    verbose : bool, default True
        When True, print the test result and extra statistics and draw a
        histogram of the logmape differences plus a model1-vs-model2 scatter.
    alpha : float, default 0.025
        Significance threshold the p-value is compared against in the
        printed verdict.

    Returns
    -------
    tuple
        ``(t_statistic, p_value, mape_difference)`` where ``mape_difference``
        is the Series ``mape_model1 - mape_model2``.
    """
    logmape_1 = merged_df['logmape_model1']
    logmape_2 = merged_df['logmape_model2']

    # Paired, one-sided test: is model 1's logmape lower than model 2's?
    t_statistic, p_value = stats.ttest_rel(logmape_1, logmape_2, alternative='less')

    if verbose:
        print("\nPaired t-test results:")
        print(f"t-statistic: {t_statistic:.4f}")
        print(f"p-value: {p_value:.4f}")

        if p_value < alpha:
            print(f"There is a statistically significant difference with model having lower logmape values (p < {alpha}).")
        else:
            print(f"There is no statistically significant evidence that model has lower logmape values (p >= {alpha}).")

        # Additional descriptive statistics
        diff_mean = logmape_1.mean() - logmape_2.mean()
        num_better = int((logmape_1 < logmape_2).sum())
        total_cases = len(merged_df)
        # Guard against an empty frame instead of dividing by zero.
        percentage_better = (num_better / total_cases) * 100 if total_cases else 0.0

        print("\nAdditional statistics:")
        print(f"Mean difference in logmape: {diff_mean:.4f}")
        print(f"Cases where model performs better: {num_better} out of {total_cases} ({percentage_better:.1f}%)")

        # Kept local: the caller's frame must not gain a 'diff' column as a
        # side effect of asking for verbose output.
        logmape_diff = logmape_1 - logmape_2

        plt.figure(figsize=(8, 4))

        # Left panel: histogram of the paired differences
        plt.subplot(1, 2, 1)
        plt.hist(logmape_diff, bins=30, color='skyblue', edgecolor='black')
        plt.axvline(x=0, color='red', linestyle='--')
        plt.xlabel('Difference in logmape')
        plt.ylabel('Frequency')
        plt.title('Histogram of Differences')

        # Right panel: model 1 against model 2
        plt.subplot(1, 2, 2)
        plt.scatter(logmape_1, logmape_2, alpha=0.5)
        plt.plot([-15, 5], [-15, 5], 'r--')  # Line y=x for reference
        plt.xlabel('logmape (model 1)')
        plt.ylabel('logmape (model 2)')
        plt.title('Comparison of logmape Values')
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    return t_statistic, p_value, merged_df['mape_model1'] - merged_df['mape_model2']

In [None]:
# Run the comparison with printed statistics and diagnostic plots. As the last
# expression of the cell, the returned (t_statistic, p_value, mape difference)
# tuple is also shown as the cell output.
paired_t_test(merged_df, verbose=True)

# New Start

In [None]:
from vae_model import VariationalAutoencoderWithFeatureSelection
from mainv3 import SystemDesign

# System description for the VAE experiment: both solvents plus temperature
# identify a system; the targets are J0, J1 and J2.
vae_design = dict(
    system_columns=['solvent_1', 'solvent_2', 'temperature'],
    raw_data_path='curve_fit_results_x_is_7.csv',
    extra_fitted_points=1,
    target_columns=['J0', 'J1', 'J2'],
)
system = SystemDesign(**vae_design)

# Options forwarded to train_model, grouped by concern.
vae_feature_options = dict(
    feature_selection_method='random_forest',
    n_features=10,
    keep_prefixes=['solvent_1_pure', 'solvent_2_pure', 'system', 'solubility_', 'temperature'],
)
vae_fit_options = dict(
    epochs=1000,
    batch_size=32,
    verbose=1,
    optimize_hyperparams=True,
    n_calls=11,
)
vae_model_options = dict(
    latent_dim=16,
    kl_weight=0.001,
)

# Train with the VAE model class instead of train_model's default.
system.train_model(
    model_class=VariationalAutoencoderWithFeatureSelection,
    **vae_feature_options,
    **vae_fit_options,
    **vae_model_options,
)

In [None]:
# System description: both solvents plus temperature identify a system and the
# targets are J0, J1 and J2.
default_design = {
    'system_columns': ['solvent_1', 'solvent_2', 'temperature'],
    'raw_data_path': 'curve_fit_results_x_is_7.csv',
    'extra_fitted_points': 1,
    'target_columns': ['J0', 'J1', 'J2'],
}
system = SystemDesign(**default_design)

# Train with train_model's default model class and optimize_hyperparams switched on.
default_training = {
    'feature_selection_method': 'random_forest',
    'n_features': 10,
    'keep_prefixes': ['solvent_1_pure', 'solvent_2_pure', 'system', 'solubility_', 'temperature'],
    'epochs': 1000,
    'batch_size': 32,
    'verbose': 1,
    'optimize_hyperparams': True,
    'n_calls': 11,
}
system.train_model(**default_training)

# Evaluation step; its return value (if any) is not used here.
system.evaluate_model()

# Keep predictions, ground truth and the error metric for inspection.
predictions, actuals, mae = system.get_predictions_and_metrics()

# Persist the trained model next to the notebook.
system.model.save_model('trained_model.keras')

In [None]:
from xgb_model import XGBoostModelWithFeatureSelection

# System description for the XGBoost experiment: both solvents plus temperature
# identify a system; the targets are J0, J1 and J2.
xgb_design = dict(
    system_columns=['solvent_1', 'solvent_2', 'temperature'],
    raw_data_path='curve_fit_results_x_is_7.csv',
    extra_fitted_points=1,
    target_columns=['J0', 'J1', 'J2'],
)
system = SystemDesign(**xgb_design)

# Options forwarded to train_model: feature selection, search settings, and
# the two XGBoost-specific values.
xgb_training = dict(
    feature_selection_method='random_forest',
    n_features=10,
    keep_prefixes=['solvent_1_pure', 'solvent_2_pure', 'system', 'solubility_', 'temperature'],
    verbose=1,
    optimize_hyperparams=True,
    n_calls=11,
    n_estimators=100,
    max_depth=6,
)
system.train_model(model_class=XGBoostModelWithFeatureSelection, **xgb_training)

# Evaluation step; its return value (if any) is not used here.
system.evaluate_model()

# Keep predictions, ground truth and the error metric for inspection.
predictions, actuals, mae = system.get_predictions_and_metrics()

# Show the 15 highest-ranked features.
system.model.plot_feature_importance(top_n=15)

# Persist the trained model next to the notebook.
system.model.save_model('trained_xgboost_model.json')