In [None]:
import sqlite3
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import random
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tqdm import tqdm

# Seed Python's `random` module so that the random.sample() draws used later
# to pick fitting points give the same selection on every fresh run.
random.seed(42)

In [None]:
# Load the `selected_solubility_data` table from the project SQLite database.
connection = sqlite3.connect('../../db/MasterDatabase.db')
try:
    # Execute the query and load the data into a pandas DataFrame
    df = pd.read_sql_query("SELECT * FROM selected_solubility_data", connection)
finally:
    # Close the database connection even when the query raises; otherwise the
    # open handle would stay alive in the kernel until it is restarted.
    connection.close()

display(df)


# One DataFrame per (solvent_1, solvent_2, compound_id, temperature) combination,
# each re-indexed from 0 so positional and label indexing agree downstream.
groups = [group.reset_index(drop=True) for _, group in df.groupby(['solvent_1', 'solvent_2', 'compound_id','temperature'])]

### Fit the Jouyban-Acree (JA) model to each group and compute average MAPE scores

In [None]:
import warnings


def make_jouyban_acree(solvent_1_pure, solvent_2_pure, temperature):
    """Build the Jouyban-Acree model function for one solvent system.

    Parameters
    ----------
    solvent_1_pure, solvent_2_pure : float
        Solubility of the compound in pure solvent 1 / pure solvent 2.
        Both must be strictly positive because their logarithm is taken.
    temperature : float
        Temperature of the data set; the interaction term is divided by it.
        NOTE(review): the model form presumably expects absolute temperature
        (K) - confirm the units of the `temperature` column.

    Returns
    -------
    callable
        ``jouyban_acree(f1, J0, J1, J2)`` returning the predicted solubility at
        solvent-1 fraction ``f1`` (scalar or array-like), in the signature
        expected by ``scipy.optimize.curve_fit``.
    """
    # These logs are constant for a given group, so compute them once instead
    # of on every function evaluation performed by the optimizer.
    log_c1 = np.log(solvent_1_pure)
    log_c2 = np.log(solvent_2_pure)

    def jouyban_acree(f1, J0, J1, J2):
        # Calculate fraction of second solvent
        f2 = 1 - f1

        # Interaction term: f1*f2 times a quadratic polynomial in
        # (2*f1 - 1), which equals (f1 - f2) because f2 = 1 - f1.
        interaction_term = J0 * f1 * f2 + J1 * f1 * f2 * (2*f1 - 1) + J2 * f1 * f2 * (2*f1 - 1)**2

        # Calculate logarithm of solubility in the mixture
        log_Cm = f1 * log_c1 + f2 * log_c2 + interaction_term / temperature

        # Return the solubility in the mixture
        return np.exp(log_Cm)

    return jouyban_acree


results = []
failed_groups = []
skipped_groups = []

x = 3 # Number of random interior points to select (both end points are always added)

for gn, chosen_df in enumerate(tqdm(groups, desc="Processing groups")):
    n = len(chosen_df)
    if n < x+2:  # Skip groups that don't have enough points
        print(f"Skipping group {gn} due to insufficient data points")
        skipped_groups.append(gn)
        continue

    # Draw the sample immediately after the size check so the seeded RNG
    # stream is consumed once per sufficiently large group, whatever happens
    # to the group afterwards.
    random_indices = random.sample(range(1, n-1), x)

    # The SQL query has no ORDER BY, so row order inside a group is not
    # guaranteed. Sort by composition so that positions 0 and n-1 really are
    # the two ends of the composition range; the stable sort leaves data that
    # is already ordered untouched.
    chosen_df = chosen_df.sort_values('solvent_1_weight_fraction', kind='stable').reset_index(drop=True)

    # Rows measured in (nearly) pure solvent 2 and (nearly) pure solvent 1.
    solvent_2_rows = chosen_df[chosen_df['solvent_1_weight_fraction'] <= 0.01]
    solvent_1_rows = chosen_df[chosen_df['solvent_1_weight_fraction'] >= 0.99]
    if solvent_2_rows.empty or solvent_1_rows.empty:
        # Without both pure-solvent solubilities the model cannot be built;
        # indexing the empty selection would raise IndexError and abort the run.
        print(f"Failed to fit group {gn} due to missing pure-solvent data points")
        failed_groups.append(gn)
        continue

    solvent_2_pure = solvent_2_rows.iloc[0]['solubility_g_g']
    solvent_1_pure = solvent_1_rows.iloc[0]['solubility_g_g']
    specific_temperature = chosen_df['temperature'].iloc[0]

    # The model takes log() of both pure solubilities; zero, negative or NaN
    # values would silently yield inf/NaN predictions (warnings are muted below).
    if not (solvent_1_pure > 0 and solvent_2_pure > 0):
        print(f"Failed to fit group {gn} due to non-positive pure-solvent solubility")
        failed_groups.append(gn)
        continue

    # Fitting subset: both end points plus the x randomly chosen interior rows
    fitting_df = chosen_df.iloc[[0] + random_indices + [n-1]].reset_index(drop=True)

    jouyban_acree = make_jouyban_acree(solvent_1_pure, solvent_2_pure, specific_temperature)

    # Suppress warnings during curve fitting
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            popt, pcov = curve_fit(jouyban_acree, fitting_df['solvent_1_weight_fraction'], fitting_df['solubility_g_g'])
    except (RuntimeError, ValueError) as e:
        # curve_fit raises RuntimeError when the minimization fails and
        # ValueError when the data contain NaNs; both mean "no usable fit".
        print(f"{type(e).__name__}: {e}")
        failed_groups.append(gn)
        continue

    if pcov is None or not np.isfinite(pcov).all():
        print(f"Failed to fit group {gn} due to covariance issues")
        failed_groups.append(gn)
        continue

    # Extract the fitted parameters
    J0, J1, J2 = popt

    # Calculate predicted solubility for all experimental data points
    predicted_solubility = jouyban_acree(chosen_df['solvent_1_weight_fraction'], J0, J1, J2)

    # The sklearn metrics raise on NaN/inf input, which would abort the loop.
    if not np.isfinite(predicted_solubility).all():
        print(f"Failed to fit group {gn} due to non-finite predictions")
        failed_groups.append(gn)
        continue

    observed_solubility = chosen_df['solubility_g_g']

    # Root Mean Square Error
    rmse = np.sqrt(mean_squared_error(observed_solubility, predicted_solubility))

    # R² score (coefficient of determination)
    r2 = r2_score(observed_solubility, predicted_solubility)

    # Mean Absolute Percentage Error (MAPE)
    # NOTE(review): an observed solubility of exactly 0 makes this inf - confirm
    # the table never contains zero solubilities.
    mape = np.mean(np.abs((observed_solubility - predicted_solubility) / observed_solubility)) * 100

    # Store results in dictionary
    result = {
        'group_index': gn,
        'solvent_1': chosen_df['solvent_1'].iloc[0],
        'solvent_2': chosen_df['solvent_2'].iloc[0],
        'compound_id': chosen_df['compound_id'].iloc[0],
        'temperature': specific_temperature,
        'J0': J0,
        'J1': J1,
        'J2': J2,
        'solvent_1_pure': solvent_1_pure,
        'solvent_2_pure': solvent_2_pure,
        'rmse': rmse,
        'r2': r2,
        'mape': mape,
    }
    results.append(result)

print(f"Processed {len(results)} groups successfully out of {len(groups)} total groups")
print(f"Failed to process {len(failed_groups)} groups")
print(f"Skipped {len(skipped_groups)} groups due to insufficient data points")

In [None]:
def results_describe(results):
    """Print summary statistics of the per-group MAPE values and show the table.

    Parameters
    ----------
    results : list of dict
        One record per fitted group. Every record must contain a 'mape' key;
        all keys become columns of the displayed table.

    Returns
    -------
    None
        The statistics are printed and the results table, sorted by
        descending MAPE, is rendered with ``display``. If ``results`` is
        empty a short notice is printed instead.
    """
    results_df = pd.DataFrame(results)
    if results_df.empty:
        # An empty frame has no 'mape' column, so sorting by it would raise
        # KeyError; report the situation instead of crashing.
        print("No results to describe")
        return

    results_df = results_df.sort_values(by='mape', ascending=False)
    mape_values = results_df['mape']

    # Calculate average MAPE and other statistics
    average_mape = mape_values.mean()
    median_mape = mape_values.median()
    min_mape = mape_values.min()
    max_mape = mape_values.max()

    print(f"Average MAPE: {average_mape}")
    print(f"Median MAPE: {median_mape}")
    print(f"Min MAPE: {min_mape}")
    print(f"Max MAPE: {max_mape}")

    # Print descriptive statistics for MAPE values
    print("\n--- MAPE Distribution Analysis ---")
    print(f"Count of values: {len(mape_values)}")
    print(f"Number of values above 100%: {(mape_values > 100).sum()}")
    print(f"Number of values above 50%: {(mape_values > 50).sum()}")
    print(f"Number of values below 10%: {(mape_values < 10).sum()}")
    print(f"Number of values below 5%: {(mape_values < 5).sum()}")

    # Show the full table (worst fits first); nothing is returned
    display(results_df)

In [None]:
# Print the MAPE summary statistics and display the per-group results table
results_describe(results)