This file is for creating the datasets that we will use for machine learning. 

## Removing duplicates and splitting data

In [1]:
import pandas as pd
import os
from rdkit import Chem
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random
import numpy as np
from thermo import Chemical, UNIFAC
import copy

seed = 67

# set random seed
random.seed(seed)

pd.set_option('display.max_columns', None)

In [2]:
def go_back_folder():
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    os.chdir(parent_dir)

def enter_folder(folder_name):
    current_dir = os.getcwd()
    child_dir = os.path.join(current_dir, folder_name)
    os.chdir(child_dir)

# Enter the folder for the code again
enter_folder("Data")

# Load the data from the file Raw_VLE_data.xlsx. 
excel_file = pd.ExcelFile("Raw_VLE_data.xlsx")

# Go back to the code folder
go_back_folder()

In [8]:
# Want to iterate over all the sheets in the file and store them in a dictionary
VLE_data = {}
for sheet in excel_file.sheet_names:
    if sheet == "Overview":
        continue 

    # Get sheet name
    cas_number = sheet.split("_")[0]

    # Extract the data from the sheet
    df = excel_file.parse(sheet)

    water_column_X = None 
    amine_column_X = None
    water_column_Y = None
    amine_column_Y = None

    # Identify which X that is the water column 
    if df["X"][0].lower() == "water":
        water_column_X = "X"
        amine_column_X = "X.1"

    elif df["X.1"][0].lower() == "water":
        water_column_X = "X.1"
        amine_column_X = "X"

    else: 
        raise ValueError(f"No column with water (X) found for cas number {cas_number}")
    
    # Identify which Y that is the water column
    if df["Y"][0].lower() == "water":
        water_column_Y = "Y"
        amine_column_Y = "Y.1"

    elif df["Y.1"][0].lower() == "water":
        water_column_Y = "Y.1"
        amine_column_Y = "Y"

    else: 
        raise ValueError(f"No column with water (Y) found for cas number {cas_number}")

    # If a row does is not a string in the "Usage" column, then remove it
    df = df[df["Usage"].apply(lambda x: isinstance(x, str))]

    # If a row in the column "Usage" usage in lower case is not "data", then remove it
    df = df[df["Usage"].apply(lambda x: x.lower() == "data")]

    VLE_data[cas_number] = {"x_water": df[water_column_X].values, "x_amine": df[amine_column_X].values, 
                            "y_water": df[water_column_Y].values, "y_amine": df[amine_column_Y].values, 
                            "T": df["TEMPERATURE"].values, "P": df["PRESSURE"].values}

Want to use UNIFAC to estimate the activity coefficients. UNIFAC looks at the funciton groups, and based on this estimates the activity coefficients.

In [9]:
# Can not just copy the dictionary. Have to perform a deep copy to avoid changing the original dictionary
VLE_data_with_estimated_y = copy.deepcopy(VLE_data)

for cas_number, data in VLE_data_with_estimated_y.items():

    print(cas_number)

    # Create the molecule object
    molecule_amine = Chemical(cas_number)
    molecule_water = Chemical("7732-18-5")

    # Get the UNIFAC subgroup list
    subgroups_amine = molecule_amine.UNIFAC_groups
    subgroups_water = molecule_water.UNIFAC_groups

    subgroup_list = [subgroups_amine, subgroups_water]

    # Calculate pure component vapor pressures [Pa] at all the temperatures
    P_sat_amine = [molecule_amine.VaporPressure(T)for T in data["T"]]
    P_sat_water = [molecule_water.VaporPressure(T) for T in data["T"]]

    print(molecule_amine.VaporPressure.all_methods)

    ewrf

    # If the molecule does not have any UNIFAC subgroups, then skip it
    if subgroups_amine is None: 
        continue

    y_amine = []
    y_water = []
    # Iterate over all the mixtures
    for x_amine, x_water, P_amine, P_water, T, P in zip(data["x_amine"], data["x_water"], P_sat_amine, P_sat_water, data["T"], data["P"]):

        mole_fractions = [x_amine, x_water]

        # Create a mixture object
        mixture = UNIFAC.from_subgroups(T, mole_fractions, subgroup_list, version=1)

        # Calculate the activity coefficients
        gamma_amine, gamma_water = mixture.gammas()

        # Calculate the y values
        y_amine.append(x_amine * gamma_amine * P_amine / P)
        y_water.append(x_water * gamma_water * P_water / P)

    VLE_data_with_estimated_y[cas_number]["y_amine"] = y_amine  
    VLE_data_with_estimated_y[cas_number]["y_water"] = y_water

140-31-8
{'BOILING_CRITICAL', 'EOS', 'LANDOLT', 'EDALAT', 'AMBROSE_WALTON', 'SANJARI', 'LEE_KESLER_PSAT'}


NameError: name 'ewrf' is not defined

In [74]:
print(VLE_data_with_estimated_y)

{'140-31-8': {'x_water': array([0.998, 0.991, 0.9875, 0.978, 0.9725, 0.9628, 0.9181, 0.8966,
       0.9989, 0.9971, 0.9937, 0.9896, 0.986, 0.9826, 0.9777, 0.9709,
       0.9331, 0.919, 0.8983, 0.8771, 0.8555, 0.997, 0.9936, 0.9903,
       0.9865, 0.9825, 0.9777, 0.9719, 0.9708, 0.9487, 0.9335, 0.9173,
       0.8981, 0.8776, 0.8612, 0.9988, 0.9975, 0.995, 0.992, 0.9889,
       0.9857, 0.9818, 0.9766, 0.9665, 0.9553, 0.9410000000000001, 0.9236,
       0.9046, 0.8833, 0.8535, 0.9991, 0.9973, 0.9946, 0.9916, 0.9883,
       0.9849, 0.9802, 0.9746, 0.9636, 0.95, 0.9359, 0.9189, 0.8817,
       0.8633, 0.5728, 0.6867, 0.7233, 0.6536, 0.594, 0.7389, 0.659,
       0.581], dtype=object), 'x_amine': array([0.002, 0.009, 0.0125, 0.022, 0.0275, 0.0372, 0.0819, 0.1034,
       0.0011, 0.0029, 0.0063, 0.0104, 0.014, 0.0174, 0.0223, 0.0291,
       0.0669, 0.081, 0.1017, 0.1229, 0.1445, 0.003, 0.0064, 0.0097,
       0.0135, 0.0175, 0.0223, 0.0281, 0.0292, 0.0513, 0.0665, 0.0827,
       0.1019, 0.1224, 0.

Code that counts the number of unique atoms in a molecule: 

In [75]:
def get_unique_atoms_in_SMILES(smiles):
    """
    This function returns a dictionary with an overview of the unique atoms present in the provided dataset.
    Each unique atom type is counted once per molecule, regardless of how many times it appears in that molecule.
    """
    # Initialize a Counter object to hold our atom counts
    atom_counts = Counter()

    # Convert the SMILES string to an RDKit molecule
    mol = Chem.MolFromSmiles(smiles)
    # If the molecule is valid, proceed
    if mol:
        # Add hydrogens explicitly
        mol = Chem.AddHs(mol)
        # Create a set of unique atoms for this molecule
        unique_atoms = set(atom.GetSymbol() for atom in mol.GetAtoms())
        # Update the atom counts with the unique atoms in this molecule
        atom_counts.update(unique_atoms)

    # Create a dictionary to store the atom counts
    atom_counts_dict = dict(atom_counts)
    return atom_counts_dict


def get_unique_atoms_in_data(data):
    """
    This function returns a dictionary with an overview of the unique atoms present in the provided dataset.
    Each unique atom type is counted once per molecule, regardless of how many times it appears in that molecule.
    """
    # Initialize a Counter object to hold our atom counts
    atom_counts = Counter()

    # Iterate over each SMILES string in the dataset
    for smiles in data['SMILES']:
        # Get the unique atoms in the SMILES string
        atom_counts_dict = get_unique_atoms_in_SMILES(smiles)
        
        # Update the atom counts with the unique atoms in this molecule
        atom_counts.update(atom_counts_dict.keys())

    # Create a dictionary to store the atom counts
    atom_counts_dict = dict(atom_counts)

    sorted_dict = dict(sorted(atom_counts_dict.items(), key=lambda x: x[1], reverse=True))

    return sorted_dict

The code below extracts and saves those values where experimental values for y is given. 

In [11]:
VLE_data_with_experimental_y = copy.deepcopy(VLE_data)
count = 0 

# Iterate over VLE_data, as we want to remove the rows with nan values
cas_numbers_to_remove = []
for cas_number, data in VLE_data_with_experimental_y.items():
    # Check the if the y_values only consist of nan, 1 or 0
    if all([np.isnan(y) or y == 1 or y == 0 for y in data["y_amine"]]) and all([np.isnan(y) or y == 1 or y == 0 for y in data["y_water"]]):
        # Remove the key from the dictionary
        cas_numbers_to_remove.append(cas_number)

    else: 

        count += 1
        indeces_to_remove = []
        for index, y_amine in enumerate(data["y_amine"]):

            """
            # Want to remove the values that are 1 or 0, if the values before and after are nan
            if y_amine == 1 or y_amine == 0:

                # Edge case
                if index == 0: 
                    # If the next value is nan, then remove the value
                    if np.isnan(data["y_amine"][index + 1]):
                        indeces_to_remove.append(index)

                # Edge case
                elif index == len(data["y_amine"]) - 1:
                    # If the previous value is nan, then remove the value
                    if np.isnan(data["y_amine"][index - 1]):
                        indeces_to_remove.append(index)

                # If the values before and after are nan, then remove the value
                elif (np.isnan(data["y_amine"][index - 1]) or data["y_amine"][index - 1] == 1 or data["y_amine"][index - 1] == 0) and (np.isnan(data["y_amine"][index + 1]) or data["y_amine"][index + 1] == 1 or data["y_amine"][index + 1] == 0):
                    indeces_to_remove.append(index)
            """

            if np.isnan(y_amine):
                indeces_to_remove.append(index)

        for key in data.keys():
            # Remove the rows with nan values
            data[key] = np.delete(data[key], indeces_to_remove)

# Remove the keys from the dictionary
for key in cas_numbers_to_remove:
    del VLE_data_with_experimental_y[key]

print(count)

# Print all the cas numbers
print(VLE_data_with_experimental_y.keys())

# Amines_NIST_Information_True.xlsx contains the smiles for all the cas numbers
# Want to load the smiles for all the cas numbers
df_overview = pd.read_excel("Data/Amines_NIST_Information_True.xlsx", sheet_name="Overview")

# Check if any of the smiles are duplicated (present in more than one row)
smiles_counter = Counter(df_overview["SMILES"])
duplicates = {key: value for key, value in smiles_counter.items() if value > 1}
print(duplicates)

# Also want to remove cas number "111-41-1" as this molecule only had data for pure water. 
del VLE_data_with_experimental_y["111-41-1"]

# Size of the data after removing the cas number "111-41-1"
size = 0
for key, data in VLE_data_with_experimental_y.items():
    size += len(data["P"])

print("Data after removing cas number 111-41-1: ", size)

print(count-1)

40
dict_keys(['140-31-8', '105-59-9', '124-09-4', '5308-25-8', '111-42-2', '111-41-1', '3179-63-3', '121-44-8', '142-84-7', '108-18-9', '109-01-3', '108-91-8', '111-49-9', '62-53-3', '124-68-5', '108-01-0', '6291-84-5', '110-72-5', '616-39-7', '110-91-8', '110-85-0', '110-89-4', '110-86-1', '109-83-1', '78-90-0', '109-76-2', '109-89-7', '109-73-9', '78-81-9', '123-75-1', '141-43-5', '57-14-7', '107-15-3', '75-31-0', '75-50-3', '107-10-8', '60-34-4', '124-40-3', '75-04-7', '74-89-5'])
{}
Data after removing cas number 111-41-1:  2923
39


Makes a overview of the atom distribution. 

In [77]:
# Create a new dataframe to store the smiles values
smiles_list = []

# Iterate over each compound in the dataset
for cas_number in VLE_data_with_experimental_y.keys():
    smiles = df_overview[df_overview["CAS No"] == cas_number]["SMILES"].values[0]
    
    # Get the unique atoms in the compound
    atom_counts_dict = get_unique_atoms_in_SMILES(smiles)
    
    # Check if any atom is present that is not in the specified list
    if any(atom not in ['C', 'H', 'O', 'Cl', 'N', 'F', 'Br', 'S', 'I'] for atom in atom_counts_dict.keys()):
        print("CAS number:", cas_number, "has unwanted atoms:", atom_counts_dict)
        continue  # Skip this compound if it has unwanted atoms

    # Add the SMILES to the dataframe. Df has no append. 
    smiles_list.append(smiles)

# Create a dataframe with the smiles values
df_smiles = pd.DataFrame(smiles_list, columns=["SMILES"])
    
count_dict = get_unique_atoms_in_data(df_smiles)
count_dict

{'H': 39, 'C': 39, 'N': 39, 'O': 8}

Also want to remove all the rows that has missing pressures, temperatures etc, even though they have y-values. 

In [78]:
for cas_num, data in VLE_data_with_experimental_y.items():
    remove_indeces = []
    for property, value in data.items():
        index = 0
        for val in value:
            if np.isnan(val):
                remove_indeces.append(index)
            index +=1

    
    if len(remove_indeces) > 0:
        for property, value in data.items():
            data[property] = np.delete(value, remove_indeces)

size = 0
for key, data in VLE_data_with_experimental_y.items():
    size += len(data["P"])

print("Data after removing cas number 111-41-1: ", size)

Data after removing cas number 111-41-1:  2849


Want to figure out how much data that is outside of the temperature interval. 

In [12]:
# Make a deep copy 
VLE_data_with_experimental_y_valid_temps = copy.deepcopy(VLE_data_with_experimental_y)

# Want to remove data from the dictionary if "T" is less than 313 or above 408.15 K
size = 0 
for cas_num, data in VLE_data_with_experimental_y_valid_temps.items():
    indeces_to_remove = []
    for index, T in enumerate(data["T"]):
        size += 1
        if T < 313 or T > 408.15:
            indeces_to_remove.append(index)
            size -= 1

    for property in data.keys():
        data[property] = np.delete(data[property], indeces_to_remove)

print("Data inside the temperature range: ", size)
# Find the size of VLE_data_with_experimental_y as well 
size = 0
for key, data in VLE_data_with_experimental_y.items():
    size += len(data["T"])

print("Data outside the temperature range: ", size)

Data inside the temperature range:  2313
Data outside the temperature range:  2923


Also want to remove very low pressures, as the baseline model did not contain compounds below 1 mmHg. I remove pressures if they are below 200 Pa. 

In [13]:
# Make a deep copy 
VLE_data_with_experimental_y_valid_pres = copy.deepcopy(VLE_data_with_experimental_y_valid_temps)

# Find the size of VLE_data_with_experimental_y_valid_temps as well 
size = 0
for key, data in VLE_data_with_experimental_y_valid_temps.items():
    size += len(data["P"])

print("Data outside the pressure range: ", size)

# Removes data outside the pressure range
size = 0 
for cas_num, data in VLE_data_with_experimental_y_valid_pres.items():
    indeces_to_remove = []
    for index, P in enumerate(data["P"]):
        size += 1
        if P < 200:
            indeces_to_remove.append(index)
            size -= 1

    for property in data.keys():
        data[property] = np.delete(data[property], indeces_to_remove)

print("Data inside the pressure range: ", size)

# Also want to remove cas number "111-41-1" as this molecule only had data for pure water. 
#del VLE_data_with_experimental_y_valid_pres["111-41-1"]

# Size of the data after removing the cas number "111-41-1"
size = 0
for key, data in VLE_data_with_experimental_y_valid_pres.items():
    size += len(data["P"])

print("Data after removing cas number 111-41-1: ", size)

Data outside the pressure range:  2313
Data inside the pressure range:  2308
Data after removing cas number 111-41-1:  2308


Also want to remove all the data where x=0 or x=1, and when y=0 or y=1. Want to do this since we use the logarithm in the loss funciton, and log(0) is impossible.

In [None]:
# Make a deep copy 
VLE_data_with_experimental_y_no_pure = copy.deepcopy(VLE_data_with_experimental_y_valid_pres)

# Find the size of VLE_data_with_experimental_y_no_pure as well 
size = 0
for key, data in VLE_data_with_experimental_y_no_pure.items():
    size += len(data["x_amine"])

print("VLE data, including pure compounds: ", size)

# Removes pure compound data (x-values)
size = 0 
for cas_num, data in VLE_data_with_experimental_y_no_pure.items():
    indeces_to_remove = []
    for index, x in enumerate(data["x_amine"]):
        size += 1
        if x == 0 or x == 1:
            indeces_to_remove.append(index)
            size -= 1

    for property in data.keys():
        data[property] = np.delete(data[property], indeces_to_remove)

# Removes pure compound data (y-values)
size = 0 
for cas_num, data in VLE_data_with_experimental_y_no_pure.items():
    indeces_to_remove = []
    for index, y in enumerate(data["y_amine"]):
        size += 1
        if y == 0 or y == 1:
            indeces_to_remove.append(index)
            size -= 1

    for property in data.keys():
        data[property] = np.delete(data[property], indeces_to_remove)

print("VLE data, without pure compounds: ", size)

VLE data, including pure compounds:  2264
VLE data, without pure compounds:  1986


Want to save this data in an excel file.

In [None]:
# Create a new excel file called "VLE_experimental_y.xlsx"
enter_folder("Data")
enter_folder("Combined model")
with pd.ExcelWriter("VLE_experimental_y.xlsx") as writer:
    for key, data in VLE_data_with_experimental_y_no_pure.items():
        df = pd.DataFrame(data)
        df.to_excel(writer, sheet_name=key, index=False)
go_back_folder()
go_back_folder()

I know want to split this overall dataset into train and test. I do the split based on the cas number and not on all the datapoints. I do an 80/20 split due to little data available. -> This lead to dataleakage. I need to ensure that the data in the test set has not already been seen by the model when developing the model for pure pressure prediction to achieve an unbiased evaluation of the model performance.  

In [None]:
"""
# set random seed
random.seed(42)

train_dict = {}
test_dict = {}

for cas_num, data in VLE_data_with_experimental_y_valid_pres.items():
    # Pick a random number between 0 and 1
    random_number = random.uniform(0, 1)
    if random_number < 0.8: 
        train_dict[cas_num] = data

    else:
        test_dict[cas_num] = data

# Create a new excel file called "VLE_experimental_y_train.xlsx"
enter_folder("Data")
enter_folder("Combined model")
with pd.ExcelWriter("VLE_experimental_y_train.xlsx") as writer:
    for key, data in train_dict.items():
        df = pd.DataFrame(data)
        df.to_excel(writer, sheet_name=key, index=False)

# Create a new excel file called "VLE_experimental_y_test.xlsx"
with pd.ExcelWriter("VLE_experimental_y_test.xlsx") as writer:
    for key, data in test_dict.items():
        df = pd.DataFrame(data)
        df.to_excel(writer, sheet_name=key, index=False)

go_back_folder()
go_back_folder()
"""

'\n# set random seed\nrandom.seed(42)\n\ntrain_dict = {}\ntest_dict = {}\n\nfor cas_num, data in VLE_data_with_experimental_y_valid_pres.items():\n    # Pick a random number between 0 and 1\n    random_number = random.uniform(0, 1)\n    if random_number < 0.8: \n        train_dict[cas_num] = data\n\n    else:\n        test_dict[cas_num] = data\n\n# Create a new excel file called "VLE_experimental_y_train.xlsx"\nenter_folder("Data")\nenter_folder("Combined model")\nwith pd.ExcelWriter("VLE_experimental_y_train.xlsx") as writer:\n    for key, data in train_dict.items():\n        df = pd.DataFrame(data)\n        df.to_excel(writer, sheet_name=key, index=False)\n\n# Create a new excel file called "VLE_experimental_y_test.xlsx"\nwith pd.ExcelWriter("VLE_experimental_y_test.xlsx") as writer:\n    for key, data in test_dict.items():\n        df = pd.DataFrame(data)\n        df.to_excel(writer, sheet_name=key, index=False)\n\ngo_back_folder()\ngo_back_folder()\n'

Want a function to evaluate the train/test split. 

In [None]:
def evaluate_split(train_dict, test_dict):
    # Amount of data in the train data vs test data
    size_train = 0
    for key, data in train_dict.items():
        size_train += len(data["T"])

    size_test = 0
    for key, data in test_dict.items():
        size_test += len(data["T"])

    # Amount of amines in train data vs test data
    amine_ratio = len(train_dict)/(len(train_dict) + len(test_dict))
    data_ratio = size_train/(size_train + size_test)

    return amine_ratio, data_ratio

Datasplitting without dataleakage: 

In [None]:
# set random seed
#random.seed(42)
enter_folder("Data")

# Load the files used for the pure compound model
df_train_pure_compounds = pd.read_excel('Train.xlsx')
df_test_pure_compounds = pd.read_excel('Test.xlsx')

# Store all the cas number from the column "CAS No"
cas_numbers_train = df_train_pure_compounds['CAS No'].tolist()
cas_numbers_test = df_test_pure_compounds['CAS No'].tolist()

# Create new dictionaies for the VLE data
train_dict = {}
test_dict = {}

for cas_num, data in VLE_data_with_experimental_y_no_pure.items():
    if str(cas_num) in cas_numbers_test:
        test_dict[cas_num] = data

    else:
        train_dict[cas_num] = data

# Make a deep copy of train_dict and test_dict
train_dict_copy = copy.deepcopy(train_dict)
test_dict_copy = copy.deepcopy(test_dict)

# Will contain the best split
best_train_dict = {}
best_test_dict = {}

index = 0
# Evaluate different splits
for cas_num, data in list(test_dict.items()):
    index += 1
    # Remove compound from test_dict
    test_dict_copy.pop(cas_num)
    
    # Append compound to train_dict
    train_dict_copy[cas_num] = data
    
    # Evaluate split
    amine_ratio, data_ratio = evaluate_split(train_dict_copy, test_dict_copy)
    
    # Print the result
    print(f"Split: {amine_ratio:.2f}, {data_ratio:.2f}")

    # The split that gives 0.74, 0.75 was the best one (hardcoded) -> The ratio with two decimals must be this 
    if round(amine_ratio, 2) == 0.74 and round(data_ratio, 2) == 0.75:
        best_train_dict = copy.deepcopy(train_dict_copy)
        best_test_dict = copy.deepcopy(test_dict_copy)

    # Take deep copy again to reset the dictionaries
    train_dict_copy = copy.deepcopy(train_dict)
    test_dict_copy = copy.deepcopy(test_dict)  

# Create a new excel file called "VLE_experimental_y_train.xlsx"
enter_folder("Combined model")

with pd.ExcelWriter("VLE_experimental_y_train.xlsx") as writer:
    for key, data in best_train_dict.items():
        df = pd.DataFrame(data)
        df.to_excel(writer, sheet_name=key, index=False)

# Create a new excel file called "VLE_experimental_y_test.xlsx"
with pd.ExcelWriter("VLE_experimental_y_test.xlsx") as writer:
    for key, data in best_test_dict.items():
        df = pd.DataFrame(data)
        df.to_excel(writer, sheet_name=key, index=False)

go_back_folder()

go_back_folder()

Split: 0.74, 0.67
Split: 0.74, 0.67
Split: 0.74, 0.67
Split: 0.74, 0.68
Split: 0.74, 0.78
Split: 0.74, 0.67
Split: 0.74, 0.69
Split: 0.74, 0.75
Split: 0.74, 0.69
Split: 0.74, 0.67
Split: 0.74, 0.68


I know want to investigate how good this final split was based on the amount of data and the number of amines. 

In [None]:
amine_ratio, data_ratio = evaluate_split(best_train_dict, best_test_dict)

print(f"Amount of amines in train data vs test data: {amine_ratio}")
print(f"Amount of data in the train data vs test data: {data_ratio:.2f}")

Amount of amines in train data vs test data: 0.7435897435897436
Amount of data in the train data vs test data: 0.75


I now want to use this data to figure out how well UNIFAC estimates the activity coefficients compared to UNIFAC calculated for experimental data. 

In [18]:
pd.set_option('display.max_columns', None)

CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a',
                  '#f781bf', '#a65628', '#984ea3',
                  '#999999', '#e41a1c', '#dede00']

VLE_data_with_experimental_valid = copy.deepcopy(VLE_data_with_experimental_y_valid_pres)

# Get the data from the files Amines_train.xlsx and Amines_test.xlsx
enter_folder("Data")
train_data_sat = pd.read_excel("Train.xlsx")
test_data_sat = pd.read_excel("Test.xlsx")
go_back_folder()

# Merge the data from the files Amines_train.xlsx and Amines_test.xlsx
saturation_data = pd.concat([train_data_sat, test_data_sat])

# Iterate over all the VLE data
for cas_number, VLE_data in VLE_data_with_experimental_valid.items(): 

    # Check if cas_number is in the pure_amine_train["CAS No"]
    if cas_number in saturation_data["CAS No"].values:

        #if cas_number != "121-44-8":
        if cas_number != "107-15-3":
            continue

        # Create the molecule object
        molecule_amine = Chemical(cas_number)
        molecule_water = Chemical("7732-18-5")

        # Get the UNIFAC subgroup list
        subgroups_amine = molecule_amine.UNIFAC_groups
        subgroups_water = molecule_water.UNIFAC_groups

        subgroup_list = [subgroups_amine, subgroups_water]

        # If the molecule does not have any UNIFAC subgroups, then skip it 
        if subgroups_amine is None: 
            continue


        # Get the data needed to calculate the saturation pressure
        A, B, C, TMIN, TMAX = saturation_data[saturation_data["CAS No"] == cas_number][["A", "B", "C", "TMIN", "TMAX"]].values[0]

        # Calculate pure component vapor pressures [Pa] at all the temperatures
        P_sat_amine = []
        P_sat_water = []

        for T in VLE_data["T"]:
            T = T - 273.15
            if T < TMIN or T > TMAX:
                P_sat_amine.append(None)

            else: 
                # calcualte P using antonines equation
                P = 10**(A - B / (T + C))

                # Convert from mmmHg to Pa
                P = P * 133.322
                P_sat_amine.append(P)

        if len(P_sat_amine) != len(VLE_data["T"]):
            print("Not possible to calculate the saturation pressure for all temperatures for cas number: ", cas_number)

        for T in VLE_data["T"]:
            T = T - 273.15
            # If the tempeature is between 1 and 100, then calculate the saturation pressure for water
            if T > 1 and T < 100:
                A_water = 8.07131
                B_water = 1730.63
                C_water = 233.426
                P = 10**(A_water - B_water / (T + C_water))
                P = P * 133.322
                P_sat_water.append(P)

            # If the temperature is between 99 and 374, then calculate the saturation pressure for water
            elif T > 99 and T < 374:
                A_water = 8.14019
                B_water = 1810.94
                C_water = 244.485
                P = 10**(A_water - B_water / (T + C_water))
                P = P * 133.322
                P_sat_water.append(P)

            else:
                P_sat_water.append(None)

        activity_experimental_amine = []
        activity_experimental_water = []

        activity_estimated_amine = []
        activity_estimated_water = []

        for x_amine, y_amine, P_amine, P_water, P in zip(VLE_data["x_amine"], VLE_data["y_amine"], P_sat_amine, P_sat_water, VLE_data["P"]):

            # If any of the values are None, then append None to the list
            if P_amine is None or P_water is None or x_amine == 0 or x_amine == 1 or y_amine == 0 or y_amine == 1:
                activity_experimental_amine.append(None)
                activity_experimental_water.append(None)

            # Calculate the activity coefficients for the different temperatures 
            else: 
                activity_experimental_amine.append(y_amine * P/(P_amine * x_amine))
                activity_experimental_water.append((1 - y_amine) * P/(P_water * (1 - x_amine)))

            # If the previous values are None, then append None to the list as well
            if activity_experimental_amine[-1] is None:
                activity_estimated_amine.append(None)
                activity_estimated_water.append(None)

            else:

                # Want to estimate the activity coefficients for the different conditions
                mole_fractions = [x_amine, (1-x_amine)]

                # Create a mixture object
                mixture = UNIFAC.from_subgroups(T, mole_fractions, subgroup_list, version=1)

                # Calculate the activity coefficients
                gamma_amine, gamma_water = mixture.gammas()

                # Append the estimated activity coefficients
                activity_estimated_amine.append(gamma_amine)
                activity_estimated_water.append(gamma_water)

        # Remove all the None values from the lists
        activity_experimental_amine = [x for x in activity_experimental_amine if x is not None]
        activity_experimental_water = [x for x in activity_experimental_water if x is not None]
        activity_estimated_amine = [x for x in activity_estimated_amine if x is not None]
        activity_estimated_water = [x for x in activity_estimated_water if x is not None]

        ### Create a parity plot for the activity coefficients to see how well the UNIFAC model performs relative to the experimental data ###

        # Create a folder called water activity coefficients. The amine cas number will be the name of the figure
        enter_folder("Data")
        enter_folder("Combined model")

        # If the folder does not exist, then create it
        if not os.path.exists("Water activity coefficients"):
            os.makedirs("Water activity coefficients")
        enter_folder("Water activity coefficients")
        
        # Create a parity plot for the activity coefficient for the amine
        plt.plot(activity_experimental_water, activity_estimated_water, "o", color=CB_color_cycle[0])
        min_value = min(min(activity_experimental_water), min(activity_estimated_water))
        max_value = max(max(activity_experimental_water), max(activity_estimated_water))
        plt.plot([min_value, max_value], [min_value, max_value], "--", color="black")
        plt.xlabel("Experimentally calculated", fontsize=20)
        plt.ylabel("UNIFAC estimated", fontsize=20)
        plt.xticks(fontsize=14)
        plt.yticks(fontsize=14)
        #plt.title(f"Parity plot for the activity coefficient of water (with {cas_number})")
        plt.tight_layout()
        # Still dont see title in plot. Must fix this by 
        plt.savefig(f"Parity_plot_{cas_number}_water.png", dpi=600)
        plt.close()


        go_back_folder()

        # Create a folder called amine activity coefficients. The amine cas number will be the name of the figure
        # If the folder does not exist, then create it
        if not os.path.exists("Amine activity coefficients"):
            os.makedirs("Amine activity coefficients")

        enter_folder("Amine activity coefficients")

        # Create a parity plot for the activity coefficient for the amine
        plt.plot(activity_experimental_amine, activity_estimated_amine, "o", color=CB_color_cycle[0])
        min_value = min(min(activity_experimental_amine), min(activity_estimated_amine))
        max_value = max(max(activity_experimental_amine), max(activity_estimated_amine))
        plt.plot([min_value, max_value], [min_value, max_value], "--", color="black")
        plt.xlabel("Experimentally calculated", fontsize=20)
        plt.ylabel("UNIFAC estimated", fontsize=20)
        plt.xticks(fontsize=14)
        plt.yticks(fontsize=14)
        #plt.title(f"Parity plot for the activity coefficient of amine '{cas_number}'")
        plt.tight_layout()
        plt.savefig(f"Parity_plot_{cas_number}_amine.png", dpi=600)
        plt.close()

        go_back_folder()
        go_back_folder()
        go_back_folder()

The code below is for estimating y_amine and y_water when we know the saturation pressure (code that checked if the activity coefficients was reliable). The output is the sum of y that checks if the results are good enough or not. 

In [None]:
# Get the data from the files Amines_train.xlsx and Amines_test.xlsx
enter_folder("Data")
train_data = pd.read_excel("Train.xlsx")
test_data = pd.read_excel("Test.xlsx")
go_back_folder()

# Want to make a plot of the temperatures 
for cas_number, data in VLE_data.items(): 

    data_copy = data.copy() 

    # Check if cas_number is in the pure_amine_train["CAS No"]
    if cas_number in train_data["CAS No"].values:
        # Print the corresponding row in the pure_amine_train
        A, B, C, TMIN, TMAX = train_data[train_data["CAS No"] == cas_number][["A", "B", "C", "TMIN", "TMAX"]].values[0]

        # Create the molecule object
        molecule_amine = Chemical(cas_number)
        molecule_water = Chemical("7732-18-5")

        # Get the UNIFAC subgroup list
        subgroups_amine = molecule_amine.UNIFAC_groups
        subgroups_water = molecule_water.UNIFAC_groups

        subgroup_list = [subgroups_amine, subgroups_water]

        # Calculate pure component vapor pressures [Pa] at all the temperatures
        P_sat_amine = []

        for T in data_copy["T"]:
            T = T - 273.15
            if T < TMIN or T > TMAX:
                P_sat_amine.append(None)

            else: 
                # calcualte P using antonines equation
                P = 10**(A - B / (T + C))

                # Convert from mmmHg to Pa
                P = P * 133.322
                P_sat_amine.append(P)

        P_sat_water = [molecule_water.VaporPressure(T) for T in data_copy["T"]]

        # If the molecule does not have any UNIFAC subgroups, then skip it
        if subgroups_amine is None: 
            continue

        y_amine = []
        y_water = []
        # Iterate over all the mixtures
        for x_amine, x_water, P_amine, P_water, T, P in zip(data_copy["x_amine"], data_copy["x_water"], P_sat_amine, P_sat_water, data_copy["T"], data_copy["P"]):

            mole_fractions = [x_amine, x_water]

            # Create a mixture object
            mixture = UNIFAC.from_subgroups(T, mole_fractions, subgroup_list)

            # Calculate the activity coefficients
            gamma_amine, gamma_water = mixture.gammas()

            if P_amine is None: 
                y_amine.append(None)
                y_water.append(None)
                continue

            else:

                # Calculate the y values
                y_amine.append(x_amine * gamma_amine * P_amine / P)
                y_water.append(x_water * gamma_water * P_water / P)


        for index, y in enumerate(y_amine):
            if y is None:
                continue

            else: 
                print(y + y_water[index])


    # Check if cas_number is in the pure_amine_test["CAS No"]
    if cas_number in test_data["CAS No"].values:
        # Print the corresponding row in the pure_amine_test
        print(test_data[test_data["CAS No"] == cas_number])

        øoih
