Test of Nusselt with GPLearn

In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Location of the raw TXT dataset to inspect
txt_path = '/home/pcornejo/Tesis/benchmark_project/data/raw/battery/df_n_102.txt'

# Load the complete file into memory as one string
with open(txt_path, 'r') as handle:
    content = handle.read()

# Preview the beginning of the file
preview = content[:100]
print("First 100 characters of the file:")
print(preview)

# Report the overall length of the file, in characters
total_chars = len(content)
print(f"\nTotal characters in the file: {total_chars}")

First 100 characters of the file:
Current,K,Flujo,t_viento,Diametro,col_fluido,col_celda,n_fluido,n_celda,Rem,prandtl,colIndex,nusselt

Total characters in the file: 1650251


### Data Processing (Train/Test/Val)

In [3]:
# Paths to the four raw datasets (df_n_25, df_n_53, df_n_74, df_n_102)
txt_path_25 = '/home/pcornejo/Tesis/benchmark_project/data/raw/battery/df_n_25.txt'
txt_path_53 = '/home/pcornejo/Tesis/benchmark_project/data/raw/battery/df_n_53.txt'
txt_path_74 = '/home/pcornejo/Tesis/benchmark_project/data/raw/battery/df_n_74.txt'
txt_path_102 = '/home/pcornejo/Tesis/benchmark_project/data/raw/battery/df_n_102.txt'

# Load the datasets (comma-separated text with a header row)
df_25 = pd.read_csv(txt_path_25)
df_53 = pd.read_csv(txt_path_53)
df_74 = pd.read_csv(txt_path_74)
df_102 = pd.read_csv(txt_path_102)

# Merge the training datasets (25 and 53).
# ignore_index=True rebuilds a unique 0..N-1 index: each file carries its own
# index, so a plain concat leaves duplicated row labels, which makes any later
# label-based selection or index-aligned assignment ambiguous.
df_train = pd.concat([df_25, df_53], ignore_index=True)

# Validation and test sets
df_val = df_102
df_test = df_74

# Select only the columns we need: three inputs plus the target (nusselt)
selected_columns = ['K', 'Rem', 'prandtl', 'nusselt']

df_train_selected = df_train[selected_columns]
df_val_selected = df_val[selected_columns]
df_test_selected = df_test[selected_columns]

# Display the first few rows of each selected DataFrame
print("Training set:")
print(df_train_selected.head())

print("\nValidation set:")
print(df_val_selected.head())

print("\nTest set:")
print(df_test_selected.head())

# Print the shape of each selected DataFrame
print(f"\nTraining set shape: {df_train_selected.shape}")
print(f"Validation set shape: {df_val_selected.shape}")
print(f"Test set shape: {df_test_selected.shape}")

Training set:
     K          Rem   prandtl    nusselt
0  0.6  2795.874321  0.709843  61.165728
1  0.6  2795.874321  0.709843  52.824696
2  0.6  2310.732084  0.709842  50.527730
3  0.6  2795.776244  0.709842  59.596338
4  0.6  2795.776244  0.709841  54.687451

Validation set:
     K          Rem   prandtl    nusselt
0  0.6  2796.592710  0.709843  61.165728
1  0.6  2796.592710  0.709843  52.824696
2  0.6  2446.261529  0.709842  50.527730
3  0.6  2128.077694  0.709842  48.422178
4  0.6  1909.194425  0.709842  46.485070

Test set:
     K          Rem   prandtl    nusselt
0  0.6  2827.361047  0.709843  55.340421
1  0.6  2827.361047  0.709843  52.824696
2  0.6  2356.426510  0.709842  50.527730
3  0.6  2038.344639  0.709842  48.422178
4  0.6  1885.685153  0.709842  48.421871

Training set shape: (12300, 4)
Validation set shape: (17178, 4)
Test set shape: (12210, 4)


In [25]:
# Input columns and regression target shared by both splits
feature_cols = ['K', 'Rem', 'prandtl']
target_col = 'nusselt'

# Training split
X_train, y_train = df_train_selected[feature_cols], df_train_selected[target_col]

# Test split
X_test, y_test = df_test_selected[feature_cols], df_test_selected[target_col]

### Nusselt Function

Structure for Nusselt number: 

$Nu = C \cdot Re^n \cdot Pr^m$

Result proposed by RdlS:

 $Nu = 0.5 \cdot S^{-0.2} \cdot Re^{0.64} \cdot Pr$


We can check how well the data fit this equation by measuring the MSE between the ground-truth data and the values predicted by the equation.

In [5]:
from gplearn.genetic import SymbolicRegressor
from gplearn.functions import make_function
from gplearn.fitness import make_fitness
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.random import check_random_state
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
from mpl_toolkits.mplot3d import Axes3D

In [23]:
# Auxiliary functions
# Assuming Eq. 1 holds:
# c1 * S^(-0.2) * Re^(0.64) * Pr
def pot1(S, Re, Pr, c1, exp1, exp2):
    """Evaluate c1 * S**exp1 * Re**exp2 * Pr where all gates pass, else 0.

    The product is kept only for elements where:
      * S, Re and Pr are strictly positive,
      * exp1 equals -0.2 and exp2 equals 0.64 (exact comparison),
      * c1 lies strictly between 0 and 1.
    Every other element of the result is 0.

    NOTE(review): the exponent gate uses exact float equality, so unless the
    caller feeds exactly -0.2 and 0.64 this returns 0 — confirm that this is
    the intended behaviour when used as a GP primitive.
    """
    positive_inputs = (S > 0) & (Re > 0) & (Pr > 0)
    expected_exponents = (exp1 == -0.2) & (exp2 == 0.64)
    coefficient_in_range = (c1 > 0) & (c1 < 1)
    active = positive_inputs & expected_exponents & coefficient_in_range

    # The product is computed for every element (np.where needs both branches),
    # so floating-point warnings from the discarded entries are silenced.
    with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
        correlation = c1 * np.power(S, exp1) * np.power(Re, exp2) * Pr

    return np.where(active, correlation, 0)

# Wrap pot1 as a six-argument custom function usable in a gplearn function_set
pot1_fn = make_function(function=pot1, name='pot1', arity=6)


In [22]:
# Start of a function call such as "add(" (captures the function name).
_CALL_RE = re.compile(r"([A-Za-z_]\w*)\(")
# Placeholder feature names X0, X1, ... matched as whole tokens only.
_FEATURE_RE = re.compile(r"\bX(\d+)\b")


def _matching_paren(text, open_idx):
    """Return the index of the ')' closing the '(' at open_idx, or -1 if unbalanced."""
    depth = 0
    for idx in range(open_idx, len(text)):
        if text[idx] == "(":
            depth += 1
        elif text[idx] == ")":
            depth -= 1
            if depth == 0:
                return idx
    return -1


def _split_top_level(args_text):
    """Split an argument list on the commas that are not nested inside parentheses."""
    parts, depth, start = [], 0, 0
    for idx, ch in enumerate(args_text):
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        elif ch == "," and depth == 0:
            parts.append(args_text[start:idx].strip())
            start = idx + 1
    parts.append(args_text[start:].strip())
    return parts


def _wrap(operand):
    """Parenthesise an operand of ** unless it is a single token, call or group."""
    open_idx = operand.find("(")
    if open_idx < 0:
        atomic = " " not in operand
    else:
        atomic = (" " not in operand[:open_idx]
                  and _matching_paren(operand, open_idx) == len(operand) - 1)
    # A leading minus binds looser than **, so negative constants need parentheses.
    if atomic and not operand.startswith("-"):
        return operand
    return f"({operand})"


def _format_call(name, args):
    """Render one call, given its already rewritten arguments."""
    infix = {"add": "+", "sub": "-", "mul": "*", "div": "/"}
    if name in infix and len(args) == 2:
        return f"({args[0]} {infix[name]} {args[1]})"
    if name == "pow" and len(args) == 2:
        return f"{_wrap(args[0])} ** {_wrap(args[1])}"
    if name == "pot1" and len(args) == 1:
        return f"{_wrap(args[0])} ** (-0.6)"
    if name == "pot2" and len(args) == 1:
        return f"5 * {_wrap(args[0])} ** (-0.23)"
    # Anything else (log, sqrt, other arities, ...) stays in function form.
    return f"{name}({', '.join(args)})"


def _rewrite_calls(text):
    """Rewrite every function call found in text, innermost arguments first."""
    pieces, cursor = [], 0
    while cursor < len(text):
        match = _CALL_RE.search(text, cursor)
        if match is None:
            break
        open_idx = match.end() - 1
        close_idx = _matching_paren(text, open_idx)
        if close_idx < 0:
            break  # unbalanced parentheses: leave the remainder untouched
        pieces.append(text[cursor:match.start()])
        inner = text[open_idx + 1:close_idx]
        args = [_rewrite_calls(arg) for arg in _split_top_level(inner)]
        pieces.append(_format_call(match.group(1), args))
        cursor = close_idx + 1
    pieces.append(text[cursor:])
    return "".join(pieces)


def rewrite(expression):
    """Convert a prefix-notation program string into infix mathematical notation.

    Parameters
    ----------
    expression : str or object
        Program text such as ``"add(mul(X0, X1), X2)"``. Non-string objects
        are converted with ``str()`` first.

    Returns
    -------
    str
        The expression with ``add``/``sub``/``mul``/``div`` rendered as
        parenthesised infix operators, ``pow(a, b)`` as ``a ** b``,
        single-argument ``pot1(x)`` as ``x ** (-0.6)`` and single-argument
        ``pot2(x)`` as ``5 * x ** (-0.23)``. Other calls are kept in function
        form with their arguments rewritten. The placeholders X0, X1 and X2
        are renamed to S, Re and Pr.

    Notes
    -----
    Calls are parsed with balanced-parenthesis matching, so nested
    expressions are handled; flat regular expressions cannot do that.
    NOTE(review): a ``pot1`` call with more than one argument is left in
    function form — confirm how it should be rendered.
    """
    feature_names = {"0": "S", "1": "Re", "2": "Pr"}

    infix_text = _rewrite_calls(str(expression))

    # Rename placeholder variables (adjust the mapping to the model's features).
    # Whole-token matching keeps e.g. X10 from being mangled by the X1 rule.
    return _FEATURE_RE.sub(
        lambda m: feature_names.get(m.group(1), m.group(0)), infix_text
    )

In [31]:
# Train the symbolic-regression model

# Search hyperparameters, gathered in one place
gp_settings = dict(
    population_size=5000,
    generations=20,
    stopping_criteria=0.01,
    p_crossover=0.7,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.05,
    p_point_mutation=0.1,
    max_samples=0.9,
    verbose=1,
    parsimony_coefficient=0.01,
    random_state=123,
    function_set=['add', 'sub', pot1_fn],
    feature_names=['S', 'Re', 'Pr'],
)

# Model
model = SymbolicRegressor(**gp_settings)
model.fit(X_train, y_train)

# Get the symbolic expression (the fitted program) and show it
expression = model._program
print("Expresión:", expression)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0   318.95          1492.75       55          54.2685          53.8009     51.09m
   1    25.59          449.767       51          52.2434          52.5928      4.42m
   2    30.21          430.541       79          49.8187          49.2808      4.39m
   3    37.25          675.698       45          43.0089          42.4004      4.56m
   4    47.68          759.365      173          33.2133          31.6327      4.42m
   5    66.08          807.583      121          20.7992          20.9181      4.43m
   6    98.70            835.2      215          19.1253          18.4729      4.91m
   7   141.42          826.949      151          18.5038          18.8174      5.05m
   8   145.90          757.701      131          18.4961          18.2871  

In [28]:
# Show the symbolic expression currently held in `expression`.
# NOTE(review): the saved output below contains log/sqrt/mul/div, none of which
# are in the function_set of the training cell shown above — it looks like it
# came from a different run; confirm by re-running the notebook top to bottom.
print("Expresión:", expression)

Expresión: sub(log(log(X2)), mul(log(log(X2)), add(log(sqrt(X0)), sub(log(log(log(log(X2)))), mul(log(log(X2)), add(log(log(log(log(X2)))), sqrt(div(mul(mul(mul(mul(X1, sqrt(log(mul(mul(X1, log(X2)), sqrt(-0.170))))), sqrt(-0.170)), sqrt(log(mul(mul(X1, sqrt(sqrt(-0.170))), log(X0))))), sqrt(-0.170)), sqrt(sqrt(X0))))))))))
