In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from matplotlib.ticker import ScalarFormatter, NullFormatter
from sklearn.linear_model import LinearRegression

from lineartree import LinearTreeRegressor

# Component Model Surrogate

In [2]:
# Load the component-model samples and split them roughly 80/20 into
# training and test sets.
df = pd.read_csv('data/component_data_100000.csv')

# A seeded generator keeps the split (and therefore every fitted model,
# metric and plot below) identical after Restart Kernel -> Run All.
# Each row is assigned to the training set independently with probability
# 0.8, so the 80/20 proportion is approximate rather than exact.
mask = np.random.default_rng(42).random(len(df)) < 0.8
df_train, df_test = df[mask], df[~mask]

# Feature columns and the single target column of the surrogate.
inputs = ['VCSTR', 'ν', 'ξ', 'TOF', 'mol_metal']
outputs = ['H2Yield']
dfin = df_train[inputs]
dfout = df_train[outputs]

# Plain NumPy arrays for the estimator. `outputs` is a one-element list, so
# `y` is two-dimensional with a trailing axis of length 1.
x = dfin.values
y = dfout.values

In [5]:
# Fit the component-model surrogate: a linear tree that uses
# scikit-learn's LinearRegression as its base estimator.
component_surrogate = LinearTreeRegressor(
    base_estimator=LinearRegression(),
    criterion='mae',
    max_depth=20,
    min_samples_leaf=0.01,
    n_jobs=16,
)
component_surrogate.fit(x, y)

# Quick diagnostics, one per line: the feature importances, then the number
# of leaves (`_leaves` is a private attribute of the fitted estimator).
print(
    component_surrogate.feature_importances_,
    len(component_surrogate._leaves),
    sep='\n',
)

KeyboardInterrupt: 

In [None]:
# Visualise the fitted component surrogate. The call is the last expression of
# the cell, so whatever `plot_model()` returns is shown as the cell output.
# NOTE(review): presumably this renders the tree's split structure - confirm
# against the installed lineartree version.
component_surrogate.plot_model()

In [None]:
# Parity plot for the component surrogate on the held-out rows.
x_test = df_test[inputs].values
y_test = df_test[outputs].values
y_pred = component_surrogate.predict(x_test)

# One explicit Axes on a 200-dpi figure.
axes = plt.figure(dpi=200).add_subplot()

# Faint markers so that dense regions show up as darker areas.
axes.scatter(y_test, y_pred, s=10, alpha=0.1, color='blue')

# y = x reference; it extends far beyond the visible window set below.
parity_span = [-20, 10000]
axes.plot(parity_span, parity_span, color='red')

axes.set_xlabel('Actual yield (mol $H_2$/mol Metal*sec)')
axes.set_ylabel('Predicted yield (mol $H_2$/mol Metal*sec)')

# Square window with identical limits on both axes so the parity line is
# drawn at 45 degrees.
axes.set_xlim([0, 1.2])
axes.set_ylim([0, 1.2])
axes.set_aspect('equal', adjustable='box')
axes.grid()

In [None]:
# Persist the fitted component surrogate to models/component_surrogate.json.
# NOTE(review): `write_to_json` is not defined in this notebook; presumably it
# comes from the installed lineartree build and expects the `models/`
# directory to exist - confirm.
component_surrogate.write_to_json('models/component_surrogate.json')

# Costing Model Surrogate

In [None]:
# Load the costing-model samples and split them roughly 80/20 into training
# and test sets. Note that this rebinds `df`, `df_train`, `df_test`, `inputs`,
# `outputs`, `x` and `y` from the component section above.
df = pd.read_csv('data/cost_data_100000.csv')

# A seeded generator keeps the split reproducible across kernel restarts.
# Each row is assigned to the training set independently with probability
# 0.8, so the 80/20 proportion is approximate rather than exact.
mask = np.random.default_rng(42).random(len(df)) < 0.8
df_train, df_test = df[mask], df[~mask]

# Feature columns and the single target column of the surrogate.
inputs = ['VCSTR', 'StationCapacity', 'CatalystMass', 'H2Yield', 'Temperature', 'Pressure', 'CatalystPrice']
outputs = ['Cost']
dfin = df_train[inputs]
dfout = df_train[outputs]

# Plain NumPy arrays for the estimator. `outputs` is a one-element list, so
# `y` is two-dimensional with a trailing axis of length 1.
x = dfin.values
y = dfout.values

In [None]:
# Fit the costing-model surrogate: a linear tree that uses scikit-learn's
# LinearRegression as its base estimator.
costing_surrogate = LinearTreeRegressor(
    max_depth = 20,
    min_samples_leaf = 0.003,
    base_estimator = LinearRegression(),
    n_jobs = 16,
    criterion = 'msle'
)
costing_surrogate.fit(x, y)

# Diagnostics for the model fitted in THIS cell. The previous version printed
# `component_surrogate` here (copy-paste slip), which reported the wrong
# model and fails outright if the component section has not been run.
# `_leaves` is a private attribute of the fitted estimator.
print(costing_surrogate.feature_importances_)
print(len(costing_surrogate._leaves))

In [None]:
# Log-log parity plot for the costing surrogate on the held-out rows.
x_test = df_test[inputs].values
y_pred = costing_surrogate.predict(x_test)
y_test = df_test[outputs].values

plt.figure(dpi=300)
plt.scatter(y_test, y_pred, alpha=0.1, s = 10, color = 'blue')

# y = x reference line.
plt.plot([0,300],[0,300], color='red')

# Raw strings: `\$` is mathtext for a literal dollar sign, but in a normal
# string literal it is an invalid escape sequence (DeprecationWarning, and a
# SyntaxWarning from Python 3.12). The runtime text is unchanged.
plt.xlabel(r'Actual Cost (\$/kg $H_2$)')
plt.ylabel(r'Predicted Cost (\$/kg $H_2$)')

# Square window with identical, strictly positive limits (required for the
# logarithmic scales applied below).
axes = plt.gca()
axes.set_xlim([10,250])
axes.set_ylim([10,250])
axes.set_aspect('equal', 'box')

# Make both axes logarithmic.
axes.set_xscale('log')
axes.set_yscale('log')

# Label major and minor ticks with plain numbers instead of powers of ten.
axes.xaxis.set_major_formatter(ScalarFormatter())
axes.xaxis.set_minor_formatter(ScalarFormatter())
axes.yaxis.set_major_formatter(ScalarFormatter())
axes.yaxis.set_minor_formatter(ScalarFormatter())

# Turn the x tick labels (major and minor) to 270 degrees so that the dense
# log-scale labels do not overlap.
plt.xticks(rotation=270)
plt.setp(axes.xaxis.get_minorticklabels(), rotation=270)

# Solid grid lines on both major and minor ticks.
plt.grid(True, which="both", ls="-")
plt.show()


In [None]:
# Persist the fitted costing surrogate to models/costing_surrogate.json.
# NOTE(review): `write_to_json` is not defined in this notebook; presumably it
# comes from the installed lineartree build and expects the `models/`
# directory to exist - confirm.
costing_surrogate.write_to_json('models/costing_surrogate.json')

# Nanoparticle Model Surrogate

In [None]:
# Load the nanoparticle dataset and append numeric encodings of the metal in
# one chained expression. Column order matches keyword order:
# is_Pd, is_Cu, is_Pt, Metal_int.
df = pd.read_csv('data/combined-nanoparticle-data.csv').assign(
    # 0/1 indicators: 1 wherever the `Metal` string contains the symbol.
    is_Pd=lambda frame: frame['Metal'].str.contains('Pd').astype(int),
    is_Cu=lambda frame: frame['Metal'].str.contains('Cu').astype(int),
    is_Pt=lambda frame: frame['Metal'].str.contains('Pt').astype(int),
    # Single integer code per metal. Unlike the indicators above, this needs
    # an exact match; any other `Metal` value maps to NaN.
    Metal_int=lambda frame: frame['Metal'].map({'Pd': 0, 'Cu': 1, 'Pt': 2}),
)

In [None]:
# Preview the first rows of `df`, including the derived is_* and Metal_int
# columns. The loading cell must have been run in the current kernel first;
# otherwise `df` is undefined and this cell raises NameError.
df.head()

NameError: name 'df' is not defined

In [None]:
# Feature columns and the single target column of the nanoparticle surrogate.
inputs = ['Metal_int', 'T_value (K)', 'P_value (bar)', 'Nanoparticle Diameter (nm)']

outputs = ['TOF_Data (mol H2/mol Pd*sec)']

# Keep only the rows whose particle diameter is 3, 4 or 5 nm.
df = df[df['Nanoparticle Diameter (nm)'].isin([3, 4, 5])]

# Build the per-metal masks AFTER filtering so they have the same length and
# row order as `x` and `y` below. They were previously taken from the
# unfiltered frame, so indexing the NumPy arrays with them failed (or picked
# the wrong rows) as soon as the diameter filter dropped anything.
# `.to_numpy()` makes them purely positional boolean arrays.
Cu_mask = (df['is_Cu'] == 1).to_numpy()
Pd_mask = (df['is_Pd'] == 1).to_numpy()
Pt_mask = (df['is_Pt'] == 1).to_numpy()

dfin = df[inputs]
dfout = df[outputs]

# Plain NumPy arrays for the estimator. `outputs` is a one-element list, so
# `y` is two-dimensional with a trailing axis of length 1.
x = dfin.values
y = dfout.values

In [None]:
# Fit the nanoparticle surrogate: a linear tree that uses scikit-learn's
# LinearRegression as its base estimator. Here `min_samples_leaf` is an
# absolute count (10), unlike the fractional values used for the other
# surrogates.
surrogate = LinearTreeRegressor(
    base_estimator=LinearRegression(),
    criterion='msle',
    max_depth=12,
    min_samples_leaf=10,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
surrogate.fit(x, y)

In [None]:
# Log-log parity plot for the nanoparticle surrogate, coloured by metal.
# NOTE: `x_test`/`y_test` are the very arrays the model was fitted on, so this
# is an in-sample plot, not a held-out evaluation.
x_test = x

y_pred = surrogate.predict(x_test)
y_test = y

plt.figure(dpi=300)

# Rebuild the per-metal masks from the current (diameter-filtered) `df`, the
# frame `x` and `y` were extracted from, so that every mask lines up with the
# arrays row for row. Masks taken before the filtering step have the wrong
# length for boolean indexing.
for metal_label, indicator_column, marker_colour in (
    ("Cu", 'is_Cu', 'red'),
    ("Pd", 'is_Pd', 'blue'),
    ("Pt", 'is_Pt', 'green'),
):
    metal_mask = (df[indicator_column] == 1).to_numpy()
    plt.scatter(
        y_test[metal_mask],
        y_pred[metal_mask],
        alpha=0.1,
        s = 10,
        label = metal_label,
        color=marker_colour,
    )

# Reference lines. Their endpoints are strictly positive because the axes are
# switched to log scale below; negative endpoints are only drawn thanks to
# matplotlib's clipping of non-positive values and come out slightly off.
reference_x = np.array([1e-3, 1e4])
plt.plot(reference_x, reference_x, color='red')        # y = x (parity)
plt.plot(reference_x, 10 * reference_x, color='blue')  # y = 10 x
plt.plot(reference_x, reference_x / 10, color='blue')  # y = x / 10

plt.xlabel('Actual TOF (mol $H_2$/mol Metal*sec)')
plt.ylabel('Predicted TOF (mol $H_2$/mol Metal*sec)')

# Square window with identical, strictly positive limits.
axes = plt.gca()
axes.set_xlim([10**-3, 10**3.5])
axes.set_ylim([10**-3, 10**3.5])
axes.set_aspect('equal', 'box')
plt.grid()

axes.legend()

# Make both axes logarithmic.
axes.set_xscale('log')
axes.set_yscale('log')

In [None]:
# Persist the fitted nanoparticle surrogate to models/nanoparticle_surrogate.json.
# NOTE(review): `write_to_json` is not defined in this notebook; presumably it
# comes from the installed lineartree build and expects the `models/`
# directory to exist - confirm.
surrogate.write_to_json('models/nanoparticle_surrogate.json')