In [8]:
import sherpa

# helper functions
from transfer_learning import NeuralNet_sherpa_optimize
from dataset_loader import (
    data_loader,
    all_filter,
    get_descriptors,
    one_filter,
    data_scaler,
)

# modules
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

import os, sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

from tqdm import tqdm
from scipy.stats import pearsonr

import matplotlib.pyplot as plt

# --- Input location and compute device --------------------------------------
# CSV path is relative to the directory the notebook is launched from.
file_name = "data/CrystGrowthDesign_SI.csv"
base_path = os.getcwd()

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# --- Hyper-parameter search space --------------------------------------------
parameters = []

# Learning rate, sampled on a log scale.
parameters.append(
    sherpa.Continuous(name="lr", range=[0.0002, 0.1], scale="log")
)

# NOTE: a Discrete "Epoch" parameter over [10, 100] used to live here and is
# currently disabled.

# "H_l1" is an integer in [10, 300] (presumably a hidden-layer width -- confirm
# against NeuralNet_sherpa_optimize).
parameters.append(sherpa.Discrete(name="H_l1", range=[10, 300]))

# Activation candidates are passed as strings, not as nn classes.
parameters.append(
    sherpa.Choice(
        name="activate",
        range=[
            "nn.Hardswish",
            "nn.PReLU",
            "nn.ReLU",
            "nn.Sigmoid",
            "nn.LeakyReLU",
        ],
    )
)

# --- Search strategy and study -----------------------------------------------
# Random search capped at 10 trials; the study is configured so that a larger
# objective is better, and the dashboard is switched off.
algorithm = sherpa.algorithms.RandomSearch(max_num_trials=10)
study = sherpa.Study(
    parameters=parameters,
    algorithm=algorithm,
    lower_is_better=False,
    disable_dashboard=True,
)

"""
Data description.

    Descriptors:
        'void fraction', 'Vol. S.A.', 'Grav. S.A.', 'Pore diameter Limiting', 'Pore diameter Largest'
    Source task:
        'H2@100 bar/243K (wt%)'
    Target tasks:
        'H2@100 bar/130K (wt%)' 'CH4@100 bar/298 K (mg/g)' '5 bar Xe mol/kg' '5 bar Kr mol/kg'
"""

# Column groups: five structural descriptors, the source-task target and one
# target-task column.
descriptor_columns = [
    "void fraction",
    "Vol. S.A.",
    "Grav. S.A.",
    "Pore diameter Limiting",
    "Pore diameter Largest",
]
one_filter_columns = ["H2@100 bar/243K (wt%)"]
another_filter_columns = ["H2@100 bar/130K (wt%)"]

# Read the raw table from <base_path>/<file_name>.
data = data_loader(base_path, file_name)

# Pull out the source-task target and the descriptor columns.
one_property = one_filter(data, one_filter_columns)
descriptors = get_descriptors(data, descriptor_columns)

# Build float32 arrays: X is the descriptor matrix, y the flattened target with
# one entry per row of X.
X = np.array(descriptors.values, dtype=np.float32)
y = np.array(one_property.values, dtype=np.float32).reshape((X.shape[0],))

# Scale both. data_scaler is handed a 2-D input each time, so y is lifted to a
# single column for the call and flattened again afterwards.
X = data_scaler(X)
y = data_scaler(y[:, None]).reshape((len(X),))

# Transfer-trial subset (legacy). It stays inline in the notebook, rather than
# in a .py helper, because it depends on the `data` frame loaded above.
# A fixed seed keeps the 100-row sample reproducible.
data_small = data.sample(n=100, random_state=1)

another_property = one_filter(data_small, another_filter_columns)
descriptors_small = get_descriptors(data_small, descriptor_columns)

# Note: unlike X / y above, the small-sample arrays are not passed through
# data_scaler in this cell.
X_small = np.array(descriptors_small.values, dtype=np.float32)
y_small = np.array(another_property.values, dtype=np.float32).reshape(
    (X_small.shape[0],)
)

In [9]:
from Statistics_helper import stratified_cluster_sample
# Draw a stratified cluster sample of `data` for the source-task target.
# The meaning of the positional arguments 1 and 5 is defined in
# Statistics_helper -- TODO confirm (presumably a sample count and a cluster count).
df, t_1, t_2, y_1, y_2 = stratified_cluster_sample(
    1,
    data,
    descriptor_columns,
    one_filter_columns[0],
    5,
    net_out=True,
)
# The first returned item is indexable; keep only its first element.
df = df[0]
len(df)

13506

In [10]:
# Check which columns the sampled frame carries.
print(df.columns)

# Model inputs: the five descriptors plus the cluster label.
features = [
    "void fraction",
    "Vol. S.A.",
    "Grav. S.A.",
    "Pore diameter Limiting",
    "Pore diameter Largest",
    "Cluster",
]
# Target column, kept as a one-element list so the selection below stays 2-D.
interest = ["H2@100 bar/243K (wt%)"]
df.loc[:, interest].values

Index(['void fraction', 'Vol. S.A.', 'Grav. S.A.', 'Pore diameter Limiting',
       'Pore diameter Largest', 'H2@100 bar/243K (wt%)', 'Cluster'],
      dtype='object')


array([[1.4],
       [1.2],
       [1.5],
       ...,
       [1.7],
       [1.5],
       [1.7]])

In [11]:
from torch.utils.data import Dataset
from transfer_learning import MyDataset
from Statistics_helper import stratified_cluster_sample
# Draw the stratified cluster sample and keep the first element of the first
# returned item as the working frame.
df, t_1, t_2, y_1, y_2 = stratified_cluster_sample(
    1,
    data,
    descriptor_columns,
    one_filter_columns[0],
    5,
    net_out=True,
)
df = df[0]

# Target column name (a plain string here, not a list).
interest = one_filter_columns[0]

# Build the feature list as a NEW list instead of appending to
# descriptor_columns. Appending in place made this cell non-idempotent: every
# re-run added another "Cluster" entry, and the mutated list was then passed
# into stratified_cluster_sample above and in the earlier sampling cell.
features = [*descriptor_columns, "Cluster"]

# Wrap the frame in the project's Dataset and show its training inputs
# (one column per entry in `features`).
abc = MyDataset(df, interest, features)
abc.x_train

tensor([[-1.5635,  1.1383, -1.5535, -1.2260, -1.4663,  1.0000],
        [-1.9746,  0.5552, -1.7561, -1.0469, -1.3959,  1.0000],
        [-1.0702,  1.8004, -1.1994, -1.0187, -1.3334,  1.0000],
        ...,
        [-0.9057,  0.1045, -1.5281, -0.2928, -0.6847,  1.0000],
        [-1.3991,  0.3672, -1.5860, -0.8678, -1.0521,  1.0000],
        [-1.2346,  0.5466, -1.5167, -0.6699, -0.9739,  1.0000]])

In [12]:
# Iterate over the dataset in batches of 100.
a = torch.utils.data.DataLoader(abc, batch_size=100)

# The loop variables are named x_batch / y_batch so they do not overwrite the
# notebook-level `y` (the scaled target array prepared in the setup cell).
# NOTE(review): each iteration reassigns t_1, t_2, y_1, y_2, so after the loop
# they hold only the split of the LAST batch -- confirm this is intended.
for x_batch, y_batch in a:
    t_1, t_2, y_1, y_2 = train_test_split(x_batch, y_batch, test_size=.2)
    

In [13]:
# 90/10 train/validation split of the features and the target.
# The original unpacking used `df_val` twice, so the validation features were
# overwritten by the validation targets and no y_df_val was kept; the fourth
# name is now y_df_val. A fixed random_state (same seed as the data.sample call
# in the setup cell) makes the split reproducible across runs.
df_train, df_val, y_df_train, y_df_val = train_test_split(
    df[features],
    df[interest],
    test_size=.1,
    random_state=1,
)

# Attach the target to the training frame. Work on an explicit copy so the
# assignment does not write into a slice of `df` (avoids pandas'
# chained-assignment warning).
df_train = df_train.copy()
df_train[interest] = y_df_train

In [14]:
# Display the training frame: the feature columns plus the target column that
# the previous cell attached.
df_train

Unnamed: 0,void fraction,Vol. S.A.,Grav. S.A.,Pore diameter Limiting,Pore diameter Largest,Cluster,H2@100 bar/243K (wt%)
10443,0.738769,-1.283844,0.644296,1.064633,0.378130,0,9.0
4546,0.409872,0.051113,1.003662,-0.292802,-0.301793,4,6.6
11022,0.574321,-0.692191,0.484206,1.554818,0.933010,3,6.8
7105,-0.001248,0.809369,-0.082081,-0.528468,-0.856673,4,3.3
10818,0.738769,-1.268893,0.947272,0.904380,0.331239,0,8.9
...,...,...,...,...,...,...,...
171,-0.576817,1.437333,-0.927450,-0.886680,-1.052053,1,1.9
4404,0.574321,-0.442287,1.055751,0.037130,-0.067337,0,8.5
4286,-0.659041,1.755587,-0.352083,-1.046933,-1.169281,1,2.6
9816,0.409872,0.048977,0.860298,-0.453055,0.112413,4,4.7
