In [3]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import ray
from ray import tune
import matplotlib.pyplot as plt

In [4]:
#Loading training set into dataframe
df = pd.read_csv('./data/TOC_WF_LMX_2K.csv')
df.head()

Unnamed: 0,Depth/Thickness(m),GR(API),AC(us/m),RT(ohm×m),TOC(%),Journal,Author,Well,Area,DOI1,DOI2,Unnamed: 11,Unnamed: 12
0,2490.0,29.06525,65.23841,78.04103,1.72428,Oil & Gas Geology (Chinese Edition)/Marine Pet...,"Shen et al., 2021/Nie et., 2021",N201,Changning/South Sichuan,10.11743/ogg20210109,10.3969/j.issn.1672-9854.2021.01.005,,
1,2490.125,30.90459,66.23507,99.6852,1.58637,Oil & Gas Geology (Chinese Edition)/Marine Pet...,"Shen et al., 2021/Nie et., 2022",N201,Changning/South Sichuan,10.11743/ogg20210109,10.3969/j.issn.1672-9854.2021.01.005,,
2,2490.25,33.1118,67.23173,112.66383,1.58637,Oil & Gas Geology (Chinese Edition)/Marine Pet...,"Shen et al., 2021/Nie et., 2023",N201,Changning/South Sichuan,10.11743/ogg20210109,10.3969/j.issn.1672-9854.2021.01.005,,
3,2490.375,32.74393,68.22838,116.16438,1.51742,Oil & Gas Geology (Chinese Edition)/Marine Pet...,"Shen et al., 2021/Nie et., 2024",N201,Changning/South Sichuan,10.11743/ogg20210109,10.3969/j.issn.1672-9854.2021.01.005,,`
4,2490.5,31.27246,67.23173,102.78249,1.86219,Oil & Gas Geology (Chinese Edition)/Marine Pet...,"Shen et al., 2021/Nie et., 2025",N201,Changning/South Sichuan,10.11743/ogg20210109,10.3969/j.issn.1672-9854.2021.01.005,,


In [5]:
df = df.drop(columns='Depth/Thickness(m)')
df = df.drop(columns='Journal')
df = df.drop(columns='Author')
df = df.drop(columns='Well')
df = df.drop(columns='Area')
df = df.drop(columns='DOI1')
df = df.drop(columns='DOI2')
df = df.drop(columns='Unnamed: 11')
df = df.drop(columns='Unnamed: 12')

In [6]:
#This is an example, taking TOC as label to predict
label = df['TOC(%)']
train = df.drop('TOC(%)', axis=1) # we don't need it in this project
label.shape, train.shape

((2554,), (2554, 3))

In [7]:
#Function to min-max normalize
def normalize(df, cols):
    """
    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a DataFrame with normalized specified features
    """
    result = df.copy() # do not touch the original df
    for feature_name in cols:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        if max_value > min_value:
            result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

In [9]:
#Normalizing dataset
new_train = normalize(train,train.columns)
new_train

Unnamed: 0,GR(API),AC(us/m),RT(ohm×m)
0,0.048273,0.171642,0.049639
1,0.051515,0.174820,0.063547
2,0.055404,0.177998,0.071888
3,0.054756,0.181176,0.074137
4,0.052163,0.177998,0.065538
...,...,...,...
2549,0.185708,0.642884,0.370392
2550,0.180394,0.635238,0.413156
2551,0.191022,0.635238,0.514045
2552,0.183051,0.635238,0.413156


In [10]:
label.isnull().values.any()
new_train.isnull().values.any()

False

In [11]:
#Test Nan and fill with mean
for column in list(new_train.columns[ new_train.isnull().sum() > 0]):
    mean_val = new_train[column].mean()
    new_train[column].fillna(mean_val, inplace=True)

In [12]:
train = np.array(new_train)
label = np.array(label)

In [13]:
train = torch.tensor(train, dtype=torch.float32)
label = torch.tensor(label, dtype=torch.float32)

In [14]:
label.shape, train.shape

(torch.Size([2554]), torch.Size([2554, 3]))

In [15]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [20]:
from utils.half_attenuate import HiddenSizesGenerator

class WellLogNet(nn.Module):
    def __init__(self, hyperparams={}):
        super().__init__()

        # Fix the hyper-paremeters in the input and output layers

        layers = []
        in_size = hyperparams["input_dim"]
        out_size = hyperparams["output_dim"]
        sizes = HiddenSizeGenerator(hyperparams["hidden_sizes"])
        
        for i, size in enumerate(sizes[:-1]):
            layers.append(nn.Linear(in_size, size))
            in_size = size
        
        layers.append(nn.Linear(in_size, out_size))

        self.layers = nn.Sequential(*layers)

    def forward(self, x):

        # Here, we use relu function to activate except for the last layer

        for layer in self.layers[:-1]:
            x = F.relu(layer(x))
        return self.layers[-1](x)

class RONet(nn.Module):
    def __init__(self, hyperparams):
        super().__init__()

        # Fix the hyper-paremeters in the input and output layers

        layers = []
        in_size = hyperparams["input_dim"]
        out_size = hyperparams["output_dim"]
        sizes = HiddenSizeGenerator(hyperparams["hidden_sizes"])

        for i, size in enumerate(sizes[:-1]):
            layers.append(nn.Linear(in_size, size))
            in_size = size

        layers.append(nn.Linear(in_size, out_size))

        self.layers = nn.Sequential(*layers)

    def forward(self, x):

        # Here, we use relu function to activate except for the last layer

        for layer in self.layers[:-1]:
            x = F.relu(layer(x))
        return self.layers[-1](x)
    
class BiasNet(nn.Module):
    def __init__(self, hyperparams):
        super().__init__()

        # Fix the hyper-paremeters in the input and output layers

        layers = []
        in_size = hyperparams["input_dim"]
        out_size = hyperparams["output_dim"]
        sizes = HiddenSizeGenerator(hyperparams["hidden_sizes"])

        for i, size in enumerate(sizes[:-1]):
            layers.append(nn.Linear(in_size, size))
            in_size = size
        
        layers.append(nn.Linear(in_size, out_size))

        self.layers = nn.Sequential(*layers)

    def forward(self, x):

        # Here, we use relu function to activate except for the last layer

        for layer in self.layers[:-1]:
            x = F.relu(layer(x))
        return self.layers[-1](x)

class PasseyNet(nn.Module):
    def __init__(self):
        super(PasseyNet, self).__init__()
        self.well_log_net = WellLogNet()
        self.ro_net = RONet()
        self.bias_net = BiasNet()

    def set_well_log_net_hyperparams(self, hp):
        self.well_log_net.hyperparams = hp
    
    def set_ro_net_hyperparams(self, hp):
        self.ro_net.hyperparams = hp
    
    def set_bias_net_hyperparams(self, hp):
        self.bias_net.hyperparams = hp

    def forward(self, x):
        well_log_matrix = self.well_log_net(x)
        ro_matrix = self.ro_net(x)
        bias_matrix = self.bias_net(x)
        
        output = torch.matmul(torch.matmul(x, well_log_matrix.T), 
                              torch.pow(10.0, ro_matrix)) + bias_matrix
        return output

ImportError: cannot import name 'get_hidden_sizes' from 'utils' (unknown location)

In [17]:
# Set the hyperparameters for search in the Net 
config = {
  "well_log_net": {
      "input_dim": 3,
     "hidden_sizes": tune.choice([256, 512, 1024]),
     "output_dim": 3 
  },
  
  "ro_net": {
      "input_dim": 3,
     "hidden_sizes": tune.choice([256, 512, 1024]),
     "output_dim": 1
  },
  
  "bias_net": {
      "input_dim": 3,
     "hidden_sizes": tune.choice([256, 512, 1024]),
     "output_dim": 1
  }
}

In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error 
from sklearn.model_selection import KFold
import torch

torch.autograd.set_detect_anomaly(True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 

# Define the NAS by ray
def train_model(config):

    # Initializing sub-models by config

    well_log_net = WellLogNet()  
    ro_net = RONet()
    bias_net = BiasNet()

    model = PasseyNet() # Initializing defaultly
    model.set_well_log_net_hyperparams()
    model.set_ro_net_hyperparams()
    model.set_bias_net_hyperparams()
    model.to(device)

    train = train.to(device)
    label = label.to(device)

    # Initializing optimizer and loss functions
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    score = -val_mse(model)

    return score

# Store loss、mae、mse、R2、mape
loss_list = []
mae_list = []
mse_list = []
r2_list = []
r2_adjust_list = []
mape_list = []

for i in range(100):

    kf = KFold(n_splits=5, shuffle=True, random_state=42) # 5-fold

    for fold ,(train_idx, val_idx) in enumerate(kf.split(train)):

        # Dividing training and validating datasets
        train_x, train_y = train[train_idx], label[train_idx]
        val_x, val_y = train[val_idx], label[val_idx]

        for epoch in range(100):

            output = model(train) # Training
            output = output.cpu() # Return output to CPU
            label = label.cpu() # Return label to CPU
            
            loss = criterion(output, label)
            optimizer.zero_grad()   
            loss.backward()        
            optimizer.step()

            # Validating
            with torch.no_grad():
                val_output = model(val_x)
                val_output = val_output.cpu()
                val_y = val_y.cpu()
                val_loss = criterion(val_output, val_y)

                val_pred = val_output.detach().numpy()
                val_true = val_y.detach().numpy()

                val_mae = mean_absolute_error(val_true, val_pred)
                val_mse = mean_squared_error(val_true, val_pred)

                val_r2 = r2_score(val_true, val_pred)
                val_mape = mean_absolute_percentage_error(val_true, val_pred)

                val_adjust_r2 = 1-((1-val_r2)*(len(val_x)-1))/(len(val_x)-6-1)
                
                # 打印结果  
                # print(f'Epoch: {epoch+1:02d}, Loss: {loss:.4f}, R2: {val_r2:.4f}, MAE: {val_mae:.4f}, MSE: {val_mse:.4f}, MAPE: {val_mape:.4f}')
        
            # 存储loss、mae和mse
            loss_list.append(val_loss.item())
            mae_list.append(val_mae)
            mse_list.append(val_mse)
            r2_list.append(val_r2)
            r2_adjust_list.append(val_adjust_r2)
            mae_list.append(val_mape)

# Run tune
ray.shutdown()
ray.init(num_gpus=1)

analysis = tune.run(
    train_model,
    config=config,
    num_samples=10, # Number of tray
    metric="score",
    mode="max",
    resources_per_trial={"gpu": 1}  
)

# Best config   
best_config = analysis.get_best_config()

# Generate model by the best config
well_log_net = WellLogNet(best_config["well_log_net"])
ro_net = RONet(best_config["ro_net"])
bias_net = BiasNet(best_config["bias_net"])
best_model = PasseyNet(well_log_net, ro_net, bias_net)


2023-08-08 20:58:37,065	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2023-08-08 20:58:39,302	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2023-08-08 20:58:49
Running for:,00:00:10.26
Memory:,11.1/15.9 GiB

Trial name,# failures,error file
train_model_4b5ec_00000,1,"C:\Users\Administrator\ray_results\train_model_2023-08-08_20-58-39\train_model_4b5ec_00000_0_hidden_sizes=1024,hidden_sizes=512,hidden_sizes=512_2023-08-08_20-58-39\error.txt"

Trial name,status,loc,bias_net/hidden_size s,ro_net/hidden_sizes,well_log_net/hidden_ sizes
train_model_4b5ec_00001,PENDING,,512,512,512
train_model_4b5ec_00002,PENDING,,256,256,256
train_model_4b5ec_00003,PENDING,,512,512,1024
train_model_4b5ec_00004,PENDING,,256,512,1024
train_model_4b5ec_00005,PENDING,,512,512,256
train_model_4b5ec_00006,PENDING,,512,1024,1024
train_model_4b5ec_00007,PENDING,,512,512,256
train_model_4b5ec_00008,PENDING,,512,512,512
train_model_4b5ec_00009,PENDING,,256,512,512
train_model_4b5ec_00000,ERROR,127.0.0.1:4656,1024,512,512


2023-08-08 20:58:44,212	ERROR tune_controller.py:911 -- Trial task failed for trial train_model_4b5ec_00000
Traceback (most recent call last):
  File "f:\Anaconda3\envs\jupyterlab\lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "f:\Anaconda3\envs\jupyterlab\lib\site-packages\ray\_private\auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "f:\Anaconda3\envs\jupyterlab\lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "f:\Anaconda3\envs\jupyterlab\lib\site-packages\ray\_private\worker.py", line 2493, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TypeError): [36mray::ImplicitFunc.train()[39m (pid=4656, ip=127.0.0.1, actor_id=c56cc040c0ef8cd94665089801000000, repr=train_model)
  File "python\ray\_raylet.pyx", line 1424, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", li

Trial name
train_model_4b5ec_00000
train_model_4b5ec_00001


[2m[36m(train_model pid=4656)[0m Caught sync error: Sync process failed: GetFileInfo() yielded path 'C:/Users/Administrator/ray_results/train_model_2023-08-08_20-58-39/train_model_4b5ec_00000_0_hidden_sizes=1024,hidden_sizes=512,hidden_sizes=512_2023-08-08_20-58-39/error.pkl', which is outside base dir 'C:\Users\Administrator\ray_results\train_model_2023-08-08_20-58-39\train_model_4b5ec_00000_0_hidden_sizes=1024,hidden_sizes=512,hidden_sizes=512_2023-08-08_20-58-39\'. Retrying after sleeping for 1.0 seconds...
[2m[36m(train_model pid=4656)[0m Caught sync error: Sync process failed: GetFileInfo() yielded path 'C:/Users/Administrator/ray_results/train_model_2023-08-08_20-58-39/train_model_4b5ec_00000_0_hidden_sizes=1024,hidden_sizes=512,hidden_sizes=512_2023-08-08_20-58-39/error.pkl', which is outside base dir 'C:\Users\Administrator\ray_results\train_model_2023-08-08_20-58-39\train_model_4b5ec_00000_0_hidden_sizes=1024,hidden_sizes=512,hidden_sizes=512_2023-08-08_20-58-39\'. Retry

ValueError: `C:\Users\Administrator\ray_results\train_model_2023-08-08_20-58-39\experiment_state-2023-08-08_20-58-39.json` must either be a path to an experiment checkpoint file, or a directory containing an experiment checkpoint file.

In [None]:
# Plot the 5-fold training results        
plt.plot(loss_list)
plt.title('Five-Fold Loss Curve')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

# 绘制MAE、MSE、MAPE curves
plt.plot(mae_list, label='MAE')
plt.plot(mse_list, label='MSE')
plt.plot(mae_list, label='MAE')
plt.title('MAE, MSE, MAPE Curve')  
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.legend()
plt.show()

# Plot R2、R2_adjust curves
plt.plot(r2_list, label='R2')
plt.plot(r2_adjust_list, label='R2_adjust')
plt.title('R2 & Adjusted R2 Curve')
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.legend()
plt.show()

In [None]:
#Loading training set into dataframe
test_df = pd.read_csv('./data/TOC_WF_LMX.csv')
test_df.head()

In [None]:
test_df = test_df.drop(columns='Depth/Thickness(m)')
test_df = test_df.drop(columns='Journal')
test_df = test_df.drop(columns='Author')
test_df = test_df.drop(columns='Well')
test_df = test_df.drop(columns='Area')
test_df = test_df.drop(columns='DOI1')
test_df = test_df.drop(columns='DOI2')
test_df = test_df.drop(columns='Unnamed: 11')
test_df = test_df.drop(columns='Unnamed: 12')

In [None]:
test_df.shape

In [None]:
test_df_last_30 = test_df.tail(30)
test_df_last_30.shape

In [None]:
#This is an example, taking TOC as label to predict
test_label = test_df_last_30['TOC(%)']
test_train = test_df_last_30.drop('TOC(%)', axis=1) # we don't need it in this project
test_label.shape, test_train.shape

In [None]:
# test_train = test_train.drop(columns='井名')
# test_train = test_train.drop(columns='Depth')

In [None]:
#Normalizing dataset
normalized_test_train = normalize(test_train, test_train.columns)
normalized_test_train

In [None]:
#Test Nan and fill with mean
for column in list(normalized_test_train.columns[ normalized_test_train.isnull().sum() > 0]):
    mean_val = normalized_test_train[column].mean()
    normalized_test_train[column].fillna(mean_val, inplace=True)

In [None]:
# 需要传入Tensor 
X_test = normalized_test_train.values # 转为numpy数组
X_test = torch.Tensor(X_test).to(device) # 转为Tensor


# 切换到训练模式
model.train() 

# 预测值 
y_pred = model(X_test)
y_pred = y_pred.cpu().detach().numpy()

# 真实值
y_true = test_label 

# 残差 
y_true = y_true.ravel()  
y_pred = y_pred.ravel()
resid = y_true - y_pred

# 绘图
plt.hist(resid, bins=20)
plt.title('Residuals Histogram')
plt.xlabel('Prediction Error')
plt.ylabel('Frequency')

# 均值和标准差
mean = resid.mean()
stddev = resid.std()

# 绘制竖直线
plt.axvline(mean, color='r')
plt.axvline(mean + 2*stddev, color='r', linestyle='--')
plt.axvline(mean - 2*stddev, color='r', linestyle='--')

plt.show()

In [None]:
# 绘制预测值实际值差距图

# well_name = test_df_last_30['井名']

x = range(len(y_pred))

plt.scatter(x, y_pred, label='Predicted')
plt.scatter(x, y_true, label='True')

plt.title('Prediction-True Comparison')
plt.ylabel('TOC')
plt.xlabel('Sample Number')
plt.legend()
plt.show()

In [None]:
plt.scatter(y_true, y_pred, label='Predicted')
plt.ylabel('Predicted TOC')
plt.xlabel('True TOC')

plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'k--', lw=2)
plt.title('Prediction-True Comparison')
plt.legend()
plt.show()