# In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
import pandas as pd
import numpy as np
import tqdm
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.metrics     import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# Read WLC_MUD_LOG_INTERPOLATED.csv and discard every column whose values are
# all missing (columns with at least one value are kept).
df = pd.read_csv('WLC_MUD_LOG_INTERPOLATED.csv').dropna(axis=1, how='all')

# Hand-picked input columns for each target this notebook can model.
# Keys are target column names; values are the columns fed to the model.
# Most lists contain the target's own column; 'TQA' and 'DXC' do not.
# "Unnamed: 0" is the column the rest of this script treats as depth.
target_to_feature = {
    "GR": [
        "GR", "ICGRC", "TH", "WOBA", "NEU", "DEXM",
        "DXC", "U", "TQA", "LTHDIGITAL", "ROP",
    ],
    "ROP": [
        "ROP", "DXC", "DEXM", "ROPA", "U", "LTHDIGITAL",
        "FRACTUREGRADNT", "BDTI", "KREV", "BDDI", "Unnamed: 0",
    ],
    "TQA": [
        "LTHDIGITAL", "WOBA", "PWPA", "KREV", "BDTI",
        "OVERBDNGRADNT", "Unnamed: 0.1", "DVER", "BDDI", "MTOA",
    ],
    "RPMT": [
        "RPMS", "RPMT", "MFIA", "SPPA", "C1C2", "ACS",
        "ARM48P", "ARM16P", "AC", "RSHA", "HKLX", "RMED",
    ],
    "BDTI": [
        "BDTI", "KREV", "Unnamed: 0", "BDDI", "DVER", "OVERBDNGRADNT",
        "PWPA", "FRACTUREGRADNT", "PPE", "MTOA", "LTHDIGITAL",
    ],
    "KREV": [
        "KREV", "BDTI", "Unnamed: 0", "BDDI", "DVER", "OVERBDNGRADNT",
        "PWPA", "FRACTUREGRADNT", "PPE", "MTOA", "LTHDIGITAL",
    ],
    "DXC": [
        "DEXM", "ROPA", "WOBA", "ROP", "U", "LTHDIGITAL",
        "ICGRC", "BDTI", "KREV", "FRACTUREGRADNT", "ETHA",
    ],
    "LTHDIGITAL": [
        "LTHDIGITAL", "TVA", "FRACTUREGRADNT", "PPE", "BDDI", "Unnamed: 0",
        "Unnamed: 0.1", "DVER", "PWPA", "KREV", "BDTI", "OVERBDNGRADNT",
    ],
    "PPE": [
        "PPE", "DVER", "BDDI", "Unnamed: 0", "OVERBDNGRADNT", "FRACTUREGRADNT",
        "PWPA", "KREV", "BDTI", "LTHDIGITAL", "MTOA",
    ],
    "SPPA": [
        "SPPA", "MFIA", "RPMT", "RPMS", "ETHA", "GWR",
        "U", "CALI", "K", "TQA", "TH", "MFOA",
    ],
}

# Target modelled in this run, its base feature list, and the look-ahead
# distance (in units of the depth column) between a feature row and its label.
TARGET_COL = "PPE"
FEATURE_COLS = target_to_feature[TARGET_COL]
lookahead = 50

# Rank columns by the absolute Pearson correlation with TARGET_COL, computed
# on rows that have no missing value in any column.
# Non-numeric columns are dropped *after* the row filter so the row set is the
# same as before; DataFrame.corr() on pandas >= 2.0 raises if any are present.
complete_numeric = df.dropna().select_dtypes(include="number")
corr_with_target = (
    complete_numeric.corr()[TARGET_COL]
    .abs()
    .sort_values(ascending=False)
)
print("Top features by |ρ|:\n", corr_with_target.head(12))


windows = [3, 10]
depth_col = "Unnamed: 0"

# One trailing rolling-mean frame per window size. Each window ends at the
# current row, and min_periods=1 lets the first rows average over whatever
# is available. Output columns are named "<feature>_rm<window>".
rolled_frames = [
    df[FEATURE_COLS]
    .rolling(window=span, min_periods=1)
    .mean()
    .add_suffix(f"_rm{span}")
    for span in windows
]

# Append the smoothed columns to df in window order and register them as
# additional model inputs after the base features.
new_cols = [name for frame in rolled_frames for name in frame.columns]
df = pd.concat([df, *rolled_frames], axis=1)
FEATURE_COLS = FEATURE_COLS + new_cols

FEATURE_COLS

import numpy as np

# Pair the features at row i with the target found `lookahead` depth units
# further along. Requires df, FEATURE_COLS and TARGET_COL to exist already.
depths = df['Unnamed: 0'].to_numpy()
X_mat = df[FEATURE_COLS].to_numpy()      # (n_samples, n_features)
y_vec = df[TARGET_COL].to_numpy()        # (n_samples,)
lookahead = 50

# j[i] is the first position whose depth is >= depths[i] + lookahead.
# NOTE(review): np.searchsorted presumes `depths` is sorted ascending - confirm.
j = np.searchsorted(depths, depths + lookahead)

# Rows whose look-ahead position falls past the last row have no label.
in_bounds = j < len(depths)
i1 = np.flatnonzero(in_bounds)           # surviving feature-row positions
j1 = j[i1]                               # their look-ahead positions

# Of those, discard pairs whose look-ahead target value is NaN.
not_nan = np.logical_not(np.isnan(y_vec[j1]))
i2, j2 = i1[not_nan], j1[not_nan]

# Features come from row i2, labels from row j2.
X, y = X_mat[i2], y_vec[j2]

print("Prepared X.shape =", X.shape, "   y.shape =", y.shape)


import numpy as np

block_size = 2000
n = len(X)

# Cut the samples into consecutive runs of `block_size` rows and alternate
# them: runs 0, 2, 4, ... go to training, runs 1, 3, 5, ... go to testing.
blocks = np.arange(n) // block_size
train_mask = blocks % 2 == 0

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[~train_mask], y[~train_mask]


from sklearn.pipeline             import Pipeline
from sklearn.ensemble            import HistGradientBoostingRegressor
from sklearn.metrics             import mean_squared_error

# 1) Hyper-parameters handed to HistGradientBoostingRegressor
hgb_params = dict(
    max_iter=500,            # upper bound on boosting iterations (trees)
    learning_rate=0.05,      # shrinkage applied to each tree
    max_depth=5,             # depth limit per tree
    random_state=42,         # fixed seed
    verbose=0,               # 0 = no progress output while fitting
    l2_regularization=1.0,   # L2 penalty, to curb overfitting
    # NOTE(review): early_stopping is left at the estimator default, so
    # whether this patience value is used depends on that default - confirm.
    n_iter_no_change=50,
)

# 2) Wrap the regressor in a one-step pipeline (tree-based model, so no
#    scaler step), then 3) fit it on the training split.
regressor = HistGradientBoostingRegressor(**hgb_params)
pipeline = Pipeline(steps=[('hgb', regressor)])
pipeline.fit(X_train, y_train)

# 4) Score the held-out split with mean squared error.
y_pred = pipeline.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Test MSE (HistGradientBoosting):", test_mse)


import numpy as np

# Predict on the train and test splits and store each prediction on the row
# it refers to (the look-ahead row j), in two separate columns of a copy of df.

# 0) Work on a copy so the original frame is left untouched
df3 = df.copy()

# 1) Raw arrays used to rebuild the (feature row i -> target row j) pairs
depths = df3['Unnamed: 0'].to_numpy()
X_mat = df3[FEATURE_COLS].to_numpy()
y_vec = df3[TARGET_COL].to_numpy()
lookahead = 50

# 2) j[i] = first position whose depth is >= depths[i] + lookahead
# NOTE(review): np.searchsorted presumes `depths` is sorted ascending - confirm.
j = np.searchsorted(depths, depths + lookahead)

# 3) Keep only i,j pairs where j is in bounds
in_bounds = j < len(depths)
i1 = np.nonzero(in_bounds)[0]
j1 = j[in_bounds]

# 4) Drop any pair whose target y[j] is NaN
not_nan = ~np.isnan(y_vec[j1])
i2 = i1[not_nan]
j2 = j1[not_nan]

# 5) Features are read at row i2, "future" targets at row j2
X = X_mat[i2]
y = y_vec[j2]

# 6) Alternating-block train/test mask over the retained pairs
block_size = 2000
blocks = np.arange(len(X)) // block_size
train_mask = (blocks % 2) == 0

# 7) Row positions for each split
i_train = i2[train_mask]    # feature rows in the training split
j_train = j2[train_mask]    # rows that receive pred_train_50m
i_test = i2[~train_mask]    # feature rows in the test split
j_test = j2[~train_mask]    # rows that receive pred_test_50m

# 8) Run the fitted model on each split
pred_train = pipeline.predict(X[train_mask])
pred_test = pipeline.predict(X[~train_mask])

# 9) Write back by position. j_train / j_test are positional indices from
#    numpy, so they are scattered into plain arrays rather than passed to the
#    label-based df3.loc, which is only equivalent for a default RangeIndex.
pred_train_col = np.full(len(df3), np.nan)
pred_test_col = np.full(len(df3), np.nan)
pred_train_col[j_train] = pred_train
pred_test_col[j_test] = pred_test
df3['pred_train_50m'] = pred_train_col
df3['pred_test_50m'] = pred_test_col


import plotly.offline as pyo
import plotly.graph_objs as go
# Switch Plotly to offline notebook mode
pyo.init_notebook_mode()

# Plot the target and both prediction columns against the depth column.
# `labels` is keyed by column name; with a list passed as `y` (wide form) the
# value axis is named "value", so the original "x"/"y" keys had no effect.
fig = px.line(
    df3,
    x="Unnamed: 0",
    y=[TARGET_COL, "pred_train_50m", "pred_test_50m"],
    labels={"Unnamed: 0": "Depth", "value": TARGET_COL},
    title=f"{TARGET_COL} vs. Depth",
)
fig.show()

# Export the depth column with the two prediction columns written into df3.
# The original selected 'pred_50m' and 'prediction_for_50m', which this
# script never creates; the columns it writes are the two named below.
df4 = df3[['Unnamed: 0', 'pred_train_50m', 'pred_test_50m']]

df4

df4.to_csv(f'{TARGET_COL}2.csv', index=False)

import os

import joblib

# Persist the fitted pipeline. Create the output directory first, because
# joblib.dump fails if 'Models/' does not exist yet.
model_path = f'Models/{TARGET_COL}_50m_pipeline.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)



# KeyboardInterrupt:  (leftover notebook output, not code)