In [None]:
# train_ctown_lstm_tf.py
import os

import pandas as pd, numpy as np, tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

CSV_PATH = "./dataset/leakage/ctown_leak_dataset.csv"
WINDOW = 12          # 12 steps (e.g., 1h if 5-min step)
STRIDE = 3
BATCH = 64
EPOCHS = 30
LR = 1e-3
RANDOM = 42

# Seed Python's `random`, NumPy and TensorFlow once so that weight
# initialisation, dropout and batch shuffling repeat under Restart & Run All.
tf.keras.utils.set_random_seed(RANDOM)

df = pd.read_csv(CSV_PATH)

# recover/ensure time order
# The time index must be derived BEFORE any row sampling/shuffling: the
# cumcount fallback numbers rows in their current order, so computing it on an
# already shuffled frame would yield a random "time" axis.
# assumes the CSV rows are chronological within each scenario — TODO confirm
if 'Unnamed: 0' in df.columns: df = df.rename(columns={'Unnamed: 0':'t'})
if 't' not in df.columns:
    df['t'] = df.groupby('scenario_id').cumcount()

# Balance the classes at row level: keep every leak row and an equally sized
# random subset of no-leak rows, then shuffle. The sample size is capped at the
# number of available no-leak rows so `.sample` cannot raise when leak rows
# outnumber them.
# NOTE(review): dropping individual rows leaves gaps inside a scenario, so the
# windows built later by make_windows() can span non-adjacent time steps.
leak_df = df[df.leak == 1]
noleak_df = df[df.leak == 0]
noleak_df = noleak_df.sample(min(len(leak_df), len(noleak_df)), random_state=RANDOM)
balanced = pd.concat([leak_df, noleak_df]).sample(frac=1, random_state=RANDOM)
df = balanced.copy()

# Identify boundaries where a new scenario begins
meta_cols = ["leak", "node", "area", "Cd", "tstart", "tend", "bin"]
# each unique combo of these = one scenario
# df["scenario_id"] = (df[meta_cols] != df[meta_cols].shift()).any(axis=1).cumsum()

# Model inputs are every column whose name starts with P_, Q_ or H_.
feat_cols = [c for c in df.columns if c.startswith(('P_','Q_','H_'))]
label_col = 'leak'


# split by scenario_id (no leakage)
# Whole scenarios are assigned to train / validation / test: 20% of the ids go
# to test, then 20% of the remaining ids go to validation.
sc_ids = df['scenario_id'].dropna().unique()
tr_ids, te_ids = train_test_split(sc_ids, test_size=0.2, random_state=RANDOM, shuffle=True)
tr_ids, va_ids = train_test_split(tr_ids, test_size=0.2, random_state=RANDOM, shuffle=True)

def make_windows(sub):
    """Cut one scenario's rows into fixed-length sliding windows.

    The rows of ``sub`` are ordered by the ``t`` column, then windows of
    ``WINDOW`` consecutive rows are taken every ``STRIDE`` rows. Features come
    from ``feat_cols``; each window is labelled with the ``label_col`` value of
    its last row.

    Returns:
        (X, y) as NumPy arrays. With at least one window, X has shape
        (n_windows, WINDOW, len(feat_cols)) and y has shape (n_windows,).
        When ``sub`` has fewer than ``WINDOW`` rows both arrays are empty.
    """
    ordered = sub.sort_values('t')
    features = ordered[feat_cols].values
    labels = ordered[label_col].values
    # Start offsets of every window that fits completely inside the scenario.
    starts = range(0, len(ordered) - WINDOW + 1, STRIDE)
    windows = [features[s:s + WINDOW] for s in starts]
    targets = [labels[s + WINDOW - 1] for s in starts]
    return np.array(windows), np.array(targets)

def pack(ids):
    """Build the stacked window arrays for a set of scenarios.

    Each scenario id in ``ids`` is looked up in the module-level ``df``, turned
    into windows with ``make_windows`` and the per-scenario results are
    concatenated in the order of ``ids``. Ids that are absent from ``df`` or
    that yield no window are skipped.

    Args:
        ids: iterable of ``scenario_id`` values.

    Returns:
        (X, y): ``np.vstack`` of the per-scenario window arrays and
        ``np.hstack`` of the matching label arrays.

    Raises:
        ValueError: if none of the given ids produced a window.
    """
    # Group the frame once instead of boolean-filtering the whole frame for
    # every scenario id.
    groups = {sid: sub for sid, sub in df.groupby('scenario_id', sort=False)}
    partsX, partsy = [], []
    for sid in ids:
        sub = groups.get(sid)
        if sub is None:
            continue
        Xc, yc = make_windows(sub)
        if len(Xc):
            partsX.append(Xc)
            partsy.append(yc)
    if not partsX:
        # np.vstack([]) would raise a ValueError that does not mention the cause.
        raise ValueError(
            "pack(): none of the given scenario ids produced a window; "
            "each scenario needs at least WINDOW rows"
        )
    return np.vstack(partsX), np.hstack(partsy)

# Build the windowed arrays for the train / validation / test scenario ids.
Xtr, ytr = pack(tr_ids)
Xva, yva = pack(va_ids)
Xte, yte = pack(te_ids)

# scale features (fit on train only)
# The scaler works on 2-D input, so windows are flattened to
# (n_windows * n_steps, n_features) before fitting.
scaler = StandardScaler()
scaler.fit(Xtr.reshape(-1, Xtr.shape[-1]))

def scale(X):
    """Apply the train-fitted scaler to a window array, keeping its shape."""
    original_shape = X.shape
    flat = X.reshape(-1, original_shape[-1])
    return scaler.transform(flat).reshape(original_shape)

Xtr, Xva, Xte = (scale(arr) for arr in (Xtr, Xva, Xte))

In [None]:


# build model
# Baseline: one 64-unit LSTM that reads a (WINDOW, n_features) window, followed
# by a single sigmoid unit giving the leak probability for the window.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(WINDOW, Xtr.shape[-1])),
    tf.keras.layers.LSTM(64, return_sequences=False),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer=tf.keras.optimizers.Adam(LR),
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.AUC(name='auc'),
                       tf.keras.metrics.Precision(name='prec'),
                       tf.keras.metrics.Recall(name='rec')])

# Stop when validation AUC has not improved for 5 epochs and roll back to the
# best weights seen.
cb = [tf.keras.callbacks.EarlyStopping(monitor='val_auc', mode='max',
                                       patience=5, restore_best_weights=True)]
model.fit(Xtr, ytr, validation_data=(Xva, yva),
          epochs=EPOCHS, batch_size=BATCH, callbacks=cb, verbose=1)

# evaluate() returns the loss followed by the compiled metrics (auc, prec, rec).
print("\nTest metrics:", model.evaluate(Xte, yte, batch_size=BATCH, verbose=0))
# optional: save
# Create the target folder first so a missing directory cannot make the save
# fail after the whole training run has completed.
os.makedirs("./model_trained", exist_ok=True)
model.save("./model_trained/leakage_lstm_1.keras")


In [None]:
# build stacked LSTM/GRU model
from tensorflow.keras import layers, models, optimizers, callbacks, metrics

# Two stacked LSTM layers with dropout, then a small dense head ending in a
# sigmoid unit. Layers are added one by one in input-to-output order.
model = models.Sequential()
model.add(layers.Input(shape=(WINDOW, Xtr.shape[-1])))

# Stacked recurrent layers (swap both LSTM layers for layers.GRU with the same
# arguments to try a GRU variant)
model.add(layers.LSTM(128, return_sequences=True))
model.add(layers.Dropout(0.3))
model.add(layers.LSTM(64, return_sequences=False))  # last layer outputs sequence summary
model.add(layers.Dropout(0.3))

# Dense head
model.add(layers.Dense(32, activation="relu"))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(
    optimizer=optimizers.Adam(learning_rate=LR),
    loss="binary_crossentropy",
    metrics=[
        metrics.AUC(name="auc"),
        metrics.Precision(name="prec"),
        metrics.Recall(name="rec"),
    ],
)

# Early stopping on validation AUC (higher is better), restoring the best weights.
cb = [
    callbacks.EarlyStopping(
        monitor="val_auc",
        mode="max",
        patience=6,
        restore_best_weights=True,
    )
]
model.fit(
    Xtr, ytr,
    validation_data=(Xva, yva),
    epochs=EPOCHS,
    batch_size=BATCH,
    callbacks=cb,
    verbose=1,
)

print("\nTest metrics:", model.evaluate(Xte, yte, batch_size=BATCH, verbose=0))

In [None]:
from tensorflow.keras import layers, models, callbacks, optimizers, metrics

# Stacked GRU (using previous dataset variables)
# The input shape is declared with an explicit Input layer, as in the LSTM
# cells, instead of the layer-level `input_shape` keyword.
model = models.Sequential()
model.add(layers.Input(shape=(Xtr.shape[1], Xtr.shape[2])))
model.add(layers.GRU(64, return_sequences=True))
model.add(layers.Dropout(0.4))
model.add(layers.GRU(32, return_sequences=False))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(32, activation="relu"))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(1, activation="sigmoid"))

# Compile
# Uses the shared LR / EPOCHS / BATCH constants so this cell follows the same
# configuration as the LSTM cells.
model.compile(
    optimizer=optimizers.Adam(learning_rate=LR),
    loss="binary_crossentropy",
    metrics=[
        metrics.AUC(name="auc"),
        metrics.Precision(name="prec"),
        metrics.Recall(name="rec"),
    ]
)

# Early stopping
# mode="max" states explicitly that a higher val_auc is better, rather than
# leaving the direction to the callback's automatic inference.
early_stop = callbacks.EarlyStopping(
    monitor="val_auc", mode="max", patience=6, restore_best_weights=True
)

# Train
history = model.fit(
    Xtr, ytr,
    validation_data=(Xva, yva),
    epochs=EPOCHS,
    batch_size=BATCH,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate
# evaluate() returns the loss followed by the compiled metrics (auc, prec, rec).
test_metrics = model.evaluate(Xte, yte, batch_size=BATCH, verbose=0)
print("Test metrics:", test_metrics)


In [None]:
# Save whichever network `model` currently refers to under the GRU file name.
# The folder is created first so a missing directory cannot make the save fail.
os.makedirs("./model_trained", exist_ok=True)
model.save("./model_trained/leakage_gru_1.keras")

In [5]:
import os
# Count the entries in the LightGBM model output folder.
# NOTE(review): this is an absolute, machine-specific path.
file = os.listdir(
    '/home/zayd/Desktop/Digital_twin_project/machine_learning/model_trained/LightGBM_0.0.2_moving_avg'
)
n_entries = len(file)
print(n_entries)

70


In [None]:
import pandas as pd
import logging
from pathlib import Path
import os

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()]
)

# Load main data
demand = pd.read_parquet('./dataset/junction_base_demand-0-dynamic_input.parquet')
statics = [
    './dataset/junction_elevation-0-static_input.parquet',
    './dataset/pipe_diameter-0-static_input.parquet',
    './dataset/pipe_initial_status-0-static_input.parquet',
    './dataset/pipe_length-0-static_input.parquet',
    './dataset/pipe_minor_loss-0-static_input.parquet',
    './dataset/pipe_roughness-0-static_input.parquet'
]
links = pd.read_parquet('./dataset/pipes.parquet')

# Read every static table once up front; they do not depend on the junction,
# so re-reading them inside the junction loop is wasted I/O.
static_tables = {static_path: pd.read_parquet(static_path) for static_path in statics}

# Ensure output folder exists
Path("./dataset/junctions/").mkdir(parents=True, exist_ok=True)

# For each junction column (name starting with "J") build one table holding its
# own demand, its static attribute, the demands of adjacent nodes and the
# static attributes of the pipes attached to it, and write it to parquet.
for junction in demand.columns[demand.columns.str.startswith("J")]:
    demnd = demand[['scenario_id', junction, 'time_id']]
    logging.info(f"Processing junction {junction}")

    # Topology lookups depend only on the junction, so they are done once here
    # with boolean masks instead of re-scanning `links` row by row per static file.
    starts_here = links["start"] == junction
    ends_here = links["end"] == junction
    # Neighbours: the start node of pipes ending here, then the end node of
    # pipes starting here. dict.fromkeys drops repeats (e.g. parallel pipes)
    # while keeping order, so the same demand column is not merged twice.
    near_nodes = list(dict.fromkeys(
        links.loc[ends_here, "start"].tolist() + links.loc[starts_here, "end"].tolist()
    ))
    pipes = links.loc[starts_here | ends_here, "pipe_id"].tolist()

    for static_path in statics:
        stat = static_tables[static_path]
        if junction in stat.columns:
            # Junction-level static table: attach this junction's value.
            # NOTE(review): the column is always suffixed "_elevation"; this
            # presumes the elevation file is the only junction-keyed static.
            junction_stat = stat[["scenario_id", junction]].rename(columns={junction: f'{junction}_elevation'})
            demnd = pd.merge(demnd, junction_stat, on=["scenario_id"], how="inner")
            logging.info(f"Merged {junction} from {static_path}")

            for node in near_nodes:
                if node in demand.columns:
                    # Neighbour demand has one row per (scenario_id, time_id),
                    # like `demnd`, so both keys are needed; joining on
                    # scenario_id alone pairs every time step with every other
                    # time step of the scenario.
                    node_demands = demand[["scenario_id", "time_id", node]].rename(columns={node: f'{node}_demand'})
                    demnd = pd.merge(demnd, node_demands, on=["scenario_id", "time_id"], how="inner")
                    logging.info(f"Merged node {node} for junction {junction}")
                else:
                    logging.warning(f"Merge impossible for node {node} and junction {junction}")
        else:
            # Pipe-level static table: the attribute name comes from the file
            # name, e.g. "pipe_diameter-0-static_input.parquet" -> "diameter".
            filename = os.path.basename(static_path)
            variable_name = filename.split('-')[0]
            variable_name = variable_name.replace("pipe_", "")
            for pipe in pipes:
                if pipe in stat.columns:
                    stat_pipe = stat[["scenario_id", pipe]].rename(columns={pipe: f'{pipe}_{variable_name}'})
                    # Static values are joined on scenario_id only.
                    # assumes one row per scenario in static tables — TODO confirm
                    demnd = pd.merge(demnd, stat_pipe, on=["scenario_id"], how="inner")
                    logging.info(f"Merged pipe {pipe} for junction {junction}")
                else:
                    logging.warning(f"Merge impossible for pipe {pipe} and junction {junction}")


    output_path = f"./dataset/junctions/{junction}.parquet"
    demnd.to_parquet(output_path, engine="pyarrow", index=False)
    logging.info(f"{junction} saved to {output_path}")


2025-08-26 11:49:34,813 - INFO - Processing junction J511
2025-08-26 11:49:34,941 - INFO - Merged J511 from ./dataset/junction_elevation-0-static_input.parquet
2025-08-26 11:49:35,114 - INFO - Merged node J503 for junction J511
2025-08-26 11:49:36,420 - INFO - Merged node J580 for junction J511
2025-08-26 11:49:38,242 - INFO - Merged pipe P349 for junction J511
2025-08-26 11:49:40,039 - INFO - Merged pipe P524 for junction J511
2025-08-26 11:49:42,082 - INFO - Merged pipe P349 for junction J511
2025-08-26 11:49:43,768 - INFO - Merged pipe P524 for junction J511
2025-08-26 11:49:45,944 - INFO - Merged pipe P349 for junction J511
2025-08-26 11:49:48,779 - INFO - Merged pipe P524 for junction J511
2025-08-26 11:49:51,929 - INFO - Merged pipe P349 for junction J511
2025-08-26 11:49:56,946 - INFO - Merged pipe P524 for junction J511
2025-08-26 11:50:02,770 - INFO - Merged pipe P349 for junction J511
2025-08-26 11:50:07,479 - INFO - Merged pipe P524 for junction J511
2025-08-26 11:50:17,698 