In [4]:
import sys
print(sys.executable)
print(sys.path[:5])

/usr/local/bin/python3.12
['/Users/igwanhyeong/Applications/PyCharm.app/Contents/plugins/python-ce/helpers/jupyter_debug', '/Users/igwanhyeong/Applications/PyCharm.app/Contents/plugins/python-ce/helpers/pydev', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python312.zip', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/lib-dynload']


In [1]:
import sys

import polars as pl
import torch
import os
import glob

from modeling_module.data_loader.MultiPartDataModule import MultiPartDataModule
from modeling_module.data_loader.MultiPartExoDataModule import MultiPartExoDataModule
from modeling_module.utils.exogenous_utils import compose_exo_calendar_cb

'''
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
https://developer.nvidia.com/cuda-12-8-0-download-archive
'''

MAC_DIR = '/Users/igwanhyeong/PycharmProjects/data_research/raw_data/'
WINDOW_DIR = 'C:/Users/USER/PycharmProjects/research/raw_data/'

if sys.platform == 'win32':
    DIR = WINDOW_DIR
    print(torch.cuda.is_available())
    print(torch.cuda.device_count())
    print(torch.version.cuda)
    print(torch.__version__)
    print(torch.cuda.get_device_name(0))
    print(torch.__version__)
else:
    DIR = MAC_DIR
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

save_dir = DIR + 'fit/model_validation'

# if os.path.exists(save_dir):
#     files = glob.glob(os.path.join(save_dir, "*.pt"))
#     print(f"Deleting {len(files)} old checkpoint files...")
#     for f in files:
#         try:
#             os.remove(f)
#         except Exception as e:
#             print(f"Error deleting {f}: {e}")
# else:
#     os.makedirs(save_dir, exist_ok=True)

print("Clean up complete.")


Clean up complete.


In [3]:
import polars as pl
import numpy as np

ETT1 = pl.read_csv(DIR + "csv/ETTh1.csv")

df = (
    ETT1
    .select(["date", "HUFL"])
    .with_columns(pl.lit("A").alias("unique_id"))
    # 원본 date 문자열을 그대로 Datetime으로 파싱
    .with_columns(
        pl.col("date").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S", strict=False).alias("date")
    )
    .sort(["unique_id", "date"])
)

# time index
df = df.with_columns(
    pl.arange(0, pl.len()).over("unique_id").alias("t_idx")
)

# (1) known-future 스케줄: promo (예: 특정 시간대에만 1)
# 하루 24시간 중 8~10시, 18~20시에 프로모션이라고 가정
df = df.with_columns([
    (pl.col("t_idx") % 24).alias("hour"),
])

df = df.with_columns([
    (
        ((pl.col("hour") >= 8) & (pl.col("hour") <= 10)) |
        ((pl.col("hour") >= 18) & (pl.col("hour") <= 20))
    ).cast(pl.Int8).alias("promo_flag")
])

# (2) calendar exo: 24h sin/cos
df = df.with_columns([
    ( (2*np.pi*pl.col("t_idx")/24.0).sin().cast(pl.Float32) ).alias("exo_fut_sin24"),
    ( (2*np.pi*pl.col("t_idx")/24.0).cos().cast(pl.Float32) ).alias("exo_fut_cos24"),
])

# (3) (중요) 타깃에 promo 효과 "주입" -> exo가 없으면 예측이 어려워지고, 있으면 쉬워짐
# HUFL_y = HUFL + alpha*promo_flag + beta*sin24  (alpha는 체감되게 크게)
alpha = 2.0
beta  = 0.5
df = df.with_columns([
    (
        pl.col("HUFL").cast(pl.Float32)
        + pl.col("promo_flag").cast(pl.Float32) * pl.lit(alpha)
        + pl.col("exo_fut_sin24").cast(pl.Float32) * pl.lit(beta)
    ).alias("y")
])

# =========================
# past_exo 후보 생성
# =========================
# 기준: y를 만들었으면 y 기반으로 만드는 게 가장 직관적.
# (HUFL 원본 기반으로도 가능하나, 지금은 y에 promo/seasonality가 주입되어 있으니 y 기준 추천)

df = df.with_columns([
    # (A) lag / diff
    pl.col("y").shift(1).over("unique_id").alias("pe_lag1_y"),
    pl.col("y").shift(24).over("unique_id").alias("pe_lag24_y"),  # 하루 전(24시간 전)
    (pl.col("y") - pl.col("y").shift(1).over("unique_id")).alias("pe_diff1_y"),
    (pl.col("y") - pl.col("y").shift(24).over("unique_id")).alias("pe_diff24_y"),

    # (B) rolling mean / std (짧은/중간 윈도우)
    pl.col("y").rolling_mean(window_size=6).over("unique_id").alias("pe_rm6_y"),
    pl.col("y").rolling_mean(window_size=24).over("unique_id").alias("pe_rm24_y"),
    pl.col("y").rolling_std(window_size=24).over("unique_id").alias("pe_rs24_y"),

    # (C) z-score (24시간 기준)
    (
        (pl.col("y") - pl.col("y").rolling_mean(24).over("unique_id"))
        / (pl.col("y").rolling_std(24).over("unique_id") + 1e-6)
    ).alias("pe_z24_y"),

    # (D) EMA (지수이동평균) - Polars ewm_mean 사용
    pl.col("y").ewm_mean(alpha=0.2).over("unique_id").alias("pe_ema_a02_y"),

    # (E) promo의 과거 상태 (이벤트의 lag)
    pl.col("promo_flag").shift(1).over("unique_id").cast(pl.Float32).alias("pe_lag1_promo"),
    pl.col("promo_flag").rolling_mean(24).over("unique_id").cast(pl.Float32).alias("pe_rm24_promo"),
])

# rolling/shift로 인해 처음 구간에 null이 생깁니다.
# TrainingDataset은 null을 그대로 numpy로 가져오면 nan이 될 수 있으니, 보통 0으로 채우는 편이 안전합니다.
past_cols = [
    "pe_lag1_y", "pe_lag24_y", "pe_diff1_y", "pe_diff24_y",
    "pe_rm6_y", "pe_rm24_y", "pe_rs24_y", "pe_z24_y",
    "pe_ema_a02_y", "pe_lag1_promo", "pe_rm24_promo",
]

df = df.with_columns([pl.col(c).fill_null(0.0).cast(pl.Float32) for c in past_cols])

df.select(["date","promo_flag", "y", 'HUFL'] + past_cols).head(5)


date,promo_flag,y,HUFL,pe_lag1_y,pe_lag24_y,pe_diff1_y,pe_diff24_y,pe_rm6_y,pe_rm24_y,pe_rs24_y,pe_z24_y,pe_ema_a02_y,pe_lag1_promo,pe_rm24_promo
datetime[μs],i8,f32,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
2016-07-01 00:00:00,0,5.827,5.827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.827,0.0,0.0
2016-07-01 01:00:00,0,5.822409,5.693,5.827,0.0,-0.004591,0.0,0.0,0.0,0.0,0.0,5.82445,0.0,0.0
2016-07-01 02:00:00,0,5.407,5.157,5.822409,0.0,-0.415409,0.0,0.0,0.0,0.0,0.0,5.653363,0.0,0.0
2016-07-01 03:00:00,0,5.443553,5.09,5.407,0.0,0.036553,0.0,0.0,0.0,0.0,0.0,5.582289,0.0,0.0
2016-07-01 04:00:00,0,5.791012,5.358,5.443553,0.0,0.347459,0.0,0.0,0.0,0.0,0.0,5.64438,0.0,0.0


In [3]:
lookback = 52
horizon = 8

future_exo_cb = compose_exo_calendar_cb(date_type = 'H', sincos = True)

data_module = MultiPartExoDataModule(
    df,
    id_col = 'unique_id',
    date_col = 'date',
    y_col = 'y',
    lookback = lookback,
    horizon = horizon,
    batch_size = 128,
    past_exo_cont_cols = past_cols,
    future_exo_cb = future_exo_cb,
    freq = 'hourly',
    shuffle = True,
    split_mode = 'multi',
)

train_loader = data_module.get_train_loader()
val_loader = data_module.get_val_loader()

In [4]:
data_module.val_dataset[0]

(tensor([[-4.7606],
         [-4.3540],
         [-4.4834],
         [-3.3980],
         [ 0.2494],
         [ 3.3850],
         [ 7.5550],
         [12.8870],
         [16.7210],
         [21.2590],
         [19.8074],
         [19.5090],
         [18.4906],
         [19.2900],
         [18.7494],
         [16.9950],
         [16.7636],
         [15.6370],
         [16.3570],
         [13.6950],
         [10.8650],
         [ 9.1310],
         [ 2.0856],
         [ 0.7760],
         [-2.8176],
         [-2.6790],
         [-2.4734],
         [ 0.5540],
         [ 5.6744],
         [ 9.5470],
         [ 7.8230],
         [11.6810],
         [16.9220],
         [21.4600],
         [17.9324],
         [19.8440],
         [15.6776],
         [17.0130],
         [15.7354],
         [17.4640],
         [14.3526],
         [15.7710],
         [15.6200],
         [15.0350],
         [13.4100],
         [12.2120],
         [10.5256],
         [11.2250],
         [ 9.1044],
         [-0.2010],


In [5]:
print("len(train_dataset) =", len(data_module.train_dataset))
print("len(val_dataset)   =", len(data_module.val_dataset))

train_loader = data_module.get_train_loader()
val_loader   = data_module.get_val_loader()

print("len(train_loader) =", len(train_loader))
print("len(val_loader)   =", len(val_loader))


len(train_dataset) = 13889
len(val_dataset)   = 3472
len(train_loader) = 108
len(val_loader)   = 28


In [None]:
from modeling_module.training.model_trainers.total_train import run_total_train_monthly, run_total_train_weekly, \
    run_total_train_hourly

model_dict = run_total_train_hourly(
    train_loader,
    val_loader,
    lookback = lookback,
    horizon = horizon,
    save_dir = save_dir,
    models_to_run = ['patchtst']
)

In [6]:
from modeling_module.utils.checkpoint import load_model_dict
# Load
from modeling_module.models.model_builder import (
    build_patchTST_base, build_patchTST_quantile,
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

builders = {
    f"hourly_PatchTSTBase_L{lookback}_H{horizon}_exo": build_patchTST_base,
    f"hourly_PatchTSTQuantile_L{lookback}_H{horizon}_exo": build_patchTST_quantile,
}
loaded = load_model_dict(save_dir, builders, device = device)

[load] hourly_PatchTSTBase_L52_H8_exo ← C:/Users/USER/PycharmProjects/research/raw_data/fit/model_validation\hourly_PatchTSTBase_L52_H8_exo.pt
{'device': 'cuda', 'lookback': 52, 'horizon': 8, 'epochs': 1, 'lr': 0.0001, 'weight_decay': 0.0001, 't_max': 10, 'patience': 50, 'max_grad_norm': 30.0, 'amp_device': 'cuda', 'loss_mode': 'point', 'point_loss': 'huber', 'huber_delta': 5.0, 'q_star': 0.5, 'use_cost_q_star': False, 'Cu': 1.0, 'Co': 1.0, 'quantiles': (0.1, 0.5, 0.9), 'use_intermittent': True, 'alpha_zero': 1.2, 'alpha_pos': 1.0, 'gamma_run': 0.6, 'cap': None, 'use_horizon_decay': False, 'tau_h': 24.0, 'val_use_weights': False, 'spike_loss': {'enabled': True, 'strategy': 'mix', 'huber_delta': 2.0, 'asym_up_weight': 2.0, 'asym_down_weight': 1.0, 'mad_k': 3.5, 'w_spike': 6.0, 'w_norm': 1.0, 'alpha_huber': 0.7, 'beta_asym': 0.3, 'mix_with_baseline': False, 'gamma_baseline': 0.2}, 'lambda_hist_scale': 0.1, 'lambda_hist_var': 0.03, 'hist_window': 12, 'anchor_last_k': 8, 'anchor_weight': 0

In [7]:
%load_ext autoreload
%autoreload 2

import importlib, modeling_module.utils.plot_utils as pu
import modeling_module.training.forecaster as fo
importlib.reload(pu)
importlib.reload(fo)

def my_exo_cb(start_idx: int, Hm: int, device="cuda" if torch.cuda.is_available() else "cpu"):
    # exo_dim = 2 (sin, cos)
    return fo.make_calendar_exo(start_idx, Hm, period=24, device=device)

models_for_plot = {
    name: res['model']
    for name, res in model_dict.items()
}

def my_exo_cb(start_idx: int, H: int, device="cuda"):
    # (H, 4) = hour sin/cos (2) + week sin/cos (2)
    t = torch.arange(start_idx, start_idx + H, device=device, dtype=torch.float32)
    h = torch.stack([torch.sin(2*torch.pi*t/24.0),  torch.cos(2*torch.pi*t/24.0)], dim=-1)    # (H,2)
    w = torch.stack([torch.sin(2*torch.pi*t/168.0), torch.cos(2*torch.pi*t/168.0)], dim=-1)   # (H,2)
    return torch.cat([h, w], dim=-1)  # (H,4)

pu.plot_forecast(
    models=loaded,
    loader=val_loader,
    horizon=8,
    freq="hourly",          # daily 말고 hourly 권장 (ETT)
    plan_dt=None,
    future_exo_cb=my_exo_cb,
    metrics_parquet_path = f'{save_dir}/metrics_exo.parquet'
)

NameError: name 'model_dict' is not defined

In [8]:
((pl
    .read_parquet(save_dir + '/metrics.parquet')
    .select(['sample_idx', 'model', 'y_true', 'y_pred_point', 'MAE', 'RMSE', 'sMAPE', 'WAPE'])
 ).join(
    pl.read_parquet(save_dir + '/metrics_exo.parquet')
      .select(['sample_idx', 'model', 'y_true', 'y_pred_point', 'MAE', 'RMSE', 'sMAPE', 'WAPE'])
      .rename({'y_true': 'e_y_true', 'y_pred_point': 'e_y_pred_point', 'MAE': 'e_MAE', 'RMSE': 'e_RMSE', 'sMAPE': 'e_sMAPE', 'WAPE': 'e_WAPE'})
    , on = ['sample_idx'], how = 'left'
)
 .select(['sample_idx', 'model', 'y_true', 'y_pred_point', 'e_y_pred_point', 'MAE', 'e_MAE', 'RMSE', 'e_RMSE', 'sMAPE', 'e_sMAPE', 'WAPE', 'e_WAPE'])
 .unique(subset = ['sample_idx', 'model'])
 .filter(pl.col('model') == 'hourly_PatchTSTBase_L52_H8')
 .sort(by = 'sample_idx')
 )

sample_idx,model,y_true,y_pred_point,e_y_pred_point,MAE,e_MAE,RMSE,e_RMSE,sMAPE,e_sMAPE,WAPE,e_WAPE
i64,str,list[f64],list[f64],list[f64],f64,f64,f64,f64,f64,f64,f64,f64
0,"""hourly_PatchTSTBase_L52_H8""","[5.291, 8.439, … 8.239]","[8.956542, 10.082773, … 17.93141]","[9.533715, 11.049342, … 20.384336]",5.810367,7.554124,6.52931,8.326397,0.510518,0.601826,0.727444,0.907267
1,"""hourly_PatchTSTBase_L52_H8""","[3.818, 8.506, … 15.472]","[6.81191, 9.539867, … 16.781738]","[6.430343, 11.726782, … 16.306581]",1.159212,1.458791,1.480057,1.743568,0.130045,0.157962,0.098826,0.120056
2,"""hourly_PatchTSTBase_L52_H8""","[8.439, 9.243, … 7.904]","[9.165412, 9.376855, … 9.954692]","[10.061072, 9.170853, … 8.27275]",1.414258,0.848474,1.570511,0.949333,0.180092,0.116628,0.191084,0.113242
3,"""hourly_PatchTSTBase_L52_H8""","[10.047, 7.971, … 7.636]","[11.374949, 11.198786, … 11.284766]","[11.150146, 9.785376, … 10.76541]",2.945574,2.162363,3.02889,2.296217,0.314506,0.243786,0.36685,0.269388
4,"""hourly_PatchTSTBase_L52_H8""","[11.253, 1.942, … -2.344]","[10.758159, 7.598352, … 4.100243]","[10.616682, 7.926925, … 4.478985]",5.903785,7.240591,6.303219,7.728424,1.653842,1.60585,1.193136,1.355753
5,"""hourly_PatchTSTBase_L52_H8""","[9.846, 9.578, … -5.961]","[13.265215, 13.888227, … 2.766505]","[11.751611, 12.833981, … 6.176485]",6.631746,5.871999,7.215113,7.197752,1.020231,0.839169,0.941842,0.786893
6,"""hourly_PatchTSTBase_L52_H8""","[11.186, 10.315, … 20.629999]","[13.210413, 13.686142, … 15.741817]","[13.447442, 14.602949, … 15.609864]",2.514547,2.49744,2.761918,2.737551,0.169602,0.162899,0.163134,0.157716
7,"""hourly_PatchTSTBase_L52_H8""","[12.726, 10.449, … 6.229]","[12.070307, 9.956302, … 11.343987]","[15.615198, 12.716984, … 14.849728]",2.29154,4.091363,2.795768,5.492817,0.313795,0.46185,0.306494,0.494081
8,"""hourly_PatchTSTBase_L52_H8""","[10.583, 10.65, … 9.377]","[13.323889, 13.555085, … 12.583464]","[15.041212, 14.944033, … 13.164637]",2.433942,3.354718,2.580388,3.442841,0.213317,0.275771,0.236156,0.315062
9,"""hourly_PatchTSTBase_L52_H8""","[-9.243, -3.282, … 6.229]","[-1.755715, -3.029436, … 9.433214]","[5.682845, 4.780294, … 9.005836]",3.223716,4.827561,3.94122,6.796184,0.903805,0.983349,0.628129,0.94108


In [None]:
((pl
    .read_parquet(save_dir + '/metrics.parquet')
    .select(['sample_idx', 'model', 'y_true', 'y_pred_point', 'MAE', 'RMSE', 'sMAPE', 'WAPE'])
 ).join(
    pl.read_parquet(save_dir + '/metrics_exo.parquet')
      .select(['sample_idx', 'model', 'y_true', 'y_pred_point', 'MAE', 'RMSE', 'sMAPE', 'WAPE'])
      .rename({'y_true': 'e_y_true', 'y_pred_point': 'e_y_pred_point', 'MAE': 'e_MAE', 'RMSE': 'e_RMSE', 'sMAPE': 'e_sMAPE', 'WAPE': 'e_WAPE'})
    , on = ['sample_idx'], how = 'left'
)
 .select(['sample_idx', 'model', 'y_true', 'y_pred_point', 'e_y_pred_point', 'MAE', 'e_MAE', 'RMSE', 'e_RMSE', 'sMAPE', 'e_sMAPE', 'WAPE', 'e_WAPE'])
 .unique(subset = ['sample_idx', 'model'])
 .filter(pl.col('model') == 'hourly_PatchTSTBase_L52_H8')
 .sort(by = 'sample_idx')
 )

In [None]:
pl.read_parquet(save_dir + '/metrics_exo.parquet')

In [13]:
from modeling_module.training.forecater_v2 import forecast_to_parquet

df = forecast_to_parquet(
    model_dict=loaded,
    loader=val_loader,
    parquet_path="outputs/preds.parquet",
    horizon=100,
    freq="weekly",
    mode="val",
    plan_dt=202601,
    device="cuda",
    max_samples=50,   # 저장할 샘플 개수 제한
)

[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point AR Start. Hm=8, H_req=100
[DMS] Point 

In [14]:
df

part_id,sample_idx,model,horizon,y_pred_point,y_pred_q50
str,i64,str,i64,list[f64],list[f64]
"""A""",0,"""hourly_PatchTSTBase_L52_H8_exo""",100,"[3.3243, 3.490515, … 13.766656]","[3.3243, 3.490515, … 13.766656]"
"""A""",0,"""hourly_PatchTSTQuantile_L52_H8…",100,"[7.73454, 9.142086, … -0.000676]","[7.73454, 9.142086, … -0.000676]"
"""A""",1,"""hourly_PatchTSTBase_L52_H8_exo""",100,"[4.750186, 4.987696, … 8.811855]","[4.750186, 4.987696, … 8.811855]"
"""A""",1,"""hourly_PatchTSTQuantile_L52_H8…",100,"[5.22233, 9.522999, … 0.001793]","[5.22233, 9.522999, … 0.001793]"
"""A""",2,"""hourly_PatchTSTBase_L52_H8_exo""",100,"[9.57448, 9.170853, … 15.20472]","[9.57448, 9.170853, … 15.20472]"
…,…,…,…,…,…
"""A""",47,"""hourly_PatchTSTQuantile_L52_H8…",100,"[4.743947, 5.030616, … 6.541367]","[4.743947, 5.030616, … 6.541367]"
"""A""",48,"""hourly_PatchTSTBase_L52_H8_exo""",100,"[9.944551, 10.441777, … 0.948352]","[9.944551, 10.441777, … 0.948352]"
"""A""",48,"""hourly_PatchTSTQuantile_L52_H8…",100,"[10.50242, 11.013859, … 0.004285]","[10.50242, 11.013859, … 0.004285]"
"""A""",49,"""hourly_PatchTSTBase_L52_H8_exo""",100,"[2.745607, 1.647364, … 0.351337]","[2.745607, 1.647364, … 0.351337]"


In [23]:

from modeling_module.utils.plot_utils_v2 import plot_from_rows

plot_from_rows(
    df,
    max_plots=10,
    show=True,
    zoom=27,                   # True 또는 int
    title_prefix="VAL | weekly"
)

[autoreload of modeling_module.utils.plot_utils failed: Traceback (most recent call last):
  File "C:\Users\USER\python\py312\Lib\site-packages\IPython\extensions\autoreload.py", line 322, in check
    elif self.deduper_reloader.maybe_reload_module(m):
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\USER\python\py312\Lib\site-packages\IPython\extensions\deduperreload\deduperreload.py", line 524, in maybe_reload_module
    new_source_code = f.read()
                      ^^^^^^^^
UnicodeDecodeError: 'cp949' codec can't decode byte 0xeb in position 745: illegal multibyte sequence
]
[autoreload of modeling_module.training.forecater_v2 failed: Traceback (most recent call last):
  File "C:\Users\USER\python\py312\Lib\site-packages\IPython\extensions\autoreload.py", line 322, in check
    elif self.deduper_reloader.maybe_reload_module(m):
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\USER\python\py312\Lib\site-packages\IPython\extensions\dedupe

TypeError: the truth value of a DataFrame is ambiguous

Hint: to check if a DataFrame contains any values, use `is_empty()`.