In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

In [5]:
SEQ_LEN   = 5          # 1,3,5-candle sequences
TOL       = 0.002      # acceptance band on raw coefficient
DOWNSAMPLE_NO_LINE = 0.1   # keep only 10 % of no-line rows

# 1. load candle file (has header row)
df_c = pd.read_csv(
    "/home/iatell/financial_data/merge_data/archive/Bitcoin_BTCUSDT_kaggle_1D_candles_prop.csv",
    parse_dates=['timestamp']          # ensure datetime
)

# 2. load label file (no header)
df_l = pd.read_csv(
    "/home/iatell/financial_data/merge_data/archive/meta_learning_data/ohlcv_log(2).csv",
    header=None,
    names=['timestamp', 'last_close', 'line1_raw']
)

# 3. convert label timestamp to datetime **explicitly**
df_l['timestamp'] = pd.to_datetime(df_l['timestamp'], errors='coerce')

# 4. extract the parenthesised value
df_l['line1'] = (
    df_l['line1_raw']
      .astype(str)
      .str.extract(r'\(([^)]+)\)', expand=False)
      .astype(float)
)
df_l = df_l.drop(columns=['line1_raw'])

# 5. align lengths
last_labeled = df_l['timestamp'].max()
df_c = df_c[df_c['timestamp'] <= last_labeled].copy()
# 2.2 join and create binary flag + log-ratio target
df = pd.merge(df_c, df_l[['timestamp','line1']], on='timestamp')
# create them once, right after you read df_c and df_l
df['has_line'] = (df['line1'] != -1).astype(int)
df['log_coef'] = np.where(df['has_line'] == 1,
                          np.log(df['line1']),
                          np.nan)     # will be masked later

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [1]:
import xgboost as xgb
print(xgb.__version__)

# Test if GPU is recognized
model = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=0)
print(model.get_params())


3.0.4
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'feature_weights': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': 'gpu_hist', 'validate_parameters': None, 'verbosity': None, 'gpu_id': 0}


In [6]:
# print('total rows in label file :', len(df_l))
print('rows with has_line==1 :', df['has_line'].sum())


rows with has_line==1 : 151


In [7]:
def make_multi_sequences(candles, lengths=(1,3,5)):
    max_len = max(lengths)
    X, y_class, y_reg = [], [], []
    for i in range(max_len-1, len(candles)):
        parts = []
        for L in lengths:
            seq = candles.iloc[i-L+1:i+1]
            norm = seq.iloc[-1]['close']
            parts.append((seq[['open','high','low','close']] / norm).values.ravel())
        meta = candles.iloc[i][['upper_shadow','body','lower_shadow',
                                'Candle_Color','upper_body_ratio',
                                'lower_body_ratio','upper_lower_body_ratio']].values
        X.append(np.concatenate(parts + [meta]))
        y_class.append(candles.iloc[i]['has_line'])
        y_reg.append(candles.iloc[i]['log_coef'])
    return np.array(X), np.array(y_class), np.array(y_reg)

X_all, y_cls_all, y_reg_all = make_multi_sequences(df, (1,3,5))

# mask rows with NaN regression target
mask = ~np.isnan(y_reg_all)
X_reg   = X_all[mask]
y_reg   = y_reg_all[mask]

# build binary-classification dataset (all rows)
X_cls   = X_all
y_cls   = y_cls_all

# down-sample majority (no-line) class to reduce “always predict 1.0”
# 1. balanced classification indices
neg_idx   = np.where(y_cls_all == 0)[0]
pos_idx   = np.where(y_cls_all == 1)[0]
neg_keep  = np.random.choice(neg_idx,
                             size=int(DOWNSAMPLE_NO_LINE * len(neg_idx)),
                             replace=False)
idx_cls   = np.concatenate([pos_idx, neg_keep])        # indices into X_all / y_cls_all

# 2. build the classification arrays
X_cls_bal = X_all[idx_cls]
y_cls_bal = y_cls_all[idx_cls]

# 3. indices inside the balanced set that have a regression label
idx_pos_in_bal = idx_cls[y_cls_bal == 1]               # original row numbers of positives

# 4. convert those original row numbers to positions inside the regression subset
#    (because X_reg / y_reg only contain the positive rows)
orig_pos_rows   = np.where(y_cls_all == 1)[0]          # original indices of positives
pos_in_reg      = np.searchsorted(orig_pos_rows, idx_pos_in_bal)

X_reg_bal = X_reg[pos_in_reg]
y_reg_bal = y_reg[pos_in_reg]

In [8]:
def custom_obj(predt: np.ndarray, dtrain: xgb.DMatrix):
    n_cls = len(y_cls_bal)
    n_reg = len(y_reg_bal)

    p_hat  = predt[:n_cls]        # classification logits
    mu_hat = predt[n_cls:]        # regression predictions

    y_cls = y_cls_bal
    y_reg = y_reg_bal

    # classification
    grad_cls = p_hat - y_cls
    hess_cls = p_hat * (1. - p_hat)

    # regression (log-ratio)
    delta    = mu_hat - y_reg
    grad_reg = delta
    hess_reg = np.ones_like(delta)

    grad = np.concatenate([grad_cls, grad_reg])
    hess = np.concatenate([hess_cls, hess_reg])
    return grad, hess


def custom_eval(predt: np.ndarray, dtrain: xgb.DMatrix):
    n_cls = len(y_cls_bal)
    n_reg = len(y_reg_bal)

    p_hat  = predt[:n_cls]
    mu_hat = predt[n_cls:]

    # classification metric
    logloss = -np.mean(
        y_cls_bal * np.log(p_hat + 1e-12) +
        (1 - y_cls_bal) * np.log(1 - p_hat + 1e-12)
    )

    # regression metric
    log_mae = mean_absolute_error(y_reg_bal, mu_hat)

    return [('logloss', logloss), ('log_mae', log_mae)]

In [9]:
# 1. balanced classification rows
X_cls_bal = X_cls[idx_cls]
y_cls_bal = y_cls[idx_cls]

# 2. regression rows (positives only, already aligned to regression arrays)
idx_pos_in_bal = idx_cls[y_cls[idx_cls] == 1]
pos_in_reg     = np.searchsorted(np.where(y_cls_all == 1)[0], idx_pos_in_bal)

X_reg_bal = X_reg[pos_in_reg]
y_reg_bal = y_reg[pos_in_reg]

# 3. concatenate features and labels
X_train = np.concatenate([X_cls_bal, X_reg_bal], axis=0)
y_train = np.concatenate([y_cls_bal, y_reg_bal])   # 1-D vector

dtrain = xgb.DMatrix(X_train.astype(np.float32), label=y_train.astype(np.float32))

# 4. train
params = dict(
    max_depth=4,
    eta=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=3,
    disable_default_eval_metric=1
)

bst = xgb.train(
    params,
    dtrain,
    num_boost_round=600,
    obj=custom_obj,
    custom_metric=custom_eval, 
    verbose_eval=50
)

In [None]:
model_p  = xgb.XGBRegressor(
    objective='binary:logistic',
    n_estimators=600,
    max_depth=4,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=3,
    n_jobs=-1,
    random_state=42
)

model_mu = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=600,
    max_depth=4,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=3,
    n_jobs=-1,
    random_state=42
)

model_p.fit(X_cls_bal, y_cls_bal)
model_mu.fit(X_reg_bal, y_reg_bal)

def predict_line(models, candles_df, lengths=(1,3,5)):
    seq_df = candles_df.tail(max(lengths))
    X_seq, _, _ = make_multi_sequences(seq_df, lengths)
    p   = model_p.predict(X_seq[-1:])[0]
    mu  = np.exp(model_mu.predict(X_seq[-1:])[0])
    return None if p < 0.5 else mu

coeff = predict_line([model_p, model_mu], df)

In [12]:
coeff = predict_line(bst, df)   # <— use df, not df_c

IndexError: invalid index to scalar variable.

In [1]:
import pandas as pd

# Use the same path as in your Flask app
file_path = "/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles_prop.csv"

# Load the data
df = pd.read_csv(file_path)

# Convert the 'timestamp' column to actual datetime objects
# Assuming the column is named 'timestamp' or 'date'. Adjust if needed.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Sort by date just in case it's not ordered
df = df.sort_values('timestamp').reset_index(drop=True)

# Calculate the difference in days between each consecutive row
time_diffs = df['timestamp'].diff().dt.days

# Find and print the locations where the gap is larger than 1 day
gaps = df[time_diffs > 1]

if not gaps.empty:
    print("Found gaps in the data after the following dates:")
    # We print the row *before* the gap occurs
    print(df.iloc[gaps.index - 1])
else:
    print("No time gaps found in the data.")

No time gaps found in the data.
