In [17]:
import glob
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Discover the per-symbol CSV files. The list is sorted so the load order is
# the same on every run and platform (glob makes no ordering guarantee).
files = sorted(glob.glob("../hku-data/test_data/*.csv"))
data_list = []

for f in files:
    df = pd.read_csv(f)
    print(f"{f}: {len(df)} rows")

    # Unparseable timestamps become NaT instead of raising; report how many.
    df['date'] = pd.to_datetime(df['timestamp'], errors='coerce')
    n_bad_dates = int(df['date'].isna().sum())
    if n_bad_dates > 0:
        print(f"⚠️ Warning: {n_bad_dates} invalid dates in {f}")

    # Symbol = file name without directory or extension. Path handles both
    # '/' and the '\' separator that glob emits on Windows; splitting on '/'
    # alone would leave the folder name glued to the front of the symbol.
    df['symbol'] = Path(f).stem
    data_list.append(df)

# Stack the per-file frames into one table.
if not data_list:
    raise ValueError("No CSV files were loaded; nothing to concatenate")

# ignore_index=True gives every row a unique label. Each frame comes straight
# from read_csv with its own 0..N-1 index, so a plain concat would repeat the
# same labels once per file.
data = pd.concat(data_list, ignore_index=True)
print(f"\nВсего строк после объединения: {len(data)}")

# Keep the first row for each (date, symbol) pair.
# NOTE(review): rows whose date failed to parse (NaT) compare equal to each
# other here, so at most one such row per symbol survives — confirm intended.
before_drop = len(data)
data = data.drop_duplicates(subset=['date', 'symbol'])
print(f"Удалено дубликатов: {before_drop - len(data)}")
print(f"Итого строк после очистки: {len(data)}")

# Sort chronologically (ties broken by symbol) so that, within each symbol,
# row order is time order before computing look-back returns.
data = data.sort_values(by=['date', 'symbol'])

# Look-back length in ROWS, not days: pct_change(n) compares each close with
# the close n rows earlier within the same symbol.
# NOTE(review): the loaded files are named *_15m_*, so these are presumably
# 15-minute bars and n = 20 spans about 5 hours — confirm the intended horizon.
n = 20
data['return'] = data.groupby('symbol')['close'].pct_change(n)

ERROR! Session/line number was not unique in database. History logging moved to new session 9
../hku-data/test_data\1000BONK-USDT-PERP_15m_1460d.csv: 63344 rows
../hku-data/test_data\1000FLOKI-USDT-PERP_15m_1460d.csv: 82530 rows
../hku-data/test_data\1000PEPE-USDT-PERP_15m_1460d.csv: 82626 rows
../hku-data/test_data\1000SHIB-USDT-PERP_15m_1460d.csv: 140984 rows
../hku-data/test_data\AAVE-USDT-PERP_15m_1460d.csv: 140984 rows
../hku-data/test_data\ACH-USDT-PERP_15m_1460d.csv: 89546 rows
../hku-data/test_data\ADA-USDT-PERP_15m_1460d.csv: 140984 rows
../hku-data/test_data\ALGO-USDT-PERP_15m_1460d.csv: 140984 rows
../hku-data/test_data\APE-USDT-PERP_15m_1460d.csv: 122372 rows
../hku-data/test_data\APT-USDT-PERP_15m_1460d.csv: 101688 rows
../hku-data/test_data\AR-USDT-PERP_15m_1460d.csv: 138734 rows
../hku-data/test_data\ARB-USDT-PERP_15m_1460d.csv: 86760 rows
../hku-data/test_data\ATOM-USDT-PERP_15m_1460d.csv: 140984 rows
../hku-data/test_data\AVA-USDT-PERP_15m_1460d.csv: 26200 rows
../hku-

In [None]:
# === 4. Cross-sectional rank + Enhanced Momentum ===
def cross_section_rank(group, vol_weight=0.1):
    """Score one cross-section by percentile return rank, tilted by relative volume.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single cross-section. Must contain the columns ``'return'``
        and ``'volume'``. The input frame is not modified.
    vol_weight : float, default 0.1
        Strength of the volume tilt: the centred rank is multiplied by
        ``1 + vol_weight * z`` where ``z`` is the volume z-score within the
        group. ``0`` disables the tilt.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with two extra columns:

        * ``'cs_rank'`` – percentile rank of ``'return'`` within the group
          (NaN where the return is NaN).
        * ``'enhanced_mom'`` – ``(cs_rank - 0.5) * (1 + vol_weight * z)``.
    """
    scored = group.copy()
    scored['cs_rank'] = scored['return'].rank(pct=True)

    volume = scored['volume']
    vol_std = volume.std()
    if vol_std > 0:
        vol_scaled = (volume - volume.mean()) / vol_std
    else:
        # Zero dispersion, or NaN std (e.g. a single-row group): no tilt.
        vol_scaled = 0

    scored['enhanced_mom'] = (scored['cs_rank'] - 0.5) * (1 + vol_weight * vol_scaled)
    return scored

# Score each timestamp's cross-section of symbols with cross_section_rank.
# 'date' is selected explicitly: a selection that omits the grouping key hands
# the function frames without it, so the result would have no 'date' column and
# the later per-date aggregation (and the saved CSV) would lose the timestamps.
# Only the selected columns plus the two added score columns are kept in `data`.
data = (
    data.groupby('date', group_keys=False)[['date', 'symbol', 'return', 'volume']]
        .apply(cross_section_rank)
)


# Score cut-offs for entering a position.
top_threshold = 0.4
bottom_threshold = -0.4

# +1 where the score is above the upper cut-off, -1 where it is below the lower
# one, 0 everywhere else (NaN scores fail both comparisons and therefore get 0).
is_long = data['enhanced_mom'] > top_threshold
is_short = data['enhanced_mom'] < bottom_threshold
data['signal'] = is_long.astype('int64') - is_short.astype('int64')

# Persist the scored table and preview the first rows.
data.to_csv("cross_section_enhanced_momentum_signals.csv", index=False)
print(data.head(20))

# Average of the -1/0/+1 signal across all symbols at each timestamp.
mean_signal = data.groupby('date')['signal'].mean()

fig, ax = plt.subplots(figsize=(12, 6))
mean_signal.plot(ax=ax, title="Mean signal among all currencies")
ax.set_xlabel("Date")
ax.set_ylabel("Mean signal")
plt.show()