In [1]:
import os
import pandas as pd
import json
import requests
import logging

from urllib.parse import urlencode  # For Python 3

# Check if the log file already exists; if not, create it
log_file_path = 'catcher.log'
if not os.path.exists(log_file_path):
    open(log_file_path, 'a').close()  # Create an empty log file

# Configure logging to write to both console and file
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(),  # Log to console
        logging.FileHandler(log_file_path)  # Log to file
    ]
)

# Create the 'data2' directory if it doesn't exist
if not os.path.exists('data'):
    os.makedirs('data')

# Configure logging to write to both console and file
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# ... (logging handlers configuration remains the same)

url = 'http://aws.okx.com'
history_candles_base = '/api/v5/market/history-candles?instId=BTC-USDT-SWAP&bar=1Dutc'

store = pd.HDFStore('data/catcher.h5', 'w')

try:
    df = store['a0']
except KeyError:  # If 'data2' key does not exist in HDF5 filedf = df.sort_values(by='ts')
    df = pd.DataFrame()
    pass
try:
    df.rename(columns={'open': 'Open','high': 'High', 'low':'Low', 'close':'Close','volume': 'Volume'}, inplace=True)
except KeyError:
    print("One or more columns specified for renaming were not found in the DataFrame.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

# Continue with your code assuming df is now either loaded or initialized as empty

after_value = df.iloc[-1, 0] if not df.empty else None
before_value = df.iloc[0, 0] if not df.empty else None

# Convert columns to numeric or string
#for col in ['ts', 'Open', 'High', 'Low', 'Close','Volume', 'volCcy', 'volCcyQuote','confirm']:
#    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(str)
        
def fetch_and_append_data(after=None):
    logger.info('ftch_and_append_data()')
    params = {}
    if after:
        params['after'] = after

    history_candles_url = f"{url}{history_candles_base}&{urlencode(params)}"

    logger.info(f"Sending request to {history_candles_url}")

    response = requests.get(history_candles_url)
    response_json = json.loads(response.text)

    # Assuming 'data' key contains the actual candlestick data with the expected column names
    new_df = pd.DataFrame(response_json['data'], columns=['ts', 'Open', 'High', 'Low', 'Close','Volume', 'volCcy', 'volCcyQuote', 'confirm'])
    
    if new_df.empty:
        return
    
    # Make sure the 'ts' column is converted to numeric if possible, else convert it to string
    new_df['ts'] = pd.to_numeric(new_df['ts'], errors='coerce')

    global df
    
    # Check for next page (pagination) based on the length of the new data batch
    if len(new_df) >= 100:  # Assuming each batch returns at least 100 rows
        df = pd.concat([df, new_df], axis=0).reset_index(drop=True)  # Append new_df to df first

        # Save DataFrame to HDF5
        logger.info("Saving DataFrame to HDF5 file...")
        store.put('a0', df, format='table')
        logger.info("DataFrame saved successfully.")

        after_value = new_df.iloc[-1, 0]  # Get the timestamp of the last row in the new batch
        fetch_and_append_data(after=after_value)

    else:  # If the new batch has less than 100 rows (assuming this indicates the end of pagination)
        df = pd.concat([df, new_df], axis=0).reset_index(drop=True)  # Append new_df to df even if it's the last batch

        # Convert columns to numeric or string
        for col in ['ts', 'Open', 'High', 'Low', 'Close', 'Volume', 'volCcy', 'volCcyQuote', 'confirm']:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Save DataFrame to HDF5
        logger.info("Saving DataFrame to HDF5 file...")
        store.put('a0', df, format='table')
        logger.info("DataFrame saved successfully.")

        # If there are no more pages or the last page had less than 100 rows, save the final DataFrame
        df = df.apply(pd.to_numeric, errors='coerce')
        # Jump out of the function
        return df



def update(before=None):
    logger.info('update()')
    params = {}
    if before:
        params['before'] = before

    history_candles_url = f"{url}{history_candles_base}&{urlencode(params)}"

    logger.info(f"Sending request to {history_candles_url}")

    response = requests.get(history_candles_url)
    response_json = json.loads(response.text)

    # Assuming 'data' key contains the actual candlestick data with the expected column names
    new_df = pd.DataFrame(response_json['data'], columns=['ts', 'Open', 'High', 'Low', 'Close','Volume', 'volCcy', 'volCcyQuote','confirm'])
    
    # Make sure the 'ts' column is converted to numeric if possible, else convert it to string
    new_df['ts'] = pd.to_numeric(new_df['ts'], errors='coerce')
    
    # If there's no new data, stop fetching
    if new_df.empty:
        return
    
    global df
    
    # Check for next page (pagination) based on the length of the new data batch
    if len(new_df) >= 100:  # Assuming each batch returns at least 100 rows
        df = pd.concat([new_df, df], axis=0).reset_index(drop=True)  # Append new_df to df first
        df = df.apply(pd.to_numeric, errors='coerce')
        # Save DataFrame to HDF5
        logger.info("Saving DataFrame to HDF5 file...")
        store.put('a0', df, format='table')
        logger.info("DataFrame saved successfully.")

        before_value = new_df.iloc[0, 0]  # Get the timestamp of the last row in the new batch
        update(before=before_value)

    else:  # If the new batch has less than 100 rows (assuming this indicates the end of pagination)
        df = pd.concat([new_df, df], axis=0).reset_index(drop=True)  # Append new_df to df even if it's the last batch

        # Convert columns to numeric or string
        for col in ['ts', 'Open', 'High', 'Low', 'Close', 'Volume', 'volCcy', 'volCcyQuote', 'confirm']:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Save DataFrame to HDF5
        logger.info("Saving DataFrame to HDF5 file...")
        store.put('a0', df, format='table')
        logger.info("DataFrame saved successfully.")

        # If there are no more pages or the last page had less than 100 rows, save the final DataFrame

        # Jump out of the function
        return df

fetch_and_append_data(after_value)

before_value = df.iloc[0, 0] if not df.empty else None
update(before_value) 

store.close()

# Assuming your DataFrame is loaded with 'ts' column in milliseconds
df['datetime'] = pd.to_datetime(df['ts'], unit='ms')  # Convert timestamps to datetime (ignore the FutureWarning for now)
df.set_index('datetime', inplace=True)  # Set datetime as the index for time series operations
df['weekday'] = df.index.weekday  # Weekday: Monday=0, Tuesday=1, ..., Sunday=6
# Optionally add other time-related features:
df['day'] = df.index.day
df = df.drop(columns = ['confirm'])
# Instead of reopening an existing store, create a new one with the desired filename
new_store_filename = './data//catcher.h5'
store = pd.HDFStore(new_store_filename)

logging.info("Saving DataFrame with preload to new HDF5 file...")
store.put('a1', df, format='table')

# Don't forget to close the store after saving
store.close()

# Logging that the script has finished execution
logger.info(f"Script execution completed. DataFrame shape: {df.shape}")
print(f'df:{df.shape}')

df:(1502, 10)


In [2]:
import gc
#del format, level, handlers, logger, url, history_candles_base, store, after_value, before_value, params, history_candles_url, response, response_json, new_df, fetch_and_append_data, after, update before
 
gc.collect()

76

In [3]:
import numpy as np

signal = df['Close']

# Assuming signal is your input data
fft_result = np.abs(np.fft.fft(signal))

# Get the indices of the three largest values
top_3= fft_result.argsort()[-8:][::-1]
top_3 = top_3.astype('int32')

slow_Upper = min(top_3[1],top_3[2])
length_Upper = min(top_3[3],top_3[4])
fast_Upper = min(top_3[5],top_3[6])
print(top_3)

del top_3
gc.collect()

[   0 1500    2 1501    1 1496    6    3]


0

In [4]:
!pip install pandas_ta

Collecting pandas_ta
  Downloading pandas_ta-0.3.14b.tar.gz (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.1/115.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: pandas_ta
  Building wheel for pandas_ta (setup.py) ... [?25l- \ | done
[?25h  Created wheel for pandas_ta: filename=pandas_ta-0.3.14b0-py3-none-any.whl size=218910 sha256=4eff0de2f3e10f126871a8e14b2565122e086b33ab908c1210882fa2ee5464ac
  Stored in directory: /root/.cache/pip/wheels/69/00/ac/f7fa862c34b0e2ef320175100c233377b4c558944f12474cf0
Successfully built pandas_ta
Installing collected packages: pandas_ta
Successfully installed pandas_ta-0.3.14b0


In [5]:
import numpy as np
import pandas as pd
import pandas_ta as ta
import h5py
import warnings
from sklearn.utils import compute_class_weight
import numpy as np
import pandas as pd
#import gc
from sklearn.cluster import KMeans

warnings.filterwarnings('ignore')

df = df.sort_values(by='ts')

# Assuming n_clusters is the number of clusters you want
n_clusters = 3

# Initialize the KMeans model
kmeans = KMeans(n_clusters=n_clusters)
# target drop
df['N_C'] = df['Close'].shift(-1)
df['N_H'] = df['High'].shift(-1)
df['N_L'] = df['Low'].shift(-1)
df['N_V'] = df['Volume'].shift(-1)


df = df.apply(pd.to_numeric, errors='coerce')


# define target  
df['tarPer_C'] = (df['N_C'] - df['Close'])/ df['Close']
df['tarPer_H'] = (df['N_H'] - df['Close']) / df['Close']
df['tarPer_L']= (df['N_L'] - df['Close']) / df['Close']
df['tarPer_V']= (df['N_V'] - df['Volume']) / df['Volume']

# drop furtue index
df = df.drop(columns=['N_C', 'N_H', 'N_L', 'N_V'])


df['High'] = pd.to_numeric(df['High'], errors='coerce')
df['Low'] = pd.to_numeric(df['Low'], errors='coerce')
# High-Low midpoint (HL2)
df['HL2']=(df['High']+df['Low']) / 2


# Typical Price (HLC3)
df.ta.hlc3(offset=None)

# Volume Weighted Average Price (VWAP)
df.ta.vwap(anchor=None, offset=None, append=True)


# Weighted Close Price (WCP)
df.ta.wcp(offset=None, append=True)


df['ML'] = df['High'] - df["Low"]
df['PV_ML'] = df['ML'] / df['Volume']
df['BDL'] = df['Close'] - df['Low']
df['AKL'] = df['High'] - df['Close']
df['PV_BDL'] = df['BDL'] / df['Volume']
df['PV_AKL'] = df['AKL'] / df['Volume']
df['bull_l'] = df['BDL'] - df['AKL']
df['divbull_l'] = df['BDL'] / df['AKL']


df['PS_AKL'] = df['AKL'] / df['ML']


df['bullrock'] = df['Close'] - df['Open']
df['SA'] = df['ML'] - df['bullrock']
df['max_bull_zone'] = df[['Close', 'Open']].sub(df['Low'], axis=0).max(axis=1)
df['up_SA'] = df['ML'] - df['max_bull_zone']
df['low_SA'] = df['max_bull_zone'] - abs(df['bullrock'])
df['subUL'] = df['up_SA'] - df['low_SA']
df['divUL'] = df['up_SA'] / df['low_SA']
df['posLM'] = df['max_bull_zone'] / df['ML']
df['pos_low_SA'] = df['low_SA']/ df['ML']
df['pos_bullrock'] = df['bullrock']/ +df['ML']
df['PVLM'] = df['max_bull_zone'] / df['Volume']
df['PV_up_SA'] = df['up_SA'] / (  df['Volume'])
df['PV_low_SA'] = df['low_SA']/ (  df['Volume'])
df['PV_bullrock'] = df['bullrock']/ (  df['Volume'])
df['PV_SA'] = df['SA']/ (  df['Volume'])


len_c = len(df['Close'])
df['C_r'] = df['Close'].rank(ascending=False) / len_c
df['H_r'] = df['High'].rank(ascending=False) / len_c
df['L_r'] = df['Low'].rank(ascending=False) / len_c
df['V_r'] = df['Volume'].rank(ascending=False) / len_c

df['W_r'] = df['VWAP_D'].rank(ascending=False) / len_c

    
values, counts = np.unique(df['Volume'], return_counts=True)
weights = compute_class_weight('balanced', classes=values, y=df['Volume'])
df['Vgini'] = 1 - np.sum((weights * counts) ** 2) / (np.sum(counts) ** 2)

values, counts = np.unique(df['VWAP_D'], return_counts=True)
weights = compute_class_weight('balanced', classes=values, y=df['VWAP_D'])
df['Wgini'] = 1 - np.sum((weights * counts) ** 2) / (np.sum(counts) ** 2)

values, counts = np.unique(df['Close'], return_counts=True)
weights = compute_class_weight('balanced', classes=values, y=df['Close'])
df['Cgini'] = 1 - np.sum((weights * counts) ** 2) / (np.sum(counts) ** 2)   

bounds_list = [length_Upper, slow_Upper, fast_Upper]

for length_ in bounds_list:

    df['L_C'+str(length_)] = df['Close'].shift(length_)
    df['L_H'+str(length_)] = df['High'].shift(length_)
    df['L_L'+str(length_)] = df['Low'].shift(length_)
    df['L_V'+str(length_)] = df['Volume'].shift(length_)
    df['L_O'+str(length_)] = df['Open'].shift(length_)

    df['L_C'+str(length_)] = df['L_C'+str(length_)].fillna(0)
    df['L_O'+str(length_)] = df['L_O'+str(length_)].fillna(0)
    df['L_L'+str(length_)] = df['L_L'+str(length_)].fillna(0)    

    df['amihud'+str(length_)] = (2*(df['L_H'+str(length_)] - df['L_L'+str(length_)]) - abs(df['L_O'+str(length_)] - df['L_C'+str(length_)])) / df['L_V'+str(length_)]

    df['ADClose'+str(length_)] = df['Close'] - df['L_C'+str(length_)]
    df['ADHigh'+str(length_)] = df['High'] - df['L_H'+str(length_)]
    df['ADLow'+str(length_)] = df['Low'] - df['L_L'+str(length_)]
    df['ADVolume'+str(length_)] = df['Volume'] - df['L_V'+str(length_)]

    df['PV_ADClose'+str(length_)] = df['ADClose'+str(length_)] / (  df['L_V'+str(length_)] )
    df['PV_ADHigh'+str(length_)] = df['ADHigh'+str(length_)] / (  df['L_V'+str(length_)] )
    df['PV_Low'+str(length_)] = df['ADLow'+str(length_)] / (  df['L_V'+str(length_)] )

    df['divClose'+str(length_)] = df['Close'] / (  df['L_C'+str(length_)])
    df['divHigh'+str(length_)] = df['High'] / (  df['L_H'+str(length_)])
    df['divLow'+str(length_)] = df['Low'] / (  df['L_L'+str(length_)])
    df['divVolume'+str(length_)] = df['Volume'] / (  df['L_V'+str(length_)]    )

    df['DELow'+str(length_)] = df['ADLow'+str(length_)] / (  df['L_L'+str(length_)])
    df['ML'+str(length_)]  = df['L_H'+str(length_)] - df['L_L'+str(length_)] 
    df['PV_ML' +str(length_)]= df['ML'+str(length_)]  / (  df['L_V'+str(length_)] )
    df['BDL'+str(length_)] = df['L_C'+str(length_)]- df['L_L'+str(length_)] 
    df['AKL'+str(length_)] = df['L_H'+str(length_)]- df['L_C'+str(length_)]
    df['PV_BDL'+str(length_)]= df['BDL'+str(length_)] / (  df['L_V'+str(length_)] )
    df['PV_AKL'+str(length_)] = df['AKL'+str(length_)] / (  df['L_V'+str(length_)] )
    df['bull_l'+str(length_)] = df['BDL'+str(length_)] - df['AKL'+str(length_)]
    df['divbull_l'+str(length_)] = df['BDL'+str(length_)] / (  df['AKL'+str(length_)])
    df['PS_AKL'+str(length_)] = df['AKL'+str(length_)] / (  df['ML'+str(length_)] )
    df['bullrock'+str(length_)] = df['L_C'+str(length_)]- df['L_O'+str(length_)]
    df['SA'+str(length_)] = df['ML'+str(length_)]  - df['bullrock'+str(length_)]

    df['max_bull_zone'+str(length_)] = np.where(df['bullrock'+str(length_)] >= 0,
                                          df['L_C'+str(length_)] - df['L_L'+str(length_)],
                                          df['L_O'+str(length_)] - df['L_L'+str(length_)])

    df['up_SA'+str(length_)] = df['ML'+str(length_)]  - df['max_bull_zone'+str(length_)]
    df['low_SA'+str(length_)] = df['max_bull_zone'+str(length_)] - abs(df['bullrock'+str(length_)])

    df['subUL' ] = df['up_SA'+ str(length_) ] - df['low_SA'+ str(length_) ]
    df['divUL'+ str(length_) ] = df['up_SA'+ str(length_) ] / (  df['low_SA'+ str(length_) ])

    df['posLM'+str(length_)] = df['max_bull_zone'+str(length_)] / (  df['ML'+str(length_)] )
    df['pos_up_SA'+str(length_)] = df['up_SA'+str(length_)] / (  df['ML'+str(length_)] )
    df['pos_low_SA'+str(length_)] = df['low_SA'+str(length_)]/ (  df['ML'+str(length_)] )
    df['pos_bullrock'+str(length_)] = df['bullrock'+str(length_)]/ (  df['ML'+str(length_)] )
    df['pos_SA'+str(length_)] = df['SA'+str(length_)]/ (  df['ML'+str(length_)] )

    df['PVLM'+str(length_)] = df['max_bull_zone'+str(length_)] / (  df['L_V'+str(length_)] )
    df['PV_up_SA'+str(length_)] = df['up_SA'+str(length_)] / (  df['L_V'+str(length_)] )
    df['PV_low_SA'+str(length_)] = df['low_SA'+str(length_)]/ (  df['L_V'+str(length_)] )
    df['PV_bullrock'+str(length_)] = df['bullrock'+str(length_)]/ (  df['L_V'+str(length_)] )
    df['PV_SA'+str(length_)] = df['SA'+str(length_)]/ (  df['L_V'+str(length_)] )

    # sub

    df['sub0ML'+str(length_)]  = df['ML'] - df['ML'+str(length_)] 
    df['sub0PV_ML' +str(length_)]=  df['PV_ML'] - df['PV_ML' +str(length_)]
    df['sub0BDL'+str(length_)] = df['BDL'] - df['BDL'+str(length_)]  
    df['sub0AKL'+str(length_)] = df['AKL'] - df['AKL'+str(length_)] 
    df['sub0PV_BDL'+str(length_)] = df['PV_BDL'] - df['PV_BDL'+str(length_)] 
    df['sub0PV_AKL'+str(length_)] = df['PV_AKL'] - df['PV_AKL'+str(length_)] 
    df['sub0bull_l'+str(length_)] = df['bull_l'] - df['bull_l'+str(length_)] 
    df['sub0divbull_l'+str(length_)] = df['divbull_l'] - df['divbull_l'+str(length_)] 

    #df['sub0logbull_l'+str(length_)] = df['logbull_l'] - df['logbull_l'+str(length_)] 
    #df['sub0PS_BDL'+str(length_)] = df['PS_BDL'] - df['PS_BDL'+str(length_)] 

    df['sub0bullrock'+str(length_)] = df['bullrock'] - df['bullrock'+str(length_)] 
    df['sub0SA'+str(length_)] = df['SA'] - df['SA'+str(length_)]  
    df['sub_fuLM'+str(length_)] = df['max_bull_zone'] - df['max_bull_zone'+str(length_)] 
    df['sub0up_SA'+str(length_)] = df['up_SA'] - df['up_SA'+str(length_)] 
    df['sub0low_SA'+str(length_)] = df['low_SA'] - df['low_SA'+str(length_)] 

    df['sub0posLM'+str(length_)] = df['posLM'] - df['posLM'+str(length_)] 
    #df['sub0pos_up_SA'+str(length_)] = df['pos_up_SA'] - df['pos_up_SA'+str(length_)] 
    df['sub0pos_low_SA'+str(length_)] = df['pos_low_SA'] - df['pos_low_SA'+str(length_)] 
    df['sub0pos_bullrock'+str(length_)] = df['pos_bullrock'] - df['pos_bullrock'+str(length_)] 

    df['sub0PVLM'+str(length_)] = df['PVLM'] - df['PVLM'+str(length_)] 
    df['sub0PV_up_SA'+str(length_)] = df['PV_up_SA'] - df['PV_up_SA'+str(length_)]  
    df['sub0PV_low_SA'+str(length_)] = df['PV_low_SA'] - df['PV_low_SA'+str(length_)] 
    df['sub0PV_bullrock'+str(length_)] = df['PV_bullrock'] - df['PV_bullrock'+str(length_)] 
    df['sub0PV_SA'+str(length_)] = df['PV_SA'] - df['PV_SA'+str(length_)] 

    # div

    df['div0ML'+str(length_)]  = df['ML'] / (  df['ML'+str(length_)] )
    df['div0PV_ML' +str(length_)]=  df['PV_ML'] / (  df['PV_ML' +str(length_)])
    df['div0BDL'+str(length_)] = df['BDL'] / (  df['BDL'+str(length_)]  )
    df['div0AKL'+str(length_)] = df['AKL'] / (  df['AKL'+str(length_)] )
    df['div0PV_BDL'+str(length_)] = df['PV_BDL'] / (  df['PV_BDL'+str(length_)] )
    df['div0PV_AKL'+str(length_)] = df['PV_AKL'] / (  df['PV_AKL'+str(length_)] )
    df['div0bull_l'+str(length_)] = df['bull_l'] / (  df['bull_l'+str(length_)] )

    #df['div0logbull_l'+str(length_)] = df['logbull_l'] / (  df['logbull_l'+str(length_)] )
    #df['div0PS_BDL'+str(length_)] = df['PS_BDL'] / (  df['PS_BDL'+str(length_)] )
    df['div0PS_AKL'+str(length_)] = df['PS_AKL'] / (  df['PS_AKL'+str(length_)] )
    #df['div0PS_bull_l'+str(length_)] = df['PS_bull_l'] / (  df['PS_bull_l'+str(length_)] )

    df['div0bullrock'+str(length_)] = df['bullrock'] / (  df['bullrock'+str(length_)] )
    df['div0SA'+str(length_)] = df['SA'] / (  df['SA'+str(length_)]  )
    df['div_fuLM'+str(length_)] = df['max_bull_zone'] / (  df['max_bull_zone'+str(length_)] )
    df['div0low_SA'+str(length_)] = df['low_SA'] / (  df['low_SA'+str(length_)] )

    #df['div0pos_up_SA'+str(length_)] = df['pos_up_SA'] / (  df['pos_up_SA'+str(length_)] )
    df['div0pos_low_SA'+str(length_)] = df['pos_low_SA'] / (  df['pos_low_SA'+str(length_)] )
    df['div0pos_bullrock'+str(length_)] = df['pos_bullrock'] / (  df['pos_bullrock'+str(length_)] )
    #df['div0pos_SA'+str(length_)] = df['pos_SA'] / (  df['pos_SA'+str(length_)]  )

    df['div0PVLM'+str(length_)] = df['PVLM'] / (  df['PVLM'+str(length_)] )
    df['div0PV_up_SA'+str(length_)] = df['PV_up_SA'] / (  df['PV_up_SA'+str(length_)]  )
    df['div0PV_low_SA'+str(length_)] = df['PV_low_SA'] / (  df['PV_low_SA'+str(length_)] )
    df['div0PV_bullrock'+str(length_)] = df['PV_bullrock'] / (  df['PV_bullrock'+str(length_)] )
    df['div0PV_SA'+str(length_)] = df['PV_SA'] / (  df['PV_SA'+str(length_)] )

    # log

    df['log0PV_BDL'+str(length_)] = np.log(abs(df['PV_BDL'] ))- np.log(abs(df['PV_BDL'+str(length_)] ))
    df['log0PV_AKL'+str(length_)] = np.log(abs(df['PV_AKL'])) - np.log(abs(df['PV_AKL'+str(length_)] ))
    df['log0bull_l'+str(length_)] = np.log(abs(df['bull_l'])) - np.log(abs(df['bull_l'+str(length_)] ))
    #df['log0logbull_l'+str(length_)] = np.log(abs(df['logbull_l'])) - np.log(abs(df['logbull_l'+str(length_)]) )
    #df['log0PS_bull_l'+str(length_)] = np.log(abs(df['PS_bull_l']) )- np.log(abs(df['PS_bull_l'+str(length_)]) )
    df['log0bullrock'+str(length_)] = np.log(abs(df['bullrock'])) - np.log(abs(df['bullrock'+str(length_)]) )
    df['log0pos_bullrock'+str(length_)] = np.log(abs(df['pos_bullrock'])) - np.log(abs(df['pos_bullrock'+str(length_)]))
    df['log0PV_bullrock'+str(length_)] = np.log(abs(df['PV_bullrock']) )- np.log(abs(df['PV_bullrock'+str(length_)]) )
    df['log0PV_SA'+str(length_)] = np.log(abs(df['PV_SA'])) - np.log(abs(df['PV_SA'+str(length_)])) 


def convert_days_to_periods(d, x):
    total_days = len(x)
    return int(np.floor(total_days * d))

def rank(x: pd.Series):
    # Assuming pct=True gives percentage rank
    return x.rank(pct=True)

def delay(x: pd.Series, d: int):
    if d < 0 or d >= len(x):
        raise ValueError(f"Invalid delay value d={d}. It should be between 0 and {len(x) - 1}")
    return x.shift(d)

def correlation(x: pd.Series, y: pd.Series, d: float):
    window_size = convert_days_to_periods(d, x.index)
    return x.rolling(window=window_size).corr(y)

def rolling_covariance(x: pd.Series, y: pd.Series, window_size: int):
    return x.rolling(window=window_size).cov(y)

def scale(x: pd.Series, a: float = 1.0):
    norm = np.sum(np.abs(x))
    return x * (a / norm)

def delta(x: pd.Series, d: int):
    return x - delay(x, d)

def signedpower(x: pd.Series, a: float):
    return np.sign(x) * np.abs(x) ** a

def decay_linear(x: pd.Series, d: float):
    weights = np.arange(1, convert_days_to_periods(d+1, x)+1)[::-1] / sum(np.arange(1, convert_days_to_periods(d+1, x)+1))
    return x.rolling(window=len(weights), min_periods=1).apply(lambda w: np.dot(w, weights))

def indneutralize(x: pd.Series, g: pd.Series):
    group_means = x.groupby(g).transform('mean')
    return x - group_means

def ts_min(x: pd.Series, d: int):
    return x.rolling(d).min()

def ts_max(x: pd.Series, d: int):
    return x.rolling(d).max()

def ts_min_day(x: pd.Series, d: int):
    # Get the index of the minimum values
    min_values = x.rolling(d).min()
    min_indices = min_values.idxmin()
    
    # Extract the day component from the DatetimeIndex
    days = [x.index[i].day for i in min_indices]
    return pd.Series(days, index=min_indices.index)

def ts_max_day(x: pd.Series, d: int):
    # Same process for the maximum values
    max_values = x.rolling(d).max()
    max_indices = max_values.idxmax()
    
    # Extract the day component from the DatetimeIndex
    days = [x.index[i].day for i in max_indices]
    return pd.Series(days, index=max_indices.index)

def ts_rank(x: pd.Series, d: int):
    # Note that pandas' rolling rank is deprecated; use expanding instead
    return x.expanding(min_periods=d).rank()

def min(x: pd.Series, d: int):
    return ts_min(x, d)

def max(x: pd.Series, d: int):
    return ts_max(x, d)

def sum(x: pd.Series, d: int):
    return x.rolling(d).sum()

def product(x: pd.Series, d: int):
    # Be cautious with this function as it could lead to underflow/overflow issues
    return x.rolling(d).apply(np.prod, raw=True)

def stddev(x: pd.Series, d: int):
    return x.rolling(d).std()


    
for length_ in bounds_list:

    df['Ccv+str(length_)'] = df['Close'].rolling(window=length_).std() / df['Close'].rolling(window=length_).mean()
    df['Hcv+str(length_)'] = df['High'].rolling(window=length_).std() / df['High'].rolling(window=length_).mean()
    df['Lcv+str(length_)'] = df['Low'].rolling(window=length_).std() / df['Low'].rolling(window=length_).mean()
    df['Vcv+str(length_)'] = df['Volume'].rolling(window=length_).std() / df['Volume'].rolling(window=length_).mean()
    df['Wcv+str(length_)'] = df['VWAP_D'].rolling(window=length_).std() / df['VWAP_D'].rolling(window=length_).mean()

    df['Crange_values'+str(length_)] = df['Close'].rolling(window=length_).max() - df['Close'].rolling(window=length_).min()
    df['Hrange_values'+str(length_)] = df['High'].rolling(window=length_).max() - df['High'].rolling(window=length_).min()
    df['Lrange_values'+str(length_)] = df['Low'].rolling(window=length_).max() - df['Low'].rolling(window=length_).min()
    df['Vrange_values'+str(length_)] = df['Volume'].rolling(window=length_).max() - df['Volume'].rolling(window=length_).min()
    df['Wrange_values'+str(length_)] = df['VWAP_D'].rolling(window=length_).max() - df['VWAP_D'].rolling(window=length_).min()

    df['Cpercentile_90'+str(length_)] = df['Close'].rolling(window=length_).quantile(0.9)
    df['Hpercentile_90'+str(length_)] = df['High'].rolling(window=length_).quantile(0.9)
    df['Lpercentile_90'+str(length_)] = df['Low'].rolling(window=length_).quantile(0.9)
    df['Vpercentile_90'+str(length_)] = df['Volume'].rolling(window=length_).quantile(0.9)
    df['Wpercentile_90'+str(length_)] = df['VWAP_D'].rolling(window=length_).quantile(0.9)


    df[f'Cautcorr{length_}'] = df['Close'].autocorr(lag=length_) 
    df[f'Hautcorr{length_}'] = df['High'].autocorr(lag=length_) 
    df[f'Lautcorr{length_}'] = df['Low'].autocorr(lag=length_) 
    df[f'Vautcorr{length_}'] = df['Volume'].autocorr(lag=length_) 
    df[f'Wautcorr{length_}'] = df['VWAP_D'].autocorr(lag=length_) 


    df[f'CVKMclst{length_}'] = kmeans.fit_predict(df[[f'Cautcorr{length_}', f'Vautcorr{length_}']])
    df[f'CWKMclst{length_}'] = kmeans.fit_predict(df[[f'Cautcorr{length_}', f'Wautcorr{length_}']])


    df[f'HVKMclst{length_}'] = kmeans.fit_predict(df[[f'Hautcorr{length_}', f'Vautcorr{length_}']])
    df[f'HWKMclst{length_}'] = kmeans.fit_predict(df[[f'Hautcorr{length_}', f'Wautcorr{length_}']])

    df[f'LVKMclst{length_}'] = kmeans.fit_predict(df[[f'Lautcorr{length_}', f'Vautcorr{length_}']])
    df[f'LWKMclst{length_}'] = kmeans.fit_predict(df[[f'Lautcorr{length_}', f'Wautcorr{length_}']])


    df['entropy'+ str(length_)] = ta.entropy(df['Close'], length_)
    df['kurtosis'+ str(length_)] = ta.kurtosis(df['Close'], length_)
    df['mad'+ str(length_)] = ta.mad(df['Volume'], length_)
    df['quantile_50'+ str(length_)] = ta.quantile(df['Close'], q=0.5,  window=length_)  # For 50th percentile (median)



    df['del_val'+ str(length_)] = delay(df['Close'], length_)

    df['cov_val'+ str(length_)] = rolling_covariance(df['Close'], df['L_C'+str(length_)], length_)

 

    df['amihud'] = (2*(df['High'] - df['Low']) - abs(df['Open'] - df['Close'])) / df['Volume']
    df['rolling_std'+str(length_)] = df['Close'].rolling(window=length_).std()

    df['min_val'+ str(length_)] = ts_min(df['Close'], length_)
    # Other time-series operations using ts_operator
    df['max_val'+ str(length_)] = ts_max(df['Close'], length_)

    

start_mem = df.memory_usage().sum() / 1024**2

for col in df.columns:
    col_type = df[col].dtype

    if col_type != object:
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float32)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float32)


logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
end_mem = df.memory_usage().sum() / 1024**2
logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
decrease = 100 * (start_mem - end_mem) / start_mem
logger.info(f"Decreased by {decrease:.2f}%")
print(df.shape)

(1502, 415)


In [6]:
lst = [n_clusters, kmeans, values, counts, weights, bounds_list, start_mem, col_type, c_min, c_max, end_mem, decrease]
del lst
gc.collect()

0

In [7]:
import h5py
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer


df = df[slow_Upper:]
df = df.dropna(axis=1,how='all')

# Replace infinity and large values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

df.columns = df.columns.astype(str)

# Now proceed with the imputation
imputer = SimpleImputer(strategy='median')
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Identify and drop columns with all the same value in-place
df.drop(df.columns[df.nunique() == 1], axis=1, inplace=True)

columns_to_drop = ['tarPer_C', 'tarPer_H', 'tarPer_L', 'tarPer_V', 'ts']

yc = df['tarPer_C']
yh = df['tarPer_H']
yl = df['tarPer_L']
yv = df['tarPer_V']

df = df.drop(columns = columns_to_drop, axis=1)

print(f'df: {df.shape}')

del imputer
gc.collect()

df: (1500, 363)


9158

In [8]:
foInd = df.columns.get_loc('divbull_l')
df1 = df.iloc[:, foInd+1:]
df0 = df.iloc[:, :foInd]
from sklearn.decomposition import PCA
import pandas as pd

# Assuming df is your DataFrame with n columns where n > 30
pca = PCA(n_components=30)

# Fit and transform the data
principal_components = pca.fit_transform(df1)

# Create a new DataFrame with the first 30 principal components
df2 = pd.DataFrame(principal_components, columns=[f'PC{i}' for i in range(1, 31)])
#df = pd.comcat([df0,df2],axis = 1)
print(f'df2:{df2.shape}')

del pca, principal_components
gc.collect()

df2:(1500, 30)


0

In [9]:
from gplearn.genetic import SymbolicTransformer
import numpy as np

function_set = ['add', 'sub', 'mul', 'div',
                'sqrt', 'log', 'abs', 'neg', 'inv',
                'max', 'min', 'sin', 'cos', 'tan']

n_components_=10

gp = SymbolicTransformer( population_size=200, hall_of_fame=80, n_components=n_components_, generations=3, tournament_size=1, stopping_criteria=1.0, const_range=(-1.0, 1.0), init_depth=(2, 6), init_method='half and half', function_set=function_set, metric='pearson', parsimony_coefficient=0.001, p_crossover=0.9, p_subtree_mutation=0.01, p_hoist_mutation=0.01, p_point_mutation=0.01, p_point_replace=0.05, max_samples=1.0, feature_names=None, warm_start=False, low_memory=False, n_jobs=1, verbose=0, random_state=None)
 

# Fit the model
gp.fit(df, yc)
dfgp = pd.DataFrame(data = gp.transform(df))

print(f'dfgp"{dfgp.shape}')

del function_set, n_components_,gp
gc.collect()

dfgp"(1500, 10)


81

In [10]:
df = pd.concat([df0, df2, dfgp], axis = 1)
print(f'df:{df.shape}')

df:(1500, 59)


In [11]:
X = df[:-1]   
pyc = yc[:-1]
pyh = yh[:-1]
pyl = yl[:-1]
pyv = yv[:-1]
         
news = df[-1:]
nyc = yc[-1:]
nyh = yh[-1:]
nyl = yl[-1:]
nyv = yv[-1:]
print(f'X:{X.shape}')

del yc,yh,yl,yv
gc.collect()

X:(1499, 59)


0

In [12]:

import lightgbm as lgb  
from hyperopt import hp, fmin, tpe,space_eval
from lightgbm import LGBMRegressor, cv
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


# Define searched space
hyper_space = {'objective': 'regression',
               'metric':'rmse',
               'boosting':'gbdt', 'device':'gpu',#'gpu_device_id': 0,
               #'n_estimators': hp.choice('n_estimators', [25, 40, 50, 75, 100, 250, 500]),
               'max_depth':  hp.choice('max_depth', list(range(6, 18, 2))),
               'num_leaves': hp.choice('num_leaves', list(range(20, 180, 20))),
               'subsample': hp.choice('subsample', [.7, .8, .9, 1]),
               'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1),
               'learning_rate': hp.uniform('learning_rate', 0.03, 0.12),
               #'reg_alpha': hp.choice('reg_alpha', [.1, .2, .3, .4, .5, .6]),
               #'reg_lambda':  hp.choice('reg_lambda', [.1, .2, .3, .4, .5, .6]),               
               'min_child_samples': hp.choice('min_child_samples', [20, 45, 70, 100])
              }


# Assuming df is your DataFrame with features in all columns except the last one (target)

y = pyc

# Split the data into training and testing sets (80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

params = {
    'boosting_type': 'gbdt',
    # Add more params as needed
    'verbosity': -1,
    'early_stopping_round': 10
}

def evaluate_metric(params):

    # Convert data to DMatrix format required by LightGBM
    dtrain = lgb.Dataset(X_train, label=y_train)
    dtest = lgb.Dataset(X_test, label=y_test)

    lgbm_reg = lgb.train(params, dtrain, 2000, valid_sets = [dtest])
                        
    pred_lgb = lgbm_reg.predict(X_test, num_iteration=lgbm_reg.best_iteration)

    score_uni = np.sqrt(mean_squared_error(pred_lgb, y_test))
    print(f'Score Validation : {score_uni}')
    return score_uni

# Seting the number of Evals
MAX_EVALS= 15

# Fit Tree Parzen Estimator
best_vals = fmin(evaluate_metric, 
                 space=hyper_space,
                 verbose=-1,
                 algo=tpe.suggest, 
                 max_evals=MAX_EVALS)

# Print best parameters
best_params = space_eval(hyper_space, best_vals)

print(best_params)

params = best_params

del hyper_space, y, X_train, X_test, y_train, y_test, best_vals,  best_params
gc.collect()


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 14574
[LightGBM] [Info] Number of data points in the train set: 1199, number of used features: 59
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]



[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 59 dense feature groups (0.07 MB) transferred to GPU in 0.002185 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.001759
Score Validation : 0.0360818525484429
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 14574
[LightGBM] [Info] Number of data points in the train set: 1199, number of used features: 59
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 59 dense feature groups (0.07 MB) transferred to GPU in 0.001367 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.001759
Score Validation : 0.03795436452967022
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 14574
[LightGBM] [Info]

890

In [13]:
import lightgbm as lgb
import numpy as np

'''
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',  # 回归任务
    'metric': 'l2',  # 均方误差作为评估指标   'device':'gpu'
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
}
'''
# 创建Dataset
train_data = lgb.Dataset(X, label=pyc)
# 训练模型
model = lgb.train(params, train_set=train_data, num_boost_round=100)

# 预测单行数据
#new_data = gpNews.reshape(1, -1)  # 将测试数据重塑为1行
Cprd = model.predict(news)
print("Prediction for the new data: ", Cprd[0])

# 创建Dataset
train_data = lgb.Dataset(X, label=pyh)
# 训练模型
model = lgb.train(params, train_set=train_data, num_boost_round=100)
# 预测单行数据
#new_data = gpNews.reshape(1, -1)  # 将测试数据重塑为1行
Hprd = model.predict(news)
print("Prediction for the new data: ", Hprd[0])

# 创建Dataset
train_data = lgb.Dataset(X, label=pyl)
# 训练模型
model = lgb.train(params, train_set=train_data, num_boost_round=100)
# 预测单行数据
#new_data = gpNews.reshape(1, -1)  # 将测试数据重塑为1行
Lprd = model.predict(news)
print("Prediction for the new data: ", Lprd[0])

# 创建Dataset
train_data = lgb.Dataset(X, label=pyv)
# 训练模型
model = lgb.train(params, train_set=train_data, num_boost_round=100)
# 预测单行数据
#new_data = gpNews.reshape(1, -1)  # 将测试数据重塑为1行
Vprd = model.predict(news)
print("Prediction for the new data: ", Vprd[0])

del  model, train_data, params
gc.collect()

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 14573
[LightGBM] [Info] Number of data points in the train set: 1499, number of used features: 59
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 59 dense feature groups (0.09 MB) transferred to GPU in 0.001611 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.001895
Prediction for the new data:  -0.006035147044962047
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 14573
[LightGBM] [Info] Number of data points in the train set: 1499, number of used features: 59
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogr

76

In [14]:
pc = news['Close'].values*(Cprd[0]+1)
ph = news['Close'].values*(Hprd[0]+1)
pl = news['Close'].values*(Lprd[0]+1)
pv = news['Volume'].values*(Vprd[0]+1)

pmin = np. min([pc, ph, pl])
pmax = np. max([pc, ph, pl])
lowste=(pc-pmin)/5
upste=(pmax-pc)/5

b3=pmin+lowste
b2=pmin+2*lowste
b1=pmin+3*lowste

a3=pmax-upste
a2=pmax - 2* upste
a1= pmax - 3 * upste

upstop = pmax * 1.0005
lowstop = pmin * 0.9995

c = f'{round(Cprd[0]*100, 2)}％'
h = f'{round(Hprd[0]*100, 2)}％'
l = f'{round(Lprd[0]*100, 2)}％'
v = f'{round(Vprd[0]*100, 2)}％'


odlst = [('c',c), ('h', h), ('l', l), ('v', v), ('pc',pc), ('ph',ph), ('pl',pl), ('pv',pv),('b3',b3), ('b2',b2), ('b1',b1), ('a1', a1), ('a2',a2), ('a3',a3), ('upstop', upstop), ('lowstop',lowstop)]
oddf = pd.DataFrame(odlst)
oddf.to_csv('oddf.csv')
print(oddf)

          0                     1
0         c                 -0.6％
1         h                 1.62％
2         l                -1.34％
3         v                28.62％
4        pc  [47471.761377132614]
5        ph    [48532.0698296115]
6        pl    [47118.7664151022]
7        pv    [8259222.27477484]
8        b3   [47189.36540750829]
9        b2   [47259.96439991437]
10       b1   [47330.56339232045]
11       a1   [47895.88475812417]
12       a2   [48107.94644861994]
13       a3   [48320.00813911572]
14   upstop          48556.335865
15  lowstop          47095.207032
