In [20]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Remove the column limit so wide DataFrames are shown with every column.
pd.options.display.max_columns = None

In [21]:
# Raw input CSV, read with pd.read_csv in the driver cell. The path is relative,
# so it is resolved against the kernel's current working directory.
INPUT_FILE_PATH = '../raw_data/BYBIT_BTC_DATA.csv'
# Destination CSV written by preprocess_data(); its parent directory is
# created on demand before writing.
OUTPUT_FILE_PATH = '../processed_data/processed_BTC_data.csv'

def drop_columns_if_present(df, columns_to_drop=None):
    """Return a copy of ``df`` without the listed columns, ignoring absent ones.

    Args:
        df: DataFrame to filter. It is not modified.
        columns_to_drop: Column names to remove. When ``None``, a built-in
            list of column names is used.

    Returns:
        A new DataFrame with every listed column that exists in ``df`` removed.
    """
    candidates = (
        ["conversion_line", "unnamed:_17", "plot", "smoothing_line.1",
         "smoothing_line", "laggingspan", "smoothing_line_1"]
        if columns_to_drop is None
        else columns_to_drop
    )
    # Only names that actually exist are passed on, so drop() never raises
    # for a missing label.
    present = [name for name in candidates if name in df.columns]
    return df.drop(columns=present)

def shorten_column_name(col_name):
    """Map a normalised column name to its short alias.

    Names starting with ``upper_band_`` / ``lower_band_`` become
    ``upper_b<suffix>`` / ``lower_b<suffix>``, where ``<suffix>`` is everything
    after the prefix. A handful of exact names are replaced by fixed aliases.
    Any other name is returned unchanged.

    Args:
        col_name: Column name (a string) to shorten.

    Returns:
        The shortened name, or ``col_name`` itself when no rule applies.
    """
    band_prefixes = (
        ('upper_band_', 'upper_b'),
        ('lower_band_', 'lower_b'),
    )
    for prefix, short in band_prefixes:
        if col_name.startswith(prefix):
            # Keep the whole suffix rather than only the last character, so
            # multi-character suffixes (e.g. "10" or "1_1") do not collapse
            # onto the alias of another band.
            return short + col_name[len(prefix):]

    exact_aliases = {
        'kumo_cloud_upper_line': 'kumo_up',
        'kumo_cloud_lower_line': 'kumo_down',
        'elder_force_index': 'efi',
        'onbalancevolume': 'obv',
        'lagging_span': 'laggingspan',
        'leading_span_a': 'leadingspan1',
        'leading_span_b': 'leadingspan2',
    }
    return exact_aliases.get(col_name, col_name)

def preprocess_data(raw_data, columns_to_keep=None, output_path=None):
    """Clean a raw DataFrame, add the prediction target and save it as CSV.

    Steps, in order:
      1. Copy ``raw_data`` so the caller's frame is left untouched.
      2. Parse the ``time`` column as epoch seconds.
      3. Add ``target_close``: the ``close`` value of the following row.
      4. Normalise and shorten the column names, then index by ``time``.
      5. Drop the default unwanted columns and, if requested, keep only
         ``columns_to_keep``.
      6. Drop every row that still contains a NaN.
      7. Write the result to ``output_path``.

    Args:
        raw_data: Input DataFrame. Must contain ``time`` and ``close`` columns.
        columns_to_keep: Optional list of (normalised) column names to keep,
            in the order given. Names not present in the data are skipped and
            reported through a warning.
        output_path: CSV path to write. Defaults to ``OUTPUT_FILE_PATH``.

    Returns:
        The processed DataFrame, indexed by ``time``.

    Raises:
        KeyError: If ``raw_data`` lacks a ``time`` or ``close`` column.
    """
    if output_path is None:
        output_path = OUTPUT_FILE_PATH

    data = raw_data.copy()
    data['time'] = pd.to_datetime(data['time'], unit='s')
    # The target is the next row's close; the last row gets NaN here.
    data['target_close'] = data['close'].shift(-1)
    data.columns = [
        shorten_column_name(
            col.lower().replace(' ', '_').replace('#', '').replace('.', '_')
        )
        for col in data.columns
    ]
    data = data.set_index('time')

    # Drop unnecessary columns
    data = drop_columns_if_present(data)

    # Filter columns based on the list of columns to keep
    if columns_to_keep:
        missing_columns = [col for col in columns_to_keep if col not in data.columns]
        if missing_columns:
            # Report instead of silently producing a narrower frame.
            warnings.warn(
                f"Requested columns not found and skipped: {missing_columns}",
                stacklevel=2,
            )
        columns_to_keep = [col for col in columns_to_keep if col in data.columns]
        data = data[columns_to_keep]

    # Drop rows containing NaN values. Reassigning (rather than inplace=True)
    # avoids mutating a column-subset frame, which can raise
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    data = data.dropna()

    # Create the output directory if needed. The guard covers a bare file
    # name, for which dirname() is '' and makedirs('') would raise.
    output_directory = os.path.dirname(output_path)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    # NOTE(review): index=False leaves the 'time' index out of the CSV.
    # Confirm that downstream readers do not need the timestamps.
    data.to_csv(output_path, index=False)
    return data



In [23]:
if __name__ == "__main__":
    # Read the raw CSV from disk.
    df = pd.read_csv(INPUT_FILE_PATH)

    # Columns to keep, given by their normalised/shortened names.
    # The order here is the column order of the processed frame.
    columns_we_trust = [
        'open', 'high', 'low', 'close', 'vwap',
        'upper_b1', 'lower_b1',
        'upper_b2', 'lower_b2',
        'upper_b3', 'lower_b3',
        'basis', 'upper', 'lower',
        'parabolicsar', 'twap',
        'volume', 'volume_ma',
        'adx', 'efi', 'atr', 'obv', 'roc', 'cci',
        'target_close',
    ]

    # Run preprocessing (this also writes the processed CSV) and preview it.
    df = preprocess_data(df, columns_to_keep=columns_we_trust)
    print(df.head())




                        open     high      low    close          vwap  \
time                                                                    
2022-09-01 06:45:00  19990.5  20000.0  19945.0  19975.0  20051.911081   
2022-09-01 07:00:00  19975.0  19975.0  19876.0  19933.0  20039.109666   
2022-09-01 07:15:00  19933.0  19961.5  19878.0  19898.0  20032.127497   
2022-09-01 07:30:00  19898.0  19908.0  19771.0  19879.0  20012.464334   
2022-09-01 07:45:00  19879.0  19924.5  19862.0  19891.5  20007.578221   

                         upper_b1      lower_b1      upper_b2      lower_b2  \
time                                                                          
2022-09-01 06:45:00  20105.414350  19998.407813  20158.917619  19944.904544   
2022-09-01 07:00:00  20102.269953  19975.949380  20165.430240  19912.789093   
2022-09-01 07:15:00  20099.983866  19964.271127  20167.840235  19896.414758   
2022-09-01 07:30:00  20097.564922  19927.363747  20182.665509  19842.263160   
2022-09-01 07: