In [72]:
import os
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Load the whole BTC_data table into `df`.
# `conn` starts as None so the `finally` clause stays valid even when
# sqlite3.connect() itself raises; otherwise `conn` would be unbound there
# (or still point at a connection left over from an earlier run of this cell).
conn = None
try:
    conn = sqlite3.connect('../BTC_data.db')
    # Execute the query and store the result in a DataFrame
    df = pd.read_sql_query("SELECT * FROM BTC_data;", conn)
    print(df)
except sqlite3.Error as e:
    print(f"An error occurred: {e}")
finally:
    # Close the connection only if it was actually opened
    if conn is not None:
        conn.close()


       index                 time     open     high      low    close  \
0          0  2022-09-01 06:45:00  19990.5  20000.0  19945.0  19975.0   
1          1  2022-09-01 07:00:00  19975.0  19975.0  19876.0  19933.0   
2          2  2022-09-01 07:15:00  19933.0  19961.5  19878.0  19898.0   
3          3  2022-09-01 07:30:00  19898.0  19908.0  19771.0  19879.0   
4          4  2022-09-01 07:45:00  19879.0  19924.5  19862.0  19891.5   
...      ...                  ...      ...      ...      ...      ...   
20129  20129  2023-03-29 23:00:00  28376.2  28443.0  28361.6  28415.4   
20130  20130  2023-03-29 23:15:00  28415.4  28415.4  28352.6  28408.3   
20131  20131  2023-03-29 23:30:00  28408.3  28408.4  28313.6  28355.6   
20132  20132  2023-03-29 23:45:00  28355.6  28370.0  28293.0  28330.0   
20133  20133  2023-03-30 00:00:00  28330.0  28359.3  28300.0  28354.6   

               vwap      upper_b1      lower_b1      upper_b2      lower_b2  \
0      20051.911081  20105.414350  19998.407

In [73]:
# List the column labels of the loaded table
df.columns

Index(['index', 'time', 'open', 'high', 'low', 'close', 'vwap', 'upper_b1',
       'lower_b1', 'upper_b2', 'lower_b2', 'upper_b3', 'lower_b3', 'basis',
       'upper', 'lower', 'parabolicsar', 'twap', 'volume', 'volume_ma', 'adx',
       'efi', 'atr', 'obv', 'roc', 'cci', 'target_close', 'hour',
       'day_of_week', 'USA_open', 'EU_open', 'ASIA_open'],
      dtype='object')

In [74]:
# Inspect the first row by position, shown as one value per column
df.iloc[0]

index                             0
time            2022-09-01 06:45:00
open                        19990.5
high                        20000.0
low                         19945.0
close                       19975.0
vwap                   20051.911081
upper_b1                20105.41435
lower_b1               19998.407813
upper_b2               20158.917619
lower_b2               19944.904544
upper_b3               20212.420888
lower_b3               19891.401275
basis                     20065.575
upper                  20180.181839
lower                  19950.968161
parabolicsar           20093.638784
twap                      20068.875
volume                     1404.391
volume_ma                1079.07575
adx                       33.073108
efi                   -12916.006674
atr                       68.322402
obv                       -5436.203
roc                       -0.388969
cci                     -128.941675
target_close                19933.0
hour                        

In [75]:
# Extract the head to csv:
import pandas as pd
import os
import logging
from typing import Union
from pathlib import Path

# Initialize logging: configure the root logger with INFO as its level
logging.basicConfig(level=logging.INFO)

def export_df_head_to_csv(df: pd.DataFrame, num_rows: int = 5, base_directory: Union[str, Path] = '..',
                          folder_name: str = 'heads_csv', file_name: str = 'df_head.csv') -> Union[str, None]:
    """
    Export the head of a DataFrame to a CSV file without overwriting existing files.

    If the requested file already exists, a numeric suffix is appended to the
    requested name's stem ('df_head_1.csv', 'df_head_2.csv', ...) until an
    unused name is found.

    Parameters:
        df (pd.DataFrame): The DataFrame to export.
        num_rows (int): Number of rows to include in the head. Default is 5.
        base_directory (Union[str, Path]): The base directory for the folder. Default is one level up ('..').
        folder_name (str): The name of the folder to save the CSV in. Default is 'heads_csv'.
        file_name (str): The name of the CSV file. Default is 'df_head.csv'.

    Returns:
        Union[str, None]: The path where the CSV was saved, or None if any
        step failed (the error is logged instead of being raised).
    """
    try:
        # Create the target folder (and any missing parents) if needed
        folder_path = Path(base_directory) / folder_name
        folder_path.mkdir(parents=True, exist_ok=True)

        file_path = folder_path / file_name

        # Take stem and extension from the requested name once. Reading them
        # from the candidate path inside the loop would compound the suffixes
        # ('x_1.csv' -> 'x_1_2.csv' -> 'x_1_2_3.csv').
        stem = file_path.stem
        extension = file_path.suffix
        counter = 1
        while file_path.exists():
            file_path = folder_path / f"{stem}_{counter}{extension}"
            counter += 1

        # Export the head of the DataFrame to CSV
        df.head(num_rows).to_csv(file_path, index=False)

        logging.info(f"Head of DataFrame has been exported to {file_path}")
        return str(file_path)

    except Exception as e:
        # Best-effort helper: report the failure and signal it with None
        logging.error(f"An error occurred: {e}")
        return None



# Example usage
#export_df_head_to_csv(df, num_rows=15, base_directory='..', folder_name='heads_csv', file_name='sample_head.csv')
 

In [76]:
# First, I want to categorize the Bollinger Bands features. I will be straightforward:


# Define the function to categorize Bollinger Bands
def categorize_bollinger_bands(df: pd.DataFrame) -> pd.DataFrame:
    """
    Categorize the close price based on its position relative to Bollinger Band levels.

    Each zone between two adjacent bands is half-open, (lower, upper]: a close
    that sits exactly on a band belongs only to the zone below that band. The
    outer zones are `close > upper_b3` and `close <= lower_b3`. When the band
    columns are ordered and free of NaN, exactly one indicator is True per
    row; a row with a NaN close gets False in every column.

    Parameters:
        df (pd.DataFrame): DataFrame containing 'close' and Bollinger Band columns.

    Returns:
        pd.DataFrame: A DataFrame with binary indicators for Bollinger Band
        scenarios: five band-to-band zones followed by 'above_upper_b3' and
        'below_lower_b3'.

    Raises:
        ValueError: If any required column is missing from `df`.
    """

    # Validate required columns
    required_columns = ['close', 'upper_b1', 'lower_b1', 'upper_b2', 'lower_b2', 'upper_b3', 'lower_b3']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    close = df['close']

    # Adjacent band pairs, from the highest zone to the lowest
    bollinger_pairs = [('upper_b3', 'upper_b2'), ('upper_b2', 'upper_b1'), ('upper_b1', 'lower_b1'),
                       ('lower_b1', 'lower_b2'), ('lower_b2', 'lower_b3')]

    # Strict lower bound / inclusive upper bound keeps the zones mutually
    # exclusive; with both bounds inclusive a close equal to a band value
    # would be flagged in two zones at once.
    indicators = {
        f"{upper}_to_{lower}": (close <= df[upper]) & (close > df[lower])
        for upper, lower in bollinger_pairs
    }

    # Extreme cases; the lowest band itself counts as 'below' so that every
    # finite close lands in some zone.
    indicators['above_upper_b3'] = close > df['upper_b3']
    indicators['below_lower_b3'] = close <= df['lower_b3']

    return pd.DataFrame(indicators)

# Generate Bollinger Band features
BB_features = categorize_bollinger_bands(df)

# Append BB_features to the original DataFrame.
# Same-named columns left over from an earlier run of this cell are dropped
# first, so re-running the cell replaces the indicators instead of appending
# duplicate columns (errors='ignore' makes this a no-op on the first run).
df = pd.concat([df.drop(columns=BB_features.columns, errors='ignore'), BB_features], axis=1)

# Show the first few rows of the DataFrame with the new Bollinger Band features
df.head()


Unnamed: 0,index,time,open,high,low,close,vwap,upper_b1,lower_b1,upper_b2,lower_b2,upper_b3,lower_b3,basis,upper,lower,parabolicsar,twap,volume,volume_ma,adx,efi,atr,obv,roc,cci,target_close,hour,day_of_week,USA_open,EU_open,ASIA_open,upper_b3_to_upper_b2,upper_b2_to_upper_b1,upper_b1_to_lower_b1,lower_b1_to_lower_b2,lower_b2_to_lower_b3,above_upper_b3,below_lower_b3
0,0,2022-09-01 06:45:00,19990.5,20000.0,19945.0,19975.0,20051.911081,20105.41435,19998.407813,20158.917619,19944.904544,20212.420888,19891.401275,20065.575,20180.181839,19950.968161,20093.638784,20068.875,1404.391,1079.07575,33.073108,-12916.006674,68.322402,-5436.203,-0.388969,-128.941675,19933.0,6,3,0,0,1,False,False,False,True,False,False,False
1,1,2022-09-01 07:00:00,19975.0,19975.0,19876.0,19933.0,20039.109666,20102.269953,19975.94938,20165.43024,19912.789093,20228.590527,19849.628806,20054.025,20173.099966,19934.950034,20086.773233,20064.422414,3596.456,1220.2999,34.286679,-32649.598863,70.513659,-9032.659,-0.558743,-174.971402,19898.0,7,3,0,0,1,False,False,False,True,False,False,False
2,2,2022-09-01 07:15:00,19933.0,19961.5,19878.0,19898.0,20032.127497,20099.983866,19964.271127,20167.840235,19896.414758,20235.696605,19828.558389,20040.225,20164.350128,19916.099872,20074.126839,20059.529167,2031.825,1298.3545,35.413566,-38144.495454,71.441255,-11064.484,-0.889099,-170.945798,19879.0,7,3,0,0,1,False,False,False,True,False,False,False
3,3,2022-09-01 07:30:00,19898.0,19908.0,19771.0,19879.0,20012.464334,20097.564922,19927.363747,20182.665509,19842.26316,20267.766096,19757.162573,20028.2,20165.205255,19891.194745,20062.239228,20053.221774,4533.626,1486.23915,37.199831,-45000.838103,76.124022,-15598.11,-1.153598,-210.705421,19891.5,7,3,0,0,1,False,False,False,True,False,False,False
4,4,2022-09-01 07:45:00,19879.0,19924.5,19862.0,19891.5,20007.578221,20094.228099,19920.928343,20180.877977,19834.278464,20267.527855,19747.628586,20017.2,20160.852497,19873.547503,20038.94009,20048.097656,1759.389,1512.3163,38.476938,-35430.380874,75.150878,-13838.721,-1.017615,-141.89923,19924.5,7,3,0,0,1,False,False,False,True,False,False,False


In [77]:
# Given my needs, my next goal is to keep categorizing columns.
# Other indicators, such as vwap, twap and parabolicsar, can be categorized with the same logic used for the close price: by their location within the price-deviation map of the Bollinger Bands.


# Function to categorize a column based on sorted Bollinger Bands
import numpy as np

def categorize_column_sorted(df, column):
    """
    Label where `column` sits relative to the six Bollinger Band values of each row.

    The band values of every row are sorted ascending, so the labels refer to
    the rank of a band within the row rather than to the column it came from.
    Zones are half-open, (lower, upper]: a value equal to a band belongs to
    the zone below it, and a value equal to the lowest band is
    'below_lower_b3'.

    The result is written in place to a new ordered categorical column named
    '<column>_category'. Rows where the value or any band is NaN get a missing
    category.

    Parameters:
        df (pd.DataFrame): DataFrame holding `column` and the columns
            'upper_b3', 'upper_b2', 'upper_b1', 'lower_b1', 'lower_b2', 'lower_b3'.
        column (str): Name of the numeric column to categorize.

    Returns:
        None: `df` is modified in place.
    """
    band_columns = ['upper_b3', 'upper_b2', 'upper_b1', 'lower_b1', 'lower_b2', 'lower_b3']
    labels = [
        'above_upper_b3',
        'upper_b3_to_upper_b2',
        'upper_b2_to_upper_b1',
        'upper_b1_to_lower_b1',
        'lower_b1_to_lower_b2',
        'lower_b2_to_lower_b3',
        'below_lower_b3'
    ]

    # Sort all rows in one vectorised call instead of a Python-level
    # sorted() per row.
    bands = np.sort(df[band_columns].to_numpy(dtype=float), axis=1)
    values = df[column].to_numpy(dtype=float)

    # The number of bands strictly below the value identifies the zone:
    # 6 bands below -> labels[0] ('above_upper_b3'), 0 below -> labels[6].
    bands_below = (values[:, None] > bands).sum(axis=1)
    codes = len(band_columns) - bands_below

    # Comparisons against NaN are always False, which would look like
    # "no band below"; mark such rows as missing (code -1) instead.
    undefined = np.isnan(values) | np.isnan(bands).any(axis=1)
    codes[undefined] = -1

    df[f'{column}_category'] = pd.Categorical.from_codes(codes, categories=labels, ordered=True)


# Indicators whose position relative to the Bollinger Bands gets labelled
indicators_to_categorize = ['vwap', 'twap', 'parabolicsar', 'high', 'low']

# Add one '<indicator>_category' column to df per listed indicator
for indicator in indicators_to_categorize:
    categorize_column_sorted(df, indicator)

# One-hot encode the category columns that were just added; their names are
# derived from the indicator list so the two cannot drift apart
one_hot_columns = [f'{name}_category' for name in indicators_to_categorize]
df_one_hot = pd.get_dummies(df[one_hot_columns])


In [78]:
# Combine the one-hot encoded DataFrame with the original DataFrame.
# Same-named columns left over from an earlier run of this cell are dropped
# first, so re-running the cell does not append duplicate columns
# (errors='ignore' makes this a no-op on the first run).
df = pd.concat([df.drop(columns=df_one_hot.columns, errors='ignore'), df_one_hot], axis=1)

# Check results
df.head()

Unnamed: 0,index,time,open,high,low,close,vwap,upper_b1,lower_b1,upper_b2,lower_b2,upper_b3,lower_b3,basis,upper,lower,parabolicsar,twap,volume,volume_ma,adx,efi,atr,obv,roc,cci,target_close,hour,day_of_week,USA_open,EU_open,ASIA_open,upper_b3_to_upper_b2,upper_b2_to_upper_b1,upper_b1_to_lower_b1,lower_b1_to_lower_b2,lower_b2_to_lower_b3,above_upper_b3,below_lower_b3,vwap_category,twap_category,parabolicsar_category,high_category,low_category,vwap_category_above_upper_b3,vwap_category_upper_b3_to_upper_b2,vwap_category_upper_b2_to_upper_b1,vwap_category_upper_b1_to_lower_b1,vwap_category_lower_b1_to_lower_b2,vwap_category_lower_b2_to_lower_b3,vwap_category_below_lower_b3,twap_category_above_upper_b3,twap_category_upper_b3_to_upper_b2,twap_category_upper_b2_to_upper_b1,twap_category_upper_b1_to_lower_b1,twap_category_lower_b1_to_lower_b2,twap_category_lower_b2_to_lower_b3,twap_category_below_lower_b3,parabolicsar_category_above_upper_b3,parabolicsar_category_upper_b3_to_upper_b2,parabolicsar_category_upper_b2_to_upper_b1,parabolicsar_category_upper_b1_to_lower_b1,parabolicsar_category_lower_b1_to_lower_b2,parabolicsar_category_lower_b2_to_lower_b3,parabolicsar_category_below_lower_b3,high_category_above_upper_b3,high_category_upper_b3_to_upper_b2,high_category_upper_b2_to_upper_b1,high_category_upper_b1_to_lower_b1,high_category_lower_b1_to_lower_b2,high_category_lower_b2_to_lower_b3,high_category_below_lower_b3,low_category_above_upper_b3,low_category_upper_b3_to_upper_b2,low_category_upper_b2_to_upper_b1,low_category_upper_b1_to_lower_b1,low_category_lower_b1_to_lower_b2,low_category_lower_b2_to_lower_b3,low_category_below_lower_b3
0,0,2022-09-01 06:45:00,19990.5,20000.0,19945.0,19975.0,20051.911081,20105.41435,19998.407813,20158.917619,19944.904544,20212.420888,19891.401275,20065.575,20180.181839,19950.968161,20093.638784,20068.875,1404.391,1079.07575,33.073108,-12916.006674,68.322402,-5436.203,-0.388969,-128.941675,19933.0,6,3,0,0,1,False,False,False,True,False,False,False,upper_b1_to_lower_b1,upper_b1_to_lower_b1,upper_b1_to_lower_b1,upper_b1_to_lower_b1,lower_b1_to_lower_b2,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False
1,1,2022-09-01 07:00:00,19975.0,19975.0,19876.0,19933.0,20039.109666,20102.269953,19975.94938,20165.43024,19912.789093,20228.590527,19849.628806,20054.025,20173.099966,19934.950034,20086.773233,20064.422414,3596.456,1220.2999,34.286679,-32649.598863,70.513659,-9032.659,-0.558743,-174.971402,19898.0,7,3,0,0,1,False,False,False,True,False,False,False,upper_b1_to_lower_b1,upper_b1_to_lower_b1,upper_b1_to_lower_b1,lower_b1_to_lower_b2,lower_b2_to_lower_b3,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False
2,2,2022-09-01 07:15:00,19933.0,19961.5,19878.0,19898.0,20032.127497,20099.983866,19964.271127,20167.840235,19896.414758,20235.696605,19828.558389,20040.225,20164.350128,19916.099872,20074.126839,20059.529167,2031.825,1298.3545,35.413566,-38144.495454,71.441255,-11064.484,-0.889099,-170.945798,19879.0,7,3,0,0,1,False,False,False,True,False,False,False,upper_b1_to_lower_b1,upper_b1_to_lower_b1,upper_b1_to_lower_b1,lower_b1_to_lower_b2,lower_b2_to_lower_b3,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False
3,3,2022-09-01 07:30:00,19898.0,19908.0,19771.0,19879.0,20012.464334,20097.564922,19927.363747,20182.665509,19842.26316,20267.766096,19757.162573,20028.2,20165.205255,19891.194745,20062.239228,20053.221774,4533.626,1486.23915,37.199831,-45000.838103,76.124022,-15598.11,-1.153598,-210.705421,19891.5,7,3,0,0,1,False,False,False,True,False,False,False,upper_b1_to_lower_b1,upper_b1_to_lower_b1,upper_b1_to_lower_b1,lower_b1_to_lower_b2,lower_b2_to_lower_b3,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False
4,4,2022-09-01 07:45:00,19879.0,19924.5,19862.0,19891.5,20007.578221,20094.228099,19920.928343,20180.877977,19834.278464,20267.527855,19747.628586,20017.2,20160.852497,19873.547503,20038.94009,20048.097656,1759.389,1512.3163,38.476938,-35430.380874,75.150878,-13838.721,-1.017615,-141.89923,19924.5,7,3,0,0,1,False,False,False,True,False,False,False,upper_b1_to_lower_b1,upper_b1_to_lower_b1,upper_b1_to_lower_b1,upper_b1_to_lower_b1,lower_b1_to_lower_b2,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False


In [79]:
# Columns to remove: the intermediate '*_category' labels (their one-hot
# columns are already in df) followed by the raw price, band and indicator
# columns.
columns_to_drop = [
    'vwap_category', 'twap_category', 'parabolicsar_category', 'high_category', 'low_category',
    'open', 'high', 'low', 'vwap', 'twap', 'parabolicsar',
    'upper_b1', 'lower_b1', 'upper_b2', 'lower_b2', 'upper_b3', 'lower_b3',
    'basis', 'upper', 'lower',
    'volume', 'volume_ma', 'adx', 'efi', 'atr', 'obv',
    'roc', 'cci',
]

# Remove them all in one call
df = df.drop(labels=columns_to_drop, axis='columns')

# Show the first few rows of the reduced DataFrame
df.head()


Unnamed: 0,index,time,close,target_close,hour,day_of_week,USA_open,EU_open,ASIA_open,upper_b3_to_upper_b2,upper_b2_to_upper_b1,upper_b1_to_lower_b1,lower_b1_to_lower_b2,lower_b2_to_lower_b3,above_upper_b3,below_lower_b3,vwap_category_above_upper_b3,vwap_category_upper_b3_to_upper_b2,vwap_category_upper_b2_to_upper_b1,vwap_category_upper_b1_to_lower_b1,vwap_category_lower_b1_to_lower_b2,vwap_category_lower_b2_to_lower_b3,vwap_category_below_lower_b3,twap_category_above_upper_b3,twap_category_upper_b3_to_upper_b2,twap_category_upper_b2_to_upper_b1,twap_category_upper_b1_to_lower_b1,twap_category_lower_b1_to_lower_b2,twap_category_lower_b2_to_lower_b3,twap_category_below_lower_b3,parabolicsar_category_above_upper_b3,parabolicsar_category_upper_b3_to_upper_b2,parabolicsar_category_upper_b2_to_upper_b1,parabolicsar_category_upper_b1_to_lower_b1,parabolicsar_category_lower_b1_to_lower_b2,parabolicsar_category_lower_b2_to_lower_b3,parabolicsar_category_below_lower_b3,high_category_above_upper_b3,high_category_upper_b3_to_upper_b2,high_category_upper_b2_to_upper_b1,high_category_upper_b1_to_lower_b1,high_category_lower_b1_to_lower_b2,high_category_lower_b2_to_lower_b3,high_category_below_lower_b3,low_category_above_upper_b3,low_category_upper_b3_to_upper_b2,low_category_upper_b2_to_upper_b1,low_category_upper_b1_to_lower_b1,low_category_lower_b1_to_lower_b2,low_category_lower_b2_to_lower_b3,low_category_below_lower_b3
0,0,2022-09-01 06:45:00,19975.0,19933.0,6,3,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False
1,1,2022-09-01 07:00:00,19933.0,19898.0,7,3,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False
2,2,2022-09-01 07:15:00,19898.0,19879.0,7,3,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False
3,3,2022-09-01 07:30:00,19879.0,19891.5,7,3,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False
4,4,2022-09-01 07:45:00,19891.5,19924.5,7,3,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False


In [80]:
# Remove the 'index' column, then expand 'hour' and 'day_of_week' into
# boolean indicator columns (one column per distinct value)
df = pd.get_dummies(
    df.drop(columns=['index']),
    columns=['hour', 'day_of_week'],
    dtype=bool,
)

# Show the first few rows of the DataFrame to confirm the changes
df.head()






Unnamed: 0,time,close,target_close,USA_open,EU_open,ASIA_open,upper_b3_to_upper_b2,upper_b2_to_upper_b1,upper_b1_to_lower_b1,lower_b1_to_lower_b2,lower_b2_to_lower_b3,above_upper_b3,below_lower_b3,vwap_category_above_upper_b3,vwap_category_upper_b3_to_upper_b2,vwap_category_upper_b2_to_upper_b1,vwap_category_upper_b1_to_lower_b1,vwap_category_lower_b1_to_lower_b2,vwap_category_lower_b2_to_lower_b3,vwap_category_below_lower_b3,twap_category_above_upper_b3,twap_category_upper_b3_to_upper_b2,twap_category_upper_b2_to_upper_b1,twap_category_upper_b1_to_lower_b1,twap_category_lower_b1_to_lower_b2,twap_category_lower_b2_to_lower_b3,twap_category_below_lower_b3,parabolicsar_category_above_upper_b3,parabolicsar_category_upper_b3_to_upper_b2,parabolicsar_category_upper_b2_to_upper_b1,parabolicsar_category_upper_b1_to_lower_b1,parabolicsar_category_lower_b1_to_lower_b2,parabolicsar_category_lower_b2_to_lower_b3,parabolicsar_category_below_lower_b3,high_category_above_upper_b3,high_category_upper_b3_to_upper_b2,high_category_upper_b2_to_upper_b1,high_category_upper_b1_to_lower_b1,high_category_lower_b1_to_lower_b2,high_category_lower_b2_to_lower_b3,high_category_below_lower_b3,low_category_above_upper_b3,low_category_upper_b3_to_upper_b2,low_category_upper_b2_to_upper_b1,low_category_upper_b1_to_lower_b1,low_category_lower_b1_to_lower_b2,low_category_lower_b2_to_lower_b3,low_category_below_lower_b3,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,2022-09-01 06:45:00,19975.0,19933.0,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,2022-09-01 07:00:00,19933.0,19898.0,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
2,2022-09-01 07:15:00,19898.0,19879.0,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,2022-09-01 07:30:00,19879.0,19891.5,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
4,2022-09-01 07:45:00,19891.5,19924.5,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


In [81]:
# Here I'm considering what use we can make of the data. I will move forward by setting the prediction target to whether target_close(1) will be above or below close.

# To do this we will reformat target_close

In [82]:
# Percentage change from close to target_close
df['price_diff_percentage'] = (df['target_close'] - df['close']) / df['close'] * 100

# Bucket the percentage change into four classes:
#   0: rise of more than 2%        1: fall of more than 2%
#   2: rise of up to 2% (0, 2]     3: fall of up to 2% [-2, 0)
# A change of exactly 0% (or a NaN change) matches no condition and is
# left as a missing Target.
conditions = [
    df['price_diff_percentage'].gt(2),
    df['price_diff_percentage'].lt(-2),
    df['price_diff_percentage'].between(0, 2, inclusive='right'),
    df['price_diff_percentage'].between(-2, 0, inclusive='left'),
]

choices = [0, 1, 2, 3]

# The NaN default makes np.select return floats, so the categories are 0.0 .. 3.0
df['Target'] = pd.Categorical(np.select(conditions, choices, default=np.nan))

# Show the first few rows of the DataFrame to confirm the changes
df[['time', 'close', 'target_close', 'price_diff_percentage', 'Target']].head()


Unnamed: 0,time,close,target_close,price_diff_percentage,Target
0,2022-09-01 06:45:00,19975.0,19933.0,-0.210263,3.0
1,2022-09-01 07:00:00,19933.0,19898.0,-0.175588,3.0
2,2022-09-01 07:15:00,19898.0,19879.0,-0.095487,3.0
3,2022-09-01 07:30:00,19879.0,19891.5,0.06288,2.0
4,2022-09-01 07:45:00,19891.5,19924.5,0.1659,2.0


In [83]:
def calculate_streaks(df, target_col):
    """
    Calculate the running streak length for a target column in a DataFrame.

    For each row, the streak is the number of consecutive rows, ending at that
    row, that hold the same value in ``target_col``. The count restarts at 1
    whenever the value differs from the previous row's value. NaN never
    compares equal to anything (including another NaN), so every NaN row gets
    a streak of 1.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the target column.
        target_col (str): The name of the target column.

    Returns:
        pd.Series: Integer streak lengths named 'Streak', carrying the same
        index as ``df`` so the result lines up row-for-row when it is
        assigned back to ``df``.
    """
    streaks = []  # Streak length for every row, in row order
    streak = 0  # Length of the run that ends at the current row
    prev_value = None  # Value seen in the previous row

    for value in df[target_col]:
        # Extend the run when the value repeats, otherwise start a new run
        streak = streak + 1 if value == prev_value else 1
        streaks.append(streak)
        prev_value = value

    # Reuse df's index: pandas aligns on index labels during column assignment,
    # so a default 0..n-1 index would yield NaN / misaligned streaks for any
    # frame that is not itself indexed 0..n-1 (e.g. time-indexed or filtered).
    return pd.Series(streaks, index=df.index, name='Streak', dtype='int64')

# Build the streak feature from the 'Target' column, then attach it to df
target_streaks = calculate_streaks(df, 'Target')
df['Streak'] = target_streaks

# Preview the first ten rows to check that the Streak column is present
df.head(10)


Unnamed: 0,time,close,target_close,USA_open,EU_open,ASIA_open,upper_b3_to_upper_b2,upper_b2_to_upper_b1,upper_b1_to_lower_b1,lower_b1_to_lower_b2,lower_b2_to_lower_b3,above_upper_b3,below_lower_b3,vwap_category_above_upper_b3,vwap_category_upper_b3_to_upper_b2,vwap_category_upper_b2_to_upper_b1,vwap_category_upper_b1_to_lower_b1,vwap_category_lower_b1_to_lower_b2,vwap_category_lower_b2_to_lower_b3,vwap_category_below_lower_b3,twap_category_above_upper_b3,twap_category_upper_b3_to_upper_b2,twap_category_upper_b2_to_upper_b1,twap_category_upper_b1_to_lower_b1,twap_category_lower_b1_to_lower_b2,twap_category_lower_b2_to_lower_b3,twap_category_below_lower_b3,parabolicsar_category_above_upper_b3,parabolicsar_category_upper_b3_to_upper_b2,parabolicsar_category_upper_b2_to_upper_b1,parabolicsar_category_upper_b1_to_lower_b1,parabolicsar_category_lower_b1_to_lower_b2,parabolicsar_category_lower_b2_to_lower_b3,parabolicsar_category_below_lower_b3,high_category_above_upper_b3,high_category_upper_b3_to_upper_b2,high_category_upper_b2_to_upper_b1,high_category_upper_b1_to_lower_b1,high_category_lower_b1_to_lower_b2,high_category_lower_b2_to_lower_b3,high_category_below_lower_b3,low_category_above_upper_b3,low_category_upper_b3_to_upper_b2,low_category_upper_b2_to_upper_b1,low_category_upper_b1_to_lower_b1,low_category_lower_b1_to_lower_b2,low_category_lower_b2_to_lower_b3,low_category_below_lower_b3,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,price_diff_percentage,Target,Streak
0,2022-09-01 06:45:00,19975.0,19933.0,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,-0.210263,3.0,1
1,2022-09-01 07:00:00,19933.0,19898.0,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,-0.175588,3.0,2
2,2022-09-01 07:15:00,19898.0,19879.0,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,-0.095487,3.0,3
3,2022-09-01 07:30:00,19879.0,19891.5,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,0.06288,2.0,1
4,2022-09-01 07:45:00,19891.5,19924.5,0,0,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,0.1659,2.0,2
5,2022-09-01 08:00:00,19924.5,19871.0,0,1,1,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,-0.268514,3.0,1
6,2022-09-01 08:15:00,19871.0,19882.5,0,1,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,0.057873,2.0,1
7,2022-09-01 08:30:00,19882.5,19920.0,0,1,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,0.188608,2.0,2
8,2022-09-01 08:45:00,19920.0,19858.5,0,1,1,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,-0.308735,3.0,1
9,2022-09-01 09:00:00,19858.5,19892.0,1,1,0,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,0.168694,2.0,1


In [84]:
# price_diff_percentage was only an intermediate used to derive 'Target'; remove it
df = df.drop('price_diff_percentage', axis='columns')


In [85]:
# Cast the 'USA_open', 'EU_open', and 'ASIA_open' flag columns to boolean, one column at a time
columns_to_convert = ['USA_open', 'EU_open', 'ASIA_open']
for flag_col in columns_to_convert:
    df[flag_col] = df[flag_col].astype(bool)

# Preview the converted columns to confirm they now hold True/False
df[columns_to_convert].head()


Unnamed: 0,USA_open,EU_open,ASIA_open
0,False,False,True
1,False,False,True
2,False,False,True
3,False,False,True
4,False,False,True


In [90]:
# Promote the 'time' column to be the DataFrame's index
df = df.set_index('time')


In [91]:
# Preview the first rows to confirm that 'time' is now the index
df.head()

Unnamed: 0_level_0,close,target_close,USA_open,EU_open,ASIA_open,upper_b3_to_upper_b2,upper_b2_to_upper_b1,upper_b1_to_lower_b1,lower_b1_to_lower_b2,lower_b2_to_lower_b3,above_upper_b3,below_lower_b3,vwap_category_above_upper_b3,vwap_category_upper_b3_to_upper_b2,vwap_category_upper_b2_to_upper_b1,vwap_category_upper_b1_to_lower_b1,vwap_category_lower_b1_to_lower_b2,vwap_category_lower_b2_to_lower_b3,vwap_category_below_lower_b3,twap_category_above_upper_b3,twap_category_upper_b3_to_upper_b2,twap_category_upper_b2_to_upper_b1,twap_category_upper_b1_to_lower_b1,twap_category_lower_b1_to_lower_b2,twap_category_lower_b2_to_lower_b3,twap_category_below_lower_b3,parabolicsar_category_above_upper_b3,parabolicsar_category_upper_b3_to_upper_b2,parabolicsar_category_upper_b2_to_upper_b1,parabolicsar_category_upper_b1_to_lower_b1,parabolicsar_category_lower_b1_to_lower_b2,parabolicsar_category_lower_b2_to_lower_b3,parabolicsar_category_below_lower_b3,high_category_above_upper_b3,high_category_upper_b3_to_upper_b2,high_category_upper_b2_to_upper_b1,high_category_upper_b1_to_lower_b1,high_category_lower_b1_to_lower_b2,high_category_lower_b2_to_lower_b3,high_category_below_lower_b3,low_category_above_upper_b3,low_category_upper_b3_to_upper_b2,low_category_upper_b2_to_upper_b1,low_category_upper_b1_to_lower_b1,low_category_lower_b1_to_lower_b2,low_category_lower_b2_to_lower_b3,low_category_below_lower_b3,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,Target,Streak
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
2022-09-01 06:45:00,19975.0,19933.0,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,3.0,1
2022-09-01 07:00:00,19933.0,19898.0,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,3.0,2
2022-09-01 07:15:00,19898.0,19879.0,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,3.0,3
2022-09-01 07:30:00,19879.0,19891.5,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,2.0,1
2022-09-01 07:45:00,19891.5,19924.5,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,2.0,2


In [92]:
# Export the processed DataFrame to CSV.
# Create the output directory first: to_csv does not create missing parent
# directories, so the export would fail if '../processed_data' does not exist.
os.makedirs('../processed_data', exist_ok=True)
df.to_csv('../processed_data/KNN_data.csv')

In [93]:
import pandas as pd
import numpy as np
import sqlite3
from contextlib import closing

def read_csv_to_dataframe(file_path):
    """
    Load a CSV file into a DataFrame.

    A failure while reading is reported on stdout instead of being raised.

    Parameters:
        file_path: Location of the CSV file, handed straight to pd.read_csv.

    Returns:
        pd.DataFrame or None: The parsed data, or None if reading failed.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as e:
        print(f"An error occurred while reading the CSV file: {e}")
        return None
    return frame

def create_sqlite_db(dataframe, table_name, conn):
    """
    Write a DataFrame into a SQLite table, replacing the table if it exists.

    A failure while writing is reported on stdout instead of being raised.

    Parameters:
        dataframe (pd.DataFrame): The data to store.
        table_name (str): Name of the table to (re)create.
        conn: Open database connection handed to DataFrame.to_sql.

    Returns:
        None
    """
    try:
        dataframe.to_sql(name=table_name, con=conn, if_exists='replace')
    except Exception as e:
        print(f"An error occurred while creating the SQLite table: {e}")
    return None

def query_sqlite_db(query, conn):
    """
    Run a SQL query and return the result as a DataFrame.

    A failure while querying is reported on stdout instead of being raised.

    Parameters:
        query (str): The SQL statement to execute.
        conn: Open database connection handed to pd.read_sql_query.

    Returns:
        pd.DataFrame or None: The query result, or None if the query failed.
    """
    try:
        result = pd.read_sql_query(query, conn)
    except Exception as e:
        print(f"An error occurred while querying the SQLite database: {e}")
        return None
    return result

if __name__ == "__main__":
    # Names and locations used for the CSV -> SQLite round trip
    table_name = 'KNN_data'
    db_file_path = '../BTC_data.db'
    csv_file_path = '../processed_data/KNN_data.csv'

    # Load the exported CSV back into a DataFrame (None signals a failed read)
    df = read_csv_to_dataframe(csv_file_path)
    if df is not None:
        # closing() makes sure the connection is closed when the block exits
        with closing(sqlite3.connect(db_file_path)) as conn:
            # (Re)create the table from the DataFrame's contents
            create_sqlite_db(df, table_name, conn)

            # Read a handful of rows back as a sanity check on the insert
            query = f"SELECT * FROM {table_name} LIMIT 5;"
            queried_data = query_sqlite_db(query, conn)
            if queried_data is not None:
                print(queried_data)


   index                 time    close  target_close  USA_open  EU_open  \
0      0  2022-09-01 06:45:00  19975.0       19933.0         0        0   
1      1  2022-09-01 07:00:00  19933.0       19898.0         0        0   
2      2  2022-09-01 07:15:00  19898.0       19879.0         0        0   
3      3  2022-09-01 07:30:00  19879.0       19891.5         0        0   
4      4  2022-09-01 07:45:00  19891.5       19924.5         0        0   

   ASIA_open  upper_b3_to_upper_b2  upper_b2_to_upper_b1  \
0          1                     0                     0   
1          1                     0                     0   
2          1                     0                     0   
3          1                     0                     0   
4          1                     0                     0   

   upper_b1_to_lower_b1  lower_b1_to_lower_b2  lower_b2_to_lower_b3  \
0                     0                     1                     0   
1                     0                     1 