In [7]:
import os
import pandas as pd

# Directory containing the raw JSON files
directory = './BitcoinData/'

# Collect the JSON file names in sorted order. os.listdir() returns entries in
# arbitrary order, so without sorting the row order of the combined frame
# could differ from one run or machine to the next.
json_filenames = sorted(
    name for name in os.listdir(directory) if name.endswith('.json')
)

# Fail early with a clear message: pd.concat([]) would otherwise raise an
# opaque "No objects to concatenate" ValueError.
if not json_filenames:
    raise FileNotFoundError(f"No .json files found in {directory!r}")

# Read every JSON file into its own DataFrame
dataframes = []
for filename in json_filenames:
    file_path = os.path.join(directory, filename)
    df = pd.read_json(file_path)
    dataframes.append(df)

# Concatenate all DataFrames into one
combined_df = pd.concat(dataframes, ignore_index=True)

columns = [
    'Open Time',  # Timestamp for when the candlestick opened
    'Open Price',  # Price when the candlestick opened
    'High Price',  # Highest price during the interval
    'Low Price',  # Lowest price during the interval
    'Close Price',  # Price when the candlestick closed
    'Volume',  # Volume traded during the interval
    'Close Time',  # Timestamp for when the candlestick closed
    'Quote Asset Volume',  # Quote asset volume (e.g., USDT)
    'Number of Trades',  # Number of trades during the interval
    'Taker Buy Base Volume',  # Volume of the base asset (e.g., BTC) bought by takers
    'Taker Buy Quote Volume',  # Quote asset (e.g., USDT) spent by takers
    'Ignore'  # Ignored column (always 0)
]

# Assign these columns to the DataFrame (raises if the column count differs)
combined_df.columns = columns

# Convert the millisecond timestamps to datetimes
combined_df['Open Time'] = pd.to_datetime(combined_df['Open Time'], unit='ms')
combined_df['Close Time'] = pd.to_datetime(combined_df['Close Time'], unit='ms')

# Put the candles in chronological order. The frame is a concatenation of
# separate files, and groupby(...).first()/.last() below pick the first/last
# rows in *row order*, so an unsorted frame would yield the wrong weekly
# open/close. A stable sort keeps the relative order of equal timestamps.
combined_df = combined_df.sort_values('Open Time', kind='stable').reset_index(drop=True)

combined_df['Day'] = combined_df['Open Time'].dt.day_name()

# Convert prices to float for comparisons
combined_df['Open Price'] = combined_df['Open Price'].astype(float)
combined_df['Close Price'] = combined_df['Close Price'].astype(float)

# 'is_up' is True when the candle closed above its open
combined_df['is_up'] = combined_df['Close Price'] > combined_df['Open Price']

# Label every row with the weekly period its open time falls in
combined_df['Week'] = combined_df['Open Time'].dt.to_period('W')

# First and last row of each week (chronological thanks to the sort above)
first_of_week = combined_df.groupby('Week').first()
last_of_week = combined_df.groupby('Week').last()

# A week is "up" when its last close is above its first open
weekly_comparison = (last_of_week['Close Price'] > first_of_week['Open Price'])

# Broadcast the weekly result back onto every row of that week
combined_df['is_weekly_up'] = combined_df['Week'].map(weekly_comparison)


# Define a function to resample data, compare with weekly data, and return a new DataFrame
def create_resampled_dataframe(resample_rule, source_df=None, weekly_flags=None):
    """Resample candles to ``resample_rule`` and flag each bar against its week.

    Parameters
    ----------
    resample_rule : str
        Offset alias passed to ``DataFrame.resample`` (e.g. ``'5min'``, ``'1h'``).
    source_df : pandas.DataFrame, optional
        Frame with 'Open Time', 'Open Price', 'Close Price' and 'Day' columns.
        Defaults to the notebook-level ``combined_df``.
    weekly_flags : pandas.Series, optional
        Boolean series indexed by weekly period, used to fill 'is_weekly_up'.
        Defaults to the notebook-level ``weekly_comparison``.

    Returns
    -------
    pandas.DataFrame
        Independent frame with columns 'Day', 'Open Time', 'is_up' and
        'is_weekly_up', one row per resampled bar that had data.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working
    if source_df is None:
        source_df = combined_df
    if weekly_flags is None:
        weekly_flags = weekly_comparison

    # First open / last close inside each bin; bins with missing values are
    # discarded by dropna()
    resampled_df = source_df.resample(resample_rule, on='Open Time').agg({
        'Open Time': 'first',
        'Open Price': 'first',
        'Close Price': 'last',
        'Day': 'first'
    }).dropna()

    # A resampled bar is "up" when it closed above its open
    resampled_df['is_up'] = resampled_df['Close Price'] > resampled_df['Open Price']

    # Look up the weekly direction for the week each bar belongs to
    resampled_df['Week'] = resampled_df['Open Time'].dt.to_period('W')
    resampled_df['is_weekly_up'] = resampled_df['Week'].map(weekly_flags)

    # Return a real copy rather than a slice of resampled_df, so callers can
    # add columns to the result without a SettingWithCopyWarning
    return resampled_df[['Day', 'Open Time', 'is_up', 'is_weekly_up']].copy()


# Build one frame per timeframe, each compared against the weekly (W1) direction.
# The results are unpacked into individual names, in the same order as the rules.
(
    m5_vs_w1_df,
    m15_vs_w1_df,
    h1_vs_w1_df,
    h4_vs_w1_df,
    h12_vs_w1_df,
    d1_vs_w1_df,
) = (
    create_resampled_dataframe(rule)
    for rule in ('5min', '15min', '1h', '4h', '12h', '1D')
)

# Print each frame under its heading
for heading, frame in (
    ("M5 vs W1 DataFrame", m5_vs_w1_df),
    ("\nM15 vs W1 DataFrame", m15_vs_w1_df),
    ("\nH1 vs W1 DataFrame", h1_vs_w1_df),
    ("\nH4 vs W1 DataFrame", h4_vs_w1_df),
    ("\nH12 vs W1 DataFrame", h12_vs_w1_df),
    ("\nD1 vs W1 DataFrame", d1_vs_w1_df),
):
    print(heading)
    print(frame)


M5 vs W1 DataFrame
                         Day           Open Time  is_up  is_weekly_up
Open Time                                                            
2019-09-09 00:00:00   Monday 2019-09-09 00:00:00  False         False
2019-09-09 00:05:00   Monday 2019-09-09 00:05:00  False         False
2019-09-09 00:10:00   Monday 2019-09-09 00:10:00  False         False
2019-09-09 00:15:00   Monday 2019-09-09 00:15:00  False         False
2019-09-09 00:20:00   Monday 2019-09-09 00:20:00  False         False
...                      ...                 ...    ...           ...
2024-10-22 16:10:00  Tuesday 2024-10-22 16:10:00   True         False
2024-10-22 16:15:00  Tuesday 2024-10-22 16:15:00  False         False
2024-10-22 16:20:00  Tuesday 2024-10-22 16:20:00  False         False
2024-10-22 16:25:00  Tuesday 2024-10-22 16:25:00  False         False
2024-10-22 16:30:00  Tuesday 2024-10-22 16:30:00   True         False

[538759 rows x 4 columns]

M15 vs W1 DataFrame
                       

In [8]:
import pandas as pd


# Define a function to calculate correlation and matching percentage for each interval
def calculate_correlation_for_interval(df):
    """Summarise, per (Day, Time) slot, how 'is_up' relates to 'is_weekly_up'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Day', 'Open Time' (datetime), 'is_up' and 'is_weekly_up'
        columns.

    Returns
    -------
    pandas.DataFrame
        One row per (Day, Time) group with columns 'Day', 'Time',
        'Correlation' and 'Matching Percentage'.

    Notes
    -----
    Side effect: a 'Time' column (time of day of 'Open Time') is written onto
    the frame that was passed in.
    NOTE(review): kept as-is because code outside this function may read that
    column; confirm before removing.
    """
    # Time-of-day key used for grouping (stored on the caller's frame)
    df['Time'] = df['Open Time'].dt.time

    def _slot_stats(slot_rows):
        # Booleans become 0/1 so they can be correlated
        candle_dir = slot_rows['is_up'].astype(int)
        week_dir = slot_rows['is_weekly_up'].astype(int)

        # A single observation has no meaningful correlation
        corr = candle_dir.corr(week_dir) if len(candle_dir) > 1 else None

        # Share of rows (in percent) where both flags agree
        agreement = (candle_dir == week_dir).astype(int).mean() * 100
        return corr, agreement

    records = []
    for (day, time), slot_rows in df.groupby(['Day', 'Time']):
        corr, agreement = _slot_stats(slot_rows)
        records.append({
            'Day': day,
            'Time': time,
            'Correlation': corr,
            'Matching Percentage': agreement
        })

    return pd.DataFrame(records)


# Score every timeframe and order its rows by 'Matching Percentage', highest first.
# The results are unpacked into individual names, in the same order as the inputs.
(
    m5_correlation_df,
    m15_correlation_df,
    h1_correlation_df,
    h4_correlation_df,
    h12_correlation_df,
    d1_correlation_df,
) = (
    calculate_correlation_for_interval(frame).sort_values(by='Matching Percentage', ascending=False)
    for frame in (
        m5_vs_w1_df,
        m15_vs_w1_df,
        h1_vs_w1_df,
        h4_vs_w1_df,
        h12_vs_w1_df,
        d1_vs_w1_df,
    )
)

# Print each ranked table under its heading
for heading, table in (
    ("M5 vs W1 Correlation DataFrame:", m5_correlation_df),
    ("\nM15 vs W1 Correlation DataFrame:", m15_correlation_df),
    ("\nH1 vs W1 Correlation DataFrame:", h1_correlation_df),
    ("\nH4 vs W1 Correlation DataFrame:", h4_correlation_df),
    ("\nH12 vs W1 Correlation DataFrame:", h12_correlation_df),
    ("\nD1 vs W1 Correlation DataFrame:", d1_correlation_df),
):
    print(heading)
    print(table)


M5 vs W1 Correlation DataFrame:
            Day      Time  Correlation  Matching Percentage
1654    Tuesday  17:50:00     0.234725            61.797753
796    Saturday  18:20:00     0.186280            59.550562
105      Friday  08:45:00     0.190359            59.550562
1318   Thursday  13:50:00     0.187050            59.550562
860    Saturday  23:40:00     0.182523            59.176030
...         ...       ...          ...                  ...
651    Saturday  06:15:00    -0.153924            42.322097
703    Saturday  10:35:00    -0.153202            42.322097
209      Friday  17:25:00    -0.159688            41.947566
2001  Wednesday  22:45:00    -0.170479            41.573034
1702    Tuesday  21:50:00    -0.204710            39.700375

[2016 rows x 4 columns]

M15 vs W1 Correlation DataFrame:
           Day      Time  Correlation  Matching Percentage
158     Monday  15:30:00     0.199134            60.074627
623  Wednesday  11:45:00     0.189089            59.550562
133     Mond