In [2]:
import pandas as pd
import numpy as np

In [49]:
# Load the raw price CSV. `data` keeps the frame exactly as read from disk;
# `df` is an independent copy that the later cells work on.
path = "../../raw_data/raw_btc_min_price_df_v1.csv"
data = pd.read_csv(path)
df = data.copy()
# Display the dtypes pandas inferred for each column.
df.dtypes

Timestamp       int64
Date           object
Symbol         object
Open          float64
High          float64
Low           float64
Close         float64
Volume BTC    float64
Volume USD    float64
dtype: object

In [8]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Timestamp,Date,Symbol,Open,High,Low,Close,Volume BTC,Volume USD
0,1676939580000,2023-02-21 00:33:00,BTC/USD,24859.34,24859.34,24859.34,24859.34,0.0,0.0
1,1676939520000,2023-02-21 00:32:00,BTC/USD,24821.96,24859.34,24821.96,24859.34,0.103099,2562.977818
2,1676939460000,2023-02-21 00:31:00,BTC/USD,24818.09,24821.96,24815.47,24821.96,0.09064,2249.866178
3,1676939400000,2023-02-21 00:30:00,BTC/USD,24812.25,24818.09,24812.25,24818.09,0.002203,54.68145
4,1676939340000,2023-02-21 00:29:00,BTC/USD,24809.27,24812.25,24809.27,24812.25,0.090675,2249.862431


In [11]:
def df_formating(df, columns):
    """
    Reshape a raw price frame into a date-indexed OHLCV layout.

    Parameters:
    - df (pd.DataFrame): Frame containing the source columns named in `columns`.
    - columns (sequence of str): Source column names, given positionally as
      date, open, high, low, close, volume. Only the first six entries are read.

    Returns:
    - pd.DataFrame: A new frame indexed by 'date' (datetime) whose columns
      'open', 'high', 'low', 'adj_close' and 'volume' are float32.
      Every other column of `df` is discarded; `df` itself is left untouched.
    """
    target_names = ['date', 'open', 'high', 'low', 'adj_close', 'volume']
    numeric_names = target_names[1:]

    # Pair each positional source name with its standard name.
    name_map = {columns[pos]: target for pos, target in enumerate(target_names)}

    # Rename, then keep only the standard columns in their canonical order.
    trimmed = df.rename(columns=name_map)[target_names]

    # Cast every price/volume column to float32 (astype returns a new frame).
    typed = trimmed.astype({name: 'float32' for name in numeric_names})

    # Parse the date strings; 'mixed' lets pandas infer the format per element.
    typed['date'] = pd.to_datetime(typed['date'], format='mixed')

    # Promote the parsed dates to the index.
    return typed.set_index('date')

In [13]:
# Source column names in the positional order df_formating reads them:
# date, open, high, low, close, volume. 'Volume USD' (not 'Volume BTC') is
# the column that becomes 'volume'.
columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume USD']
formated_df = df_formating(df, columns)
formated_df.head()

Unnamed: 0_level_0,open,high,low,adj_close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-02-21 00:33:00,24859.339844,24859.339844,24859.339844,24859.339844,0.0
2023-02-21 00:32:00,24821.960938,24859.339844,24821.960938,24859.339844,2562.977783
2023-02-21 00:31:00,24818.089844,24821.960938,24815.470703,24821.960938,2249.866211
2023-02-21 00:30:00,24812.25,24818.089844,24812.25,24818.089844,54.68145
2023-02-21 00:29:00,24809.269531,24812.25,24809.269531,24812.25,2249.862549


In [14]:
# Check the column dtypes of the formatted frame.
formated_df.dtypes

open         float32
high         float32
low          float32
adj_close    float32
volume       float32
dtype: object

In [25]:
# Write the formatted frame to CSV; to_csv includes the 'date' index as the
# first column of the file.
export_path = "../../raw_data/pro_btc_min_price_df_v2.csv"
formated_df.to_csv(export_path)

In [42]:
def convert_to_interval(input_file, output_file, interval_minutes):
    """
    Convert a CSV file with minute-level data to a new CSV file with a specified time interval.

    Each output row aggregates one interval: first 'open', highest 'high',
    lowest 'low', last 'adj_close' and summed 'volume'. Rows containing
    missing values (e.g. intervals with no source rows) are dropped.

    Parameters:
    - input_file (str): Path to the input CSV file. Its first column is used as
      the datetime index; the columns 'open', 'high', 'low', 'adj_close' and
      'volume' must be present.
    - output_file (str): Path to the output CSV file.
    - interval_minutes (int): Time interval for each row (e.g., 5 or 10 minutes).

    Returns:
    - pd.DataFrame: The resampled frame (float32 columns) that was also written
      to `output_file`.
    """
    # Read the input CSV file into a DataFrame, using the first column as index
    df = pd.read_csv(input_file, index_col=0, parse_dates=True)

    # Guarantee a DatetimeIndex even if read_csv left the index unparsed
    df.index = pd.to_datetime(df.index)

    # Order rows chronologically (stable sort) so that 'first' / 'last' below
    # mean the earliest / latest row of each interval regardless of the order
    # in which the file was written
    df = df.sort_index(kind='mergesort')

    # Resample the DataFrame based on the specified time interval.
    # 'min' is the supported minute alias; the older 'T' alias is deprecated.
    resampled_df = df.resample(f'{interval_minutes}min').agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'adj_close': 'last',
        'volume': 'sum'
    })

    # Drop rows with missing values (due to resampling)
    resampled_df = resampled_df.dropna()

    # Aggregation may upcast; restore the float32 storage type
    resampled_df = resampled_df.astype({'open': 'float32', 'high': 'float32', 'low': 'float32', 'adj_close': 'float32', 'volume': 'float32'})

    # Write the resampled DataFrame to a new CSV file
    resampled_df.to_csv(output_file)

    return resampled_df

In [47]:
# Resample the formatted minute-level CSV into 60-minute rows and write the
# result to a new CSV.
# NOTE(review): `df_diff` holds the resampled frame, not a difference —
# consider a more descriptive name (later cells reference it).
input_csv_path = '../../raw_data/pro_btc_min_price_df_v2.csv'
output_csv_path = '../../raw_data/pro_btc_60min_price_df_v1.csv'
df_diff = convert_to_interval(input_csv_path, output_csv_path, interval_minutes=60)

In [48]:
# Check the column dtypes of the resampled frame.
df_diff.dtypes

open         float32
high         float32
low          float32
adj_close    float32
volume       float32
dtype: object