# Import

In [1]:
# adding common module to path, to be visible
import sys
sys.path.append("../../../common")

# data analysis
import pandas as pd
import numpy as np
np.set_printoptions(precision=4)
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:.4f}'.format
from datetime import datetime

# api related
import requests
import kagglehub

# database related
import psycopg
import os
import json
from urllib.parse import quote
from tabulate import tabulate
from utility import dry_insert_into_db, insert_into_db, describe_table

# logging related
import logging
from pathlib import Path

# Init

In [2]:
pwf = str(Path.cwd()).split("bitcoin_analysis")[1]
logger = logging.getLogger("default_logger")

file_handler = logging.FileHandler("../../../logging/logger.txt")
file_formater = logging.Formatter(
    f"{pwf}\n" +
    f">>>\n" +
    f"%(levelname)s: %(message)s.\n" + 
    f"<<< %(asctime)s\n"
)

console_handler = logging.StreamHandler()
console_formater = logging.Formatter(f"Logged %(levelname)s in {pwf}")

file_handler.setFormatter(file_formater)
file_handler.setLevel(logging.INFO)

console_handler.setLevel(logging.INFO)
console_handler.setFormatter(console_formater)

logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
logger.addHandler(console_handler)

In [3]:
with open("../../../secrets/secrets.json") as f:
    SECRETS = json.load(f)

CONNECTION_STRING = f"postgresql://postgres:{SECRETS["postgres_passcode"]}@localhost/bitcoin_analysis"

# Creating tables

The tables themselves are created from the CLI with `schema_creator.sql`; the cells below only check the result.

In [4]:
with open("../../../secrets/secrets.json") as f:
    secrets = json.load(f)
    
with psycopg.connect(CONNECTION_STRING) as conn:
    res = conn.execute("""
        SELECT
        	i.table_name as name,
        	t.tableowner as table_owner,
        	i.is_insertable_into as is_insertable,
        	coalesce(t.tablespace, 'pg_default') as "tablespace"
        FROM 
        	information_schema.tables i 
        	left join pg_tables t on i.table_name = t.tablename
        where i.table_schema = 'public'
        order by i.table_name;
    """)
res

<psycopg.Cursor [TUPLES_OK] [BAD] at 0x1084ffa10>

In [5]:
head = [tuple(i.name for i in res.description)]
content = res.fetchall()
head.extend(content)
query_result = head

In [6]:
print(
    tabulate(query_result[1:], headers=query_result[0], tablefmt="pipe")
)

| name                       | table_owner   | is_insertable   | tablespace   |
|:---------------------------|:--------------|:----------------|:-------------|
| bitcoin_ohlc               | postgres      | YES             | external     |
| bitcoin_trading_metadata   | postgres      | YES             | external     |
| cpi                        | postgres      | YES             | external     |
| dow_jones_ohlc             | postgres      | YES             | external     |
| dow_jones_trading_metadata | postgres      | YES             | external     |
| gold_ohlc                  | postgres      | YES             | external     |
| gold_trading_metadata      | postgres      | YES             | external     |
| nasdaq_ohlc                | postgres      | YES             | external     |
| nasdaq_trading_metadata    | postgres      | YES             | external     |
| oil_ohlc                   | postgres      | YES             | external     |
| oil_trading_metadata       | postgres 

In [None]:
logger.info(
    "Tables have been created.\n" +
    tabulate(query_result[1:], headers=query_result[0], tablefmt="pipe")
)

# Getting Bitcoin data

Twelve Data failed at test time because of the restrictions of its free plan, so the Bitcoin data is downloaded from Kaggle instead.

In [7]:
# import
os.environ.update({"KAGGLEHUB_CACHE":"/Users/Misha/Documents/python_projects/data_analysis/bitcoin_analysis/temp/"})
dir_ = Path(kagglehub.dataset_download("mczielinski/bitcoin-historical-data"))
file_name = os.listdir(dir_)[0]
path = dir_ / file_name

print(f"Path to dataset: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/mczielinski/bitcoin-historical-data?dataset_version_number=457...


100%|███████████████████████████████████████████████████████████████████████████████████████████| 98.8M/98.8M [00:03<00:00, 26.7MB/s]

Extracting files...





Path to dataset: /Users/Misha/Documents/python_projects/data_analysis/bitcoin_analysis/temp/datasets/mczielinski/bitcoin-historical-data/versions/457/btcusd_1-min_data.csv


In [8]:
df = pd.read_csv(path)

In [9]:
try:
    assert bool((df >= 0).all(axis=None)) is True, "Some entries are negative"    
    assert bool(df.notna().any(axis="columns").all()) is True, "Some rows are completelly NA"
except AssertionError as err:
    logger.critical(f"Validation of the Bitcon data failed: {err}")
else:
    logger.info(f"Validation of the Bitoin data is successful")

Logged INFO in /phase1/preprocessing/simple


In [10]:
df["Timestamp"] = df["Timestamp"].map(datetime.fromtimestamp)
df.rename(columns=str.lower, inplace=True)
df.set_index("timestamp", inplace=True)

df.drop(columns="volume",inplace=True)

df = df.resample("1h").agg({
    "open":"first",
    "high":"max",
    "low":"min",
    "close":"last"
})
df

Unnamed: 0_level_0,open,high,low,close
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-01 11:00:00,4.5800,4.5800,4.5800,4.5800
2012-01-01 12:00:00,4.5800,4.5800,4.5800,4.5800
2012-01-01 13:00:00,4.5800,4.5800,4.5800,4.5800
2012-01-01 14:00:00,4.5800,4.5800,4.5800,4.5800
2012-01-01 15:00:00,4.5800,4.5800,4.5800,4.5800
...,...,...,...,...
2025-12-23 20:00:00,87919.0000,87984.0000,87597.0000,87701.0000
2025-12-23 21:00:00,87694.0000,88295.0000,87482.0000,87655.0000
2025-12-23 22:00:00,87628.0000,87797.0000,87481.0000,87655.0000
2025-12-23 23:00:00,87663.0000,87716.0000,87209.0000,87334.0000


In [11]:
df = df.loc[:datetime.fromisoformat("2025-07-01 23:00:00"),:]
df = df.reset_index()
df["timestamp"] = "'" + df["timestamp"].astype(str) + "'"
df.fillna("NULL", inplace=True)
df

  df.fillna("NULL", inplace=True)


Unnamed: 0,timestamp,open,high,low,close
0,'2012-01-01 11:00:00',4.5800,4.5800,4.5800,4.5800
1,'2012-01-01 12:00:00',4.5800,4.5800,4.5800,4.5800
2,'2012-01-01 13:00:00',4.5800,4.5800,4.5800,4.5800
3,'2012-01-01 14:00:00',4.5800,4.5800,4.5800,4.5800
4,'2012-01-01 15:00:00',4.5800,4.5800,4.5800,4.5800
...,...,...,...,...,...
118328,'2025-07-01 19:00:00',106132.0000,106321.0000,105947.0000,106305.0000
118329,'2025-07-01 20:00:00',106292.0000,106292.0000,105713.0000,105761.0000
118330,'2025-07-01 21:00:00',105761.0000,105811.0000,105414.0000,105415.0000
118331,'2025-07-01 22:00:00',105435.0000,106006.0000,105280.0000,105953.0000


In [12]:
dry_insert_into_db(
    CONNECTION_STRING,
    "bitcoin_ohlc",
    df.to_numpy()
)

Insert values into bitcoin_ohlc:
| ts                    | open   | high   | low   | close   |
|:----------------------|:-------|:-------|:------|:--------|
| '2012-01-01 11:00:00' | 4.58   | 4.58   | 4.58  | 4.58    |
| '2012-01-01 12:00:00' | 4.58   | 4.58   | 4.58  | 4.58    |
| '2012-01-01 13:00:00' | 4.58   | 4.58   | 4.58  | 4.58    |
| ...                   | ...    | ...    | ...   | ...     |

Insertion shape is valid.


In [13]:
insert_into_db(
    CONNECTION_STRING,
    "bitcoin_ohlc",
    df.to_numpy()
)

In [14]:
print(describe_table(CONNECTION_STRING, "bitcoin_ohlc", "ts"))

Table summary: bitcoin_ohlc

| column_name   | data_type                   | is_nullable   |
|:--------------|:----------------------------|:--------------|
| ts            | timestamp without time zone | NO            |
| open          | double precision            | YES           |
| high          | double precision            | YES           |
| low           | double precision            | YES           |
| close         | double precision            | YES           |

With 118333 entiries

First & last being:
| ts                  |      open |      high |       low |     close |
|:--------------------|----------:|----------:|----------:|----------:|
| 2012-01-01 11:00:00 |      4.58 |      4.58 |      4.58 |      4.58 |
| 2025-07-01 23:00:00 | 105953    | 106112    | 105754    | 105810    |


In [16]:
logger.info(
    f"Bitcoin OHLC successfully acquired from Kaggle, transformed and loaded\n"
    f"{describe_table(CONNECTION_STRING, "bitcoin_ohlc", "ts")}"
)

Logged INFO in /phase1/preprocessing/simple


# Getting Nasdaq data