# Import

In [67]:
# data analysis
import pandas as pd
import numpy as np
np.set_printoptions(precision=4)
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:.4f}'.format
from datetime import datetime

# api related
import requests
import kagglehub

# database related
import psycopg
from tabulate import tabulate
import json
import os

# logging related
import logging
from pathlib import Path

# Init

In [2]:
pwf = str(Path.cwd()).split("bitcoin_analysis")[1]
logger = logging.getLogger("default_logger")

file_handler = logging.FileHandler("../../../logging/logger.txt")
file_formater = logging.Formatter(
    f"{pwf}\n" +
    f">>>\n" +
    f"%(levelname)s: %(message)s.\n" + 
    f"<<< %(asctime)s\n"
)

console_handler = logging.StreamHandler()
console_formater = logging.Formatter(f"Logged %(levelname)s in {pwf}")

file_handler.setFormatter(file_formater)
file_handler.setLevel(logging.INFO)

console_handler.setLevel(logging.INFO)
console_handler.setFormatter(console_formater)

logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
logger.addHandler(console_handler)

In [43]:
with open("../../../secrets/secrets.json") as f:
    SECRETS = json.load(f)

CONNECTION_STRING = f"postgresql://postgres:{SECRETS["postgres_passcode"]}@localhost/bitcoin_analysis"

In [146]:
def insert_into_db(
    conn_str:str, 
    table:str, 
    data:np.ndarray, 
    string_array:bool = False,
    dry:bool = True
) -> None:
    """
    Inserts given 2D data into the `table` in the bitcoin database.

    ASSUMPTION: PostgeSQL database
    WARNING: Homogenous arrays are not supported. Untested - will resolve in unexpected behavior
    
    Params:
        - conn_string: connection sting to the database
        - table: name of the table in the database
        - data: 2D array of data to be inserted. With columns = table columns and rows = rows to be inserted
        - string_array=False: set to True if the data is string-like. If set improperly will result in corruption of data in the database table and / or an error
        - dry=True: specify whether to dry run insert the data. If True, will run the whole function and check if n-cols in array == n-cols in the target table
    
    Raises:
        - AssertionError: if input(s) are invalid
        - ValueError: if there is a shape mismatch between data-array and number of columns in the target table
    """
    assert conn_str is str, f"conn_str, must be a str. Given: {type(conn_str)}"
    assert table is str, f"table, must be a str. Given: {type(table)}"
    assert data is np.array, f"data must be a np.array. Given: {type(data)}"
    assert string_array is bool, f"string_array must be boolean. Given: {type(string_array)}"
    assert dry is bool, f"dry must be boolean. Given: {type(dry)}"
    assert len(data.shape) == 2, f"Data must be 2-dimensional. Given: {data.shape} ({len(data.shape)}-dimensional)"
    
    if string_array:
        data = "'" + data + "'"
        
    insert_values = ", ".join( # set separator between rows
        map(
            lambda x: "(" + ", ".join(x) + ")", # Wrap each row in parens
            data.astype(str) # cast each value to string
        )
    )

    with psycopg.connect(CONNECTION_STRING) as conn:
        res = conn.execute(f"SELECT column_name FROM information_schema.columns WHERE table_name = '{table}';")
        cols = res.fetchall()
        cols = np.array(cols).flatten()

        if len(cols) != data.shape[1]:
            raise ValueError(f"Size mismatch: {len(cols)} columns in the target table vs. {data.shape[1]} columns in data-array")

        if dry:
            string = ""
            for i, col in zip(data.astype(str).T[...,:3], cols):
                string += col + " <- " + ", ".join(i) + "\n"
            print(f"""
            Insert values:
                {insert_values[:20]}...
            Will be mapped to the following columns:
                {string}
            Insertion shape is valid.
            """)
            return 

    return

In [141]:
table = "bitcoin_ohlc"
data = np.arange(1,16).reshape((3,5))
with psycopg.connect(CONNECTION_STRING) as conn:
    cols = (conn
                .execute(f"SELECT column_name FROM information_schema.columns WHERE table_name = '{table}';")
                .fetchall()
            )
    cols = np.array(cols).flatten()
#np.concat((cols, data), axis=0).T
string = ""
for i, col in zip(data.astype(str).T, cols):
    string += col + " <- " + ", ".join(i) + "\n"
#print(string)
data

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15]])

# Creating tables

The actual creation is done through CLI & `schema_creator.sql`

In [3]:
with open("../../../secrets/secrets.json") as f:
    secrets = json.load(f)
    
with psycopg.connect(CONNECTION_STRING) as conn:
    res = conn.execute("""
        SELECT
        	i.table_name as name,
        	t.tableowner as table_owner,
        	i.is_insertable_into as is_insertable,
        	coalesce(t.tablespace, 'pg_default') as "tablespace"
        FROM 
        	information_schema.tables i 
        	left join pg_tables t on i.table_name = t.tablename
        where i.table_schema = 'public'
        order by i.table_name;
    """)
res

<psycopg.Cursor [TUPLES_OK] [BAD] at 0x10fe065d0>

In [4]:
head = [tuple(i.name for i in res.description)]
content = res.fetchall()
head.extend(content)
query_result = head

In [5]:
print(
    tabulate(query_result[1:], headers=query_result[0], tablefmt="pipe")
)

| name                       | table_owner   | is_insertable   | tablespace   |
|:---------------------------|:--------------|:----------------|:-------------|
| bitcoin_ohlc               | postgres      | YES             | external     |
| bitcoin_trading_metadata   | postgres      | YES             | external     |
| cpi                        | postgres      | YES             | external     |
| dow_jones_ohlc             | postgres      | YES             | external     |
| dow_jones_trading_metadata | postgres      | YES             | external     |
| gold_ohlc                  | postgres      | YES             | external     |
| gold_trading_metadata      | postgres      | YES             | external     |
| nasdaq_ohlc                | postgres      | YES             | external     |
| nasdaq_trading_metadata    | postgres      | YES             | external     |
| oil_ohlc                   | postgres      | YES             | external     |
| oil_trading_metadata       | postgres 

In [7]:
logger.info(
    "Tables have been created.\n" +
    tabulate(query_result[1:], headers=query_result[0], tablefmt="pipe")
)

Logged INFO in /phase1/preprocessing/simple


# Getting Bitcoin data

Twelve data failed at test-time due to restrictions of the free plan

In [41]:
# import
os.environ.update({"KAGGLEHUB_CACHE":"/Users/Misha/Documents/python_projects/data_analysis/bitcoin_analysis/temp/"})
dir_ = Path(kagglehub.dataset_download("mczielinski/bitcoin-historical-data"))
file_name = os.listdir(dir_)[0]
path = dir_ / file_name

print(f"Path to dataset: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/mczielinski/bitcoin-historical-data?dataset_version_number=456...


100%|███████████████████████████████████████████████████████████████████████████████████████████| 98.8M/98.8M [00:04<00:00, 22.6MB/s]

Extracting files...





Path to dataset: /Users/Misha/Documents/python_projects/data_analysis/bitcoin_analysis/temp/datasets/mczielinski/bitcoin-historical-data/versions/456/btcusd_1-min_data.csv


In [60]:
df = pd.read_csv(path, nrows=100)

In [61]:
try:
    assert bool((df >= 0).all(axis=None)) is True, "Some entries are negative"    
    assert bool(df.notna().any(axis="columns").all()) is True, "Some rows are completelly NA"
except AssertionError as err:
    logger.critical(f"Validation of the Bitcon data failed: {err}")
else:
    logger.info(f"Validation of the Bitoin data is successful")

In [42]:
df["Timestamp"] = df["Timestamp"].map(datetime.fromtimestamp)
df.rename(columns=str.lower, inplace=True)
df.set_index("timestamp", inplace=True)

df.drop(columns="volume",inplace=True)

df = df.resample("1h").agg({
    "open":"first",
    "high":"max",
    "low":"min",
    "close":"last"
})
df

Unnamed: 0_level_0,open,high,low,close
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-01 11:00:00,4.58,4.58,4.58,4.58
2012-01-01 12:00:00,4.58,4.58,4.58,4.58


In [None]:
insert_into_db()

In [102]:
insert_into_db?

[31mSignature:[39m
insert_into_db(
    conn_str: str,
    table: str,
    data: numpy.ndarray,
    string_array: bool = [38;5;28;01mFalse[39;00m,
) -> [38;5;28;01mNone[39;00m
[31mDocstring:[39m
Inserts given 2D data into the `table` in the bitcoin database.
Params:
    - conn_string: connection sting to the database
    - table: name of the table in the database
    - data: 2D array of data to be inserted
    - string_array: set to True if the data is string-like. If set improperly will result in corruption of data in the database table and / or an error

Raises:
    - AssertionError: if input(s) are invalid
[31mFile:[39m      /var/folders/_w/y37027mn4f99kw4yyhmf0kfw0000gp/T/ipykernel_2077/3423038572.py
[31mType:[39m      function