#### Exploratory Data Analysis -- Universal Analytics 

***Note: This notebook builds a data pipeline and then identifies patterns, outliers, and correlations using descriptive statistics and visualizations.***

***Dataset: [Bitcoin Historical Data](https://www.kaggle.com/datasets/mczielinski/bitcoin-historical-data)*** 

In [1]:
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from BTC_Data_Pipline import BTCDataLoader


In [2]:
# ---------------------------------------------------------------
# 1. RUN THE DATA PIPELINE TO REFRESH DATA & SAVE TO DATAFRAMES
# ---------------------------------------------------------------
# The loader is configured with the Kaggle dataset slug. load_and_clean()
# returns two objects, unpacked here as the hourly and the daily frame.
# NOTE(review): judging by the run log, the call also downloads the CSV and
# writes cleaned output under `save_dir` -- confirm in BTC_Data_Pipline.

loader = BTCDataLoader(
    kaggle_dataset="mczielinski/bitcoin-historical-data",
)
(df_hourly, df_daily) = loader.load_and_clean(
    save_dir="Datasets",
)


2025-11-23 14:41:17,193 INFO Downloading dataset from Kaggle: mczielinski/bitcoin-historical-data
2025-11-23 14:41:18,673 INFO Kaggle download returned path: C:\Users\minha\.cache\kagglehub\datasets\mczielinski\bitcoin-historical-data\versions\426
2025-11-23 14:41:18,675 INFO Reading CSV: C:\Users\minha\.cache\kagglehub\datasets\mczielinski\bitcoin-historical-data\versions\426\btcusd_1-min_data.csv
2025-11-23 14:41:24,854 INFO Converting timestamp column 'Timestamp' to datetime
2025-11-23 14:41:30,296 INFO Initial rows: 7306238
2025-11-23 14:41:34,206 INFO Removed 1310460 rows with non-positive Volume.
2025-11-23 14:41:34,218 INFO Rows after cleaning: 5995778
2025-11-23 14:41:34,220 INFO Resampling to rule: 1H
  df_resampled = df.resample(rule).agg(agg).dropna()
2025-11-23 14:41:35,451 INFO Rows after resampling (1H): 118252
2025-11-23 14:41:35,452 INFO Resampling to rule: 1D
2025-11-23 14:41:36,288 INFO Rows after resampling (1D): 5070
2025-11-23 14:41:37,723 INFO Saved hourly cleaned

In [3]:
# ----------------------------------------------------------------
# 2. EXPLORATORY DATA ANALYSIS (Patterns, Outliers, Correlations)
# ----------------------------------------------------------------

# Work on an independent copy of the daily frame (fewer rows, easier to plot)
# and attach the close-to-close fractional change as "Return". The first row
# has no predecessor, so its Return is NaN.
df = df_daily.assign(Return=df_daily["Close"].pct_change())

# Header line (preceded by a blank line) followed by the summary statistics
# of every numeric column.
print("\nDescriptive Statistics:", df.describe(), sep="\n")



Descriptive Statistics:
                Open           High            Low          Close  \
count    5070.000000    5070.000000    5070.000000    5070.000000   
mean    21161.432316   21619.874083   20678.451276   21178.062884   
std     29834.803322   30366.067256   29264.315515   29846.625979   
min         3.800000       4.380000       3.800000       4.380000   
25%       443.377500     451.925000     431.675000     444.835000   
50%      7287.115000    7463.230000    7093.545000    7290.245000   
75%     30533.000000   31312.250000   29939.815000   30577.500000   
max    124728.000000  126272.000000  123148.000000  124728.000000   

              Volume       Return  
count    5070.000000  5069.000000  
mean     7398.489839     0.002784  
std      9003.276340     0.040818  
min         0.250000    -0.538385  
25%      1979.214849    -0.012352  
50%      4475.096965     0.001569  
75%      9555.579509     0.017732  
max    127286.486533     0.358107  


In [4]:
# -------------------------
# CORRELATION HEATMAP
# -------------------------
# Pairwise correlation between the daily price, volume and return columns.
corr_columns = ["Open", "High", "Low", "Close", "Volume", "Return"]
corr_matrix = df[corr_columns].corr()

corr_fig = px.imshow(
    corr_matrix,
    # Two decimals keep the in-cell annotations readable.
    text_auto=".2f",
    color_continuous_scale="RdBu",
    # Correlations live in [-1, 1]. Pinning the colour range to that interval
    # centres the diverging palette on 0, so colour encodes sign and strength
    # instead of being stretched over whatever range the data happens to span.
    zmin=-1,
    zmax=1,
    title="Correlation Heatmap - Daily Bitcoin Data"
)
corr_fig.show()


In [19]:
# Interactive candlestick with BTC volume + estimated USD volume
#
# Row 1: daily OHLC candlesticks.
# Row 2: BTC volume bars (left axis) and an estimated USD volume line
#        (right axis). Both series are plotted in millions.

# Ensure index is datetime
if not isinstance(df.index, pd.DatetimeIndex):
    if "Date" in df.columns:
        df = df.set_index(pd.to_datetime(df["Date"]))
    else:
        df.index = pd.to_datetime(df.index)

# Find volume column (BTC): any column whose name contains "vol",
# preferring one that also mentions "btc"; otherwise the first match.
vol_candidates = [c for c in df.columns if "vol" in c.lower()]
if not vol_candidates:
    raise KeyError("No volume column detected. Expected e.g. 'Volume' or 'Volume_(BTC)'.")
vol_col = next(
    (c for c in vol_candidates if "btc" in c.lower()),
    vol_candidates[0],  # fallback
)

# Prepare series: BTC volume and USD-estimate = BTC_volume * Close
btc_vol = df[vol_col].astype(float).copy()
usd_est = btc_vol * df["Close"]

# Scale to millions for display (the unscaled values go into the hover text)
MILLION = 1e6
btc_vol_m = btc_vol / MILLION
usd_est_m = usd_est / MILLION

# Create subplot: candlestick (row1) + volume (row2) with secondary y-axis for USD estimate
fig = make_subplots(
    rows=2, cols=1,
    shared_xaxes=True,
    vertical_spacing=0.09,
    row_heights=[0.7, 0.3],
    specs=[[{"type": "candlestick"}],
           [{"secondary_y": True}]]
)

# Candlestick
fig.add_trace(
    go.Candlestick(
        x=df.index,
        open=df["Open"],
        high=df["High"],
        low=df["Low"],
        close=df["Close"],
        name="OHLC"
    ),
    row=1, col=1
)

# BTC volume bars (millions); green when the day closed at or above its open, red otherwise
fig.add_trace(
    go.Bar(
        x=df.index,
        y=btc_vol_m,
        name=f"Volume ({vol_col}) [millions BTC]",
        marker_color=np.where(df["Close"] >= df["Open"], "green", "red"),
        hovertemplate="%{x}<br>BTC Volume: %{customdata[0]:,.0f} BTC<br>%{y:.3f}M BTC<extra></extra>",
        # One-column 2-D array so the template can address customdata[0]
        customdata=btc_vol.to_numpy().reshape(-1, 1)
    ),
    row=2, col=1, secondary_y=False
)

# USD estimate line on secondary y-axis (millions USD)
fig.add_trace(
    go.Scatter(
        x=df.index,
        y=usd_est_m,
        mode="lines",
        name="Estimated Volume (USD) [millions]",
        line=dict(width=2, color="royalblue"),
        hovertemplate="%{x}<br>Estimated USD Volume: %{customdata[0]:,.0f} USD<br>%{y:.2f}M USD<extra></extra>",
        customdata=usd_est.to_numpy().reshape(-1, 1)
    ),
    row=2, col=1, secondary_y=True
)

# Layout
fig.update_layout(
    title="Bitcoin Daily Candlestick with BTC Volume (bars) and Estimated USD Volume (line)",
    xaxis_rangeslider_visible=False,
    height=800,
    legend=dict(orientation="h", yanchor="bottom", y=0.99, xanchor="right", x=1),
    margin=dict(l=40, r=40, t=60, b=100)
)

# Axis labels. Both volume series are plotted in millions, so the titles carry
# the unit; without it the tick values are misread by a factor of 1e6.
fig.update_yaxes(title_text="Price (USD)", row=1, col=1)
fig.update_yaxes(title_text="BTC Volume (millions BTC)", row=2, col=1, secondary_y=False)
fig.update_yaxes(title_text="Estimated USD Volume (millions USD)", row=2, col=1, secondary_y=True)

fig.show()


In [20]:
# -------------------------
# RETURNS DISTRIBUTION (OUTLIERS)
# -------------------------
# Box plot of the "Return" column; only the points Plotly classifies as
# outliers are drawn individually.
returns_fig = px.box(
    data_frame=df,
    y="Return",
    title="Daily Returns Boxplot",
    points="outliers",
)
returns_fig.show()
