# Data Visualization with Bokeh: Price, Volume Shocks, and PACF

This section builds interactive visualizations using only Bokeh for plotting.

Data sources: CSVs in the `data/` folder: `INFY_2015_2016.csv`, `NIFTY_IT_2015_2016.csv`, `TCS_2015_2016.csv`.

Assumptions (tunable):
- Trading year ≈ 252 sessions; 52-week moving average uses a 252-day rolling window.
- Volume shock: a day whose day-over-day percentage change in volume lies more than 3 standard deviations away from the mean daily volume percentage change (in either direction).
- Price shock: a day whose close-to-close return lies more than 3 standard deviations away from the mean daily return (in either direction).
- "Volumeless price movement": a price shock day that is not a volume shock the same day.
- The price line is drawn in red between consecutive volume-shock dates; all other line segments are shaded along a blue gradient according to the close's deviation from the 52-week MA.
- PACF bars and ±1.96/√N bands are computed and plotted with Bokeh (no statsmodels plotting).

In [None]:
# Imports and setup
import os
from pathlib import Path
import math
import numpy as np
import pandas as pd

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import (
    ColumnDataSource, HoverTool, DatetimeTickFormatter, 
    LinearColorMapper, ColorBar, BasicTicker, PrintfTickFormatter, 
    Segment, Span, Tabs, TabPanel
)
from bokeh.layouts import row, column
from bokeh.palettes import Blues256

# compute only; plotting stays in Bokeh
from statsmodels.tsa.stattools import pacf as sm_pacf

# Tell Bokeh to render `show(...)` output inline in the notebook.
output_notebook()

# Folder (relative to the working directory) that is globbed for `*.csv` inputs.
DATA_DIR = Path('data')

def _find_cols(df: pd.DataFrame):
    """Guess which columns of *df* hold the date, the close price and the volume.

    Headers are compared case-insensitively with surrounding whitespace
    stripped. For each role a list of known aliases is tried in priority
    order; if none matches, a looser rule applies:

    * date   -> the first column of the frame;
    * close  -> the first header containing ``close``, else the second column;
    * volume -> the first header containing ``vol``, else ``None``.

    Returns:
        A ``(date_col, close_col, volume_col)`` tuple of original column
        labels; ``volume_col`` is ``None`` when no volume column is found.
    """
    by_normalised = {label.lower().strip(): label for label in df.columns}

    def _exact(aliases):
        # First alias present among the normalised headers wins.
        return next(
            (by_normalised[alias] for alias in aliases if alias in by_normalised),
            None,
        )

    def _containing(fragment):
        # First header (in frame order) whose lower-cased text contains the fragment.
        hits = [label for label in df.columns if fragment in label.lower()]
        return hits[0] if hits else None

    # --- date ---
    date_col = _exact(['date', 'timestamp', 'time', 'datetime'])
    if date_col is None:
        date_col = df.columns[0]

    # --- close ---
    close_col = _exact(
        ['close', 'close price', 'adj close', 'adj_close', 'closing price']
    )
    if close_col is None:
        close_col = _containing('close')
    if close_col is None:
        close_col = df.columns[1]

    # --- volume (optional) ---
    volume_col = _exact(
        ['volume', 'total trade quantity', 'shares traded', 'total traded quantity']
    )
    if volume_col is None:
        volume_col = _containing('vol')

    return date_col, close_col, volume_col

def load_symbol_csv(path: Path) -> pd.DataFrame:
    """Read one price CSV and return it in a normalised layout.

    The date, close and (optional) volume columns are located with
    ``_find_cols`` and renamed to ``Date``, ``Close`` and ``Volume``.
    ``Date`` is parsed with ``pd.to_datetime``; ``Close`` and ``Volume`` are
    coerced to numbers (unparseable cells become NaN). Rows are sorted by
    date, rows without a close are dropped, and the index is reset.

    Returns:
        A frame with columns ``Date``, ``Close`` and, when a volume column
        was detected, ``Volume``.
    """
    raw = pd.read_csv(path)
    date_col, close_col, volume_col = _find_cols(raw)

    renames = {date_col: 'Date', close_col: 'Close'}
    if volume_col is not None:
        renames[volume_col] = 'Volume'
    frame = raw.rename(columns=renames)

    frame['Date'] = pd.to_datetime(frame['Date'])

    has_volume = 'Volume' in frame.columns
    numeric_cols = ['Close', 'Volume'] if has_volume else ['Close']
    for col in numeric_cols:
        frame[col] = pd.to_numeric(frame[col], errors='coerce')

    frame = frame.sort_values('Date')
    frame = frame.dropna(subset=['Close'])
    frame = frame.reset_index(drop=True)

    return frame[['Date'] + numeric_cols]

def _flag_shocks(changes: pd.Series, n_sigma: float = 3.0) -> pd.Series:
    """Flag entries lying more than *n_sigma* standard deviations from the mean.

    The mean and (population, ``ddof=0``) standard deviation are computed
    from the finite entries only. NaN and +/-inf entries are ignored for the
    statistics and are never flagged, because their deviation is undefined.

    Returns:
        A boolean Series aligned with *changes*; all False when there are no
        finite entries or the spread is zero.
    """
    changes = changes.astype(float)
    # A change from a zero base is +/-inf; keeping it would make the mean
    # infinite and the standard deviation NaN, disabling detection entirely.
    finite = changes.where(np.isfinite(changes))
    sample = finite.dropna().to_numpy(dtype=float)
    none_flagged = pd.Series(False, index=changes.index)
    if sample.size == 0:
        return none_flagged
    mu, sd = sample.mean(), sample.std()
    if not (np.isfinite(sd) and sd > 0):
        return none_flagged
    # Comparisons against NaN evaluate to False, so masked entries stay unflagged.
    return (finite - mu).abs() > n_sigma * sd


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Add return, moving-average and shock-flag columns to a price frame.

    Args:
        df: Frame with a ``Close`` column and optionally a ``Volume`` column.
            It is not modified; a copy is returned.

    Returns:
        The copy with these extra columns:

        * ``Return``  - close-to-close percentage change.
        * ``VolChg``  - day-over-day percentage change in volume (NaN when
          there is no ``Volume`` column).
        * ``MA52``    - rolling mean of ``Close`` over 252 rows, emitted once
          at least 10 rows are available.
        * ``MA_diff`` - ``Close - MA52``.
        * ``PriceShock`` / ``VolShock`` - boolean flags for returns / volume
          changes more than 3 standard deviations from their mean.
        * ``VolumelessPriceShock`` - price shock without a same-day volume
          shock.

    Shock statistics use finite values only, so a zero-volume day (whose
    following percentage change is infinite) no longer suppresses every
    volume shock for the symbol.
    """
    out = df.copy()
    out['Return'] = out['Close'].pct_change()
    if 'Volume' in out.columns:
        out['VolChg'] = out['Volume'].pct_change()
    else:
        out['VolChg'] = np.nan

    # 52-week MA ~ 252 trading days
    out['MA52'] = out['Close'].rolling(252, min_periods=10).mean()
    out['MA_diff'] = out['Close'] - out['MA52']

    out['PriceShock'] = _flag_shocks(out['Return'])
    out['VolShock'] = _flag_shocks(out['VolChg'])
    out['VolumelessPriceShock'] = out['PriceShock'] & ~out['VolShock']
    return out

def _between_volume_shocks_mask(vol_shock: pd.Series) -> pd.Series:
    """Mark every row lying between two volume shocks, endpoints included.

    Consecutive shock pairs chain together, so the marked region is the
    closed interval from the first shock row to the last one. Both endpoints
    are included so that a line segment joining rows ``i-1`` and ``i`` has
    both ends marked for every segment between two shocks, including the one
    leaving a shock day and the one joining two adjacent shock days.

    Args:
        vol_shock: Boolean-like Series of volume-shock flags; missing values
            count as "no shock".

    Returns:
        A boolean Series with the same index as *vol_shock*. All False when
        fewer than two shocks are present.
    """
    flags = vol_shock.fillna(False).to_numpy().astype(bool)
    mask = np.zeros(len(flags), dtype=bool)
    shock_idx = np.flatnonzero(flags)
    if shock_idx.size >= 2:
        mask[shock_idx[0]:shock_idx[-1] + 1] = True
    return pd.Series(mask, index=vol_shock.index)

def make_timeseries_plot(sym: str, dfx: pd.DataFrame):
    """Build the close-price Bokeh figure for one symbol.

    The price line is drawn as individual segments between consecutive rows:

    * a segment is red when both of its end rows are flagged by
      ``_between_volume_shocks_mask``;
    * every other segment is coloured on the ``Blues256`` palette by
      ``MA_diff`` (the end row's value, else the start row's, else 0.0).

    Rows flagged ``VolumelessPriceShock`` are overlaid as orange markers with
    a hover tooltip, and a colour bar explains the blue scale.

    Args:
        sym: Symbol name, used in the figure title.
        dfx: Frame with ``Date``, ``Close``, ``MA_diff`` and
            ``VolumelessPriceShock`` columns; ``VolShock`` is optional and
            treated as all-False when absent.

    Returns:
        The configured Bokeh figure.
    """
    frame = dfx.dropna(subset=['Close']).reset_index(drop=True)
    when = frame['Date'].to_numpy()
    price = frame['Close'].to_numpy()
    gap = frame['MA_diff'].to_numpy()

    if 'VolShock' in frame:
        shock_flags = frame['VolShock']
    else:
        shock_flags = pd.Series([False] * len(frame))
    inside = _between_volume_shocks_mask(shock_flags)

    # One tuple per segment: (x0, x1, y0, y1) for red, plus the shade value for blue.
    red_rows = []
    grad_rows = []
    for k in range(1, len(frame)):
        seg = (when[k - 1], when[k], price[k - 1], price[k])
        if inside.iloc[k - 1] and inside.iloc[k]:
            red_rows.append(seg)
            continue
        # Prefer the end row's MA deviation, then the start row's, then neutral 0.0.
        shade = next((g for g in (gap[k], gap[k - 1]) if np.isfinite(g)), 0.0)
        grad_rows.append(seg + (shade,))

    def _as_columns(rows, names):
        # Transpose row tuples into the column lists ColumnDataSource expects.
        return {name: [row[j] for row in rows] for j, name in enumerate(names)}

    coords = ('x0', 'x1', 'y0', 'y1')
    grad_cols = _as_columns(grad_rows, coords + ('v',))
    src_grad = ColumnDataSource(grad_cols)
    src_red = ColumnDataSource(_as_columns(red_rows, coords))

    # Points where price moved sharply without a matching volume shock.
    flagged = frame['VolumelessPriceShock'].fillna(False)
    src_pts = ColumnDataSource(frame.loc[flagged, ['Date', 'Close']])

    fig = figure(height=350, width=800, x_axis_type='datetime', title=f'{sym}: Close with Shocks',
                 toolbar_location='above', tools='pan,wheel_zoom,box_zoom,reset,save')
    fig.xaxis.formatter = DatetimeTickFormatter(days='%b %d, %Y', months='%b %Y', years='%Y')
    fig.yaxis.axis_label = 'Close'

    # Colour scale limits; fall back to [-1, 1] when the range is empty or degenerate.
    shade_values = grad_cols['v']
    shades = np.array(shade_values) if shade_values else np.array([0.0])
    lo = float(np.nanmin(shades))
    hi = float(np.nanmax(shades))
    if not (np.isfinite(lo) and np.isfinite(hi) and lo != hi):
        lo, hi = -1.0, 1.0
    mapper = LinearColorMapper(palette=Blues256, low=lo, high=hi)

    # Blue-gradient segments first, then red ones on top.
    fig.add_glyph(
        src_grad,
        Segment(x0='x0', y0='y0', x1='x1', y1='y1',
                line_color={'field': 'v', 'transform': mapper}, line_width=2),
    )
    if red_rows:
        fig.add_glyph(
            src_red,
            Segment(x0='x0', y0='y0', x1='x1', y1='y1', line_color='red', line_width=2),
        )

    markers = fig.scatter(x='Date', y='Close', size=7, color='orange', alpha=0.9,
                          source=src_pts, legend_label='Price shock (no volume)')

    color_bar = ColorBar(color_mapper=mapper, ticker=BasicTicker(),
                         formatter=PrintfTickFormatter(format='%0.2f'),
                         label_standoff=8, location=(0, 0), title='Close - MA52')
    fig.add_layout(color_bar, 'right')

    # Hover is attached to the shock markers only, not to the line segments.
    hover = HoverTool(tooltips=[('Date', '@Date{%F}'), ('Close', '@Close{0,0.00}')],
                      formatters={'@Date': 'datetime'}, mode='mouse', renderers=[markers])
    fig.add_tools(hover)

    fig.legend.location = 'top_left'
    fig.legend.click_policy = 'hide'
    return fig

def make_pacf_plot(sym: str, dfx: pd.DataFrame, max_lags: int | None = None):
    """Build a Bokeh bar chart of the partial autocorrelation of daily returns.

    Args:
        sym: Symbol name, used in the figure title.
        dfx: Frame with a ``Return`` column; NaN and infinite returns are
            ignored.
        max_lags: Highest lag to show. Defaults to 60. In either case the
            value is capped at ``n // 2 - 1`` (``n`` = number of usable
            returns) so the request stays strictly below half the sample.

    Returns:
        A Bokeh figure with one bar per lag (lag 0 included), a zero line and
        dashed +/-1.96/sqrt(n) significance bands. When fewer than 10 usable
        returns exist, an empty figure titled "(insufficient data)" is
        returned instead.
    """
    y = dfx['Return'].dropna().to_numpy()
    # Infinite returns (e.g. from a zero close) would turn every coefficient into NaN.
    y = y[np.isfinite(y)]
    n = len(y)
    if n < 10:
        return figure(height=350, width=450, title=f'{sym}: PACF (insufficient data)')

    # statsmodels refuses lag counts that reach half the sample size; the exact
    # boundary appears to differ between releases, so stay strictly below it.
    lag_cap = n // 2 - 1
    nlags = min(max_lags or 60, lag_cap)
    pacf_vals = sm_pacf(y, nlags=nlags, method='ywmle')
    lags = np.arange(len(pacf_vals))

    # Approximate 95% confidence band: +/- 1.96 / sqrt(N)
    conf = 1.96 / math.sqrt(n)

    p = figure(height=350, width=450, title=f'{sym}: PACF', tools='pan,wheel_zoom,box_zoom,reset,save')
    p.vbar(x=lags, top=pacf_vals, bottom=0, width=0.8, color='#4A6FA5')

    # Zero line and confidence bands
    zero = Span(location=0.0, dimension='width', line_color='black', line_width=1)
    up = Span(location=conf, dimension='width', line_color='firebrick', line_dash='dashed', line_width=1)
    dn = Span(location=-conf, dimension='width', line_color='firebrick', line_dash='dashed', line_width=1)
    for band in (zero, up, dn):
        p.add_layout(band)

    p.xaxis.axis_label = 'Lag'
    p.yaxis.axis_label = 'PACF'
    return p

In [None]:
# Load data, build plots, and display as Tabs (with safe fallback)
symbols = []
dfs = {}
for csv_path in sorted(DATA_DIR.glob('*.csv')):
    # Symbol = file-name text before the first underscore, upper-cased
    # (e.g. INFY_2015_2016.csv -> INFY).
    sym = csv_path.stem.split('_')[0].upper()
    try:
        df0 = load_symbol_csv(csv_path)
        df1 = preprocess(df0)
    except Exception as e:
        # Best effort: one unreadable file must not stop the other symbols.
        print(f'Failed to load {csv_path.name}: {e}')
        continue
    if len(df1) < 10:
        print(f'Skipping {csv_path.name}: fewer than 10 usable rows')
        continue
    if sym in dfs:
        # Two files share a prefix; keep a single entry so the symbol is not
        # plotted twice. The later file replaces the earlier one.
        print(f'Duplicate symbol {sym}: {csv_path.name} replaces earlier data')
    else:
        symbols.append(sym)
    dfs[sym] = df1

# Build per-symbol layouts
tab_panels = []
layouts = []
for sym in symbols:
    dfx = dfs[sym]
    p_ts = make_timeseries_plot(sym, dfx)
    p_pacf = make_pacf_plot(sym, dfx)
    lay = row(p_ts, p_pacf)
    layouts.append(lay)
    try:
        tab_panels.append(TabPanel(child=lay, title=sym))
    except Exception as e:
        # Report instead of silently dropping the tab; the stacked fallback
        # below still shows this symbol's plots.
        print(f'Could not create a tab for {sym}: {e}')

if tab_panels and len(tab_panels) == len(layouts):
    tabs = Tabs(tabs=tab_panels)
    show(tabs)
elif layouts:
    # Some or all tabs could not be created: stack every layout vertically
    # so that no symbol is lost.
    show(column(*layouts))
else:
    print('No valid data found in data/ folder.')

  np.subtract(arr, avg, out=arr, casting='unsafe', where=where)
