# Interactive EDA Notebook
This notebook provides interactive versions of the EDA plots, allowing for time-range filtering.

In [1]:
import os
import pandas as pd
import sys

# Ensure we are in correct context
cwd = os.getcwd()
if cwd.endswith("eda"):
    BASE_DIR = os.path.join(cwd, "..")
    sys.path.append(cwd)
else:
    # Assuming we might be at root running this
    BASE_DIR = cwd
    sys.path.append(os.path.join(cwd, "eda"))

print(f"Base Dir: {BASE_DIR}")

Base Dir: /Volumes/HP_P900/Users/tungnguyen/Library/CloudStorage/GoogleDrive-nguyenlamtungthptltt@gmail.com/Meine Ablage/03_PersonalWork/FomoKaguya2026/eda/..


In [2]:
import plot_utils
import plot_utils_interactive
import importlib
importlib.reload(plot_utils)
importlib.reload(plot_utils_interactive)

<module 'plot_utils_interactive' from '/Volumes/HP_P900/Users/tungnguyen/Library/CloudStorage/GoogleDrive-nguyenlamtungthptltt@gmail.com/Meine Ablage/03_PersonalWork/FomoKaguya2026/eda/plot_utils_interactive.py'>

In [3]:
# Load Data
processed_dir = os.path.join(BASE_DIR, "data", "processed")
csv_path = os.path.join(processed_dir, "train.csv")

if os.path.exists(csv_path):
    print("Loading dataset... this may take a moment.")
    df = pd.read_csv(csv_path)
    
    # Preprocessing
    if 'time' in df.columns:
        df['time'] = pd.to_datetime(df['time'], utc=True) # Ensure UTC for consistency
        df.set_index('time', inplace=True)
        df.sort_index(inplace=True)
    
    if 'status' in df.columns:
        df['status'] = df['status'].astype(str)
        
    print(f"Loaded {len(df)} records.")
    display(df.head())
else:
    print(f"Error: {csv_path} not found.")

Loading dataset... this may take a moment.
Loaded 2934932 records.


Unnamed: 0_level_0,ip,identd,user,request,status,size,resource,protocol,utc,status_label
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1995-07-01 00:00:01+00:00,199.72.81.55,-,-,GET /history/apollo/ HTTP/1.0,200,6245.0,/history/apollo/,HTTP/1.0,UTC - 04,Success
1995-07-01 00:00:06+00:00,unicomp6.unicomp.net,-,-,GET /shuttle/countdown/ HTTP/1.0,200,3985.0,/shuttle/countdown/,HTTP/1.0,UTC - 04,Success
1995-07-01 00:00:09+00:00,199.120.110.21,-,-,GET /shuttle/missions/sts-73/mission-sts-73.ht...,200,4085.0,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,UTC - 04,Success
1995-07-01 00:00:11+00:00,burger.letters.com,-,-,GET /shuttle/countdown/liftoff.html HTTP/1.0,304,0.0,/shuttle/countdown/liftoff.html,HTTP/1.0,UTC - 04,No Change
1995-07-01 00:00:11+00:00,199.120.110.21,-,-,GET /shuttle/missions/sts-73/sts-73-patch-smal...,200,4179.0,/shuttle/missions/sts-73/sts-73-patch-small.gif,HTTP/1.0,UTC - 04,Success


## Interactive Plots
Use the sliders to select the time range for analysis.

In [4]:
plot_utils_interactive.print_macro_overview(df)
print('\n' * 2)
plot_utils_interactive.plot_timeline_overview(df, interval='30T')

VBox(children=(Label(value='Selected: 1995-07-01 00:00  to  1995-08-22 23:59'), SelectionRangeSlider(continuou…






VBox(children=(HBox(children=(Dropdown(description='Interval:', index=4, layout=Layout(width='200px'), options…

In [5]:
plot_utils_interactive.plot_weekly_heatmap(df)

VBox(children=(HBox(children=(Dropdown(description='Interval:', index=5, layout=Layout(width='200px'), options…

In [6]:
plot_utils_interactive.plot_weekly_patterns(df)

VBox(children=(HBox(children=(Dropdown(description='Interval:', index=5, layout=Layout(width='200px'), options…

In [7]:
plot_utils_interactive.analyze_status_distribution(df)

VBox(children=(HBox(children=(Dropdown(description='Interval:', index=1, layout=Layout(width='200px'), options…

In [8]:
plot_utils_interactive.plot_daily_profile(df)

VBox(children=(HBox(children=(Dropdown(description='Interval:', index=4, layout=Layout(width='200px'), options…

In [9]:
plot_utils_interactive.plot_file_type_stats(df)

VBox(children=(Label(value='Selected: 1995-07-01 00:00  to  1995-08-22 23:59'), SelectionRangeSlider(continuou…

In [10]:
plot_utils_interactive.plot_top_users(df)

VBox(children=(Label(value='Selected: 1995-07-01 00:00  to  1995-08-22 23:59'), SelectionRangeSlider(continuou…

In [11]:
plot_utils_interactive.plot_status_breakdown(df)

VBox(children=(Label(value='Selected: 1995-07-01 00:00  to  1995-08-22 23:59'), SelectionRangeSlider(continuou…

In [12]:
plot_utils_interactive.plot_rolling_statistics(df, window='1h')

VBox(children=(HBox(children=(Dropdown(description='Window:', index=5, layout=Layout(width='200px'), options=(…

In [13]:
plot_utils_interactive.plot_anomaly_spikes(df)

VBox(children=(HBox(children=(Dropdown(description='Interval:', index=1, layout=Layout(width='200px'), options…

In [14]:
plot_utils_interactive.plot_status_evolution(df)

VBox(children=(HBox(children=(Dropdown(description='Interval:', index=10, layout=Layout(width='200px'), option…

In [15]:
plot_utils_interactive.plot_size_distribution(df)


VBox(children=(Label(value='Selected: 1995-07-01 00:00  to  1995-08-22 23:59'), SelectionRangeSlider(continuou…