# Exploratory Data Analysis: Oslo Bysykkel

Adapted from the INF250 Lecture 12 companion notebook. This notebook applies basic EDA techniques to the Oslo city bike (Bysykkel) trip dataset, using the project's execution utilities for metadata tracking.

## Setup & Imports

In [3]:
# =============================================================================
# SETUP: Project paths and execution utils
# =============================================================================
from pathlib import Path
import json
import sys

cwd = Path.cwd()
# Locate the project root by walking up from the working directory to the
# nearest folder that contains package.json, so the same root is resolved
# whether the notebook is launched from the root or from any depth below it.
# If no marker is found, fall back to the earlier assumption that the
# notebook sits two levels below the root.
project_root = next(
    (folder for folder in (cwd, *cwd.parents) if (folder / "package.json").exists()),
    cwd.parent.parent,
)
raw_dir = project_root / "raw-data"            # input tree iterated by the load cell
prepared_dir = project_root / "prepared-data"  # parent folder of the summary output file

# Make the helpers under data-pipeline/ importable. The insert is guarded so
# that re-running this cell does not keep prepending duplicate sys.path entries.
pipeline_dir = str(project_root / "data-pipeline")
if pipeline_dir not in sys.path:
    sys.path.insert(0, pipeline_dir)
from execution_utils import show_execution_banner, write_with_execution_metadata

print("Project root:", project_root)
print("Raw data:", raw_dir)
print("Prepared data:", prepared_dir)

# Target file for the EDA summary written by the final cell.
out_path = prepared_dir / "eda_summary_stats.json"
# The banner's return value is kept and passed back to
# write_with_execution_metadata when the summary is written.
_pipeline_start_time = show_execution_banner(out_path)

Project root: c:\Users\Nicol\Desktop\INF252-Course-Project
Raw data: c:\Users\Nicol\Desktop\INF252-Course-Project\raw-data
Prepared data: c:\Users\Nicol\Desktop\INF252-Course-Project\prepared-data
No previous execution info (file does not exist yet).


In [4]:
# =============================================================================
# IMPORTS
# =============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# --- Shared plotting constants ------------------------------------------------
FIGSIZE = (10, 6)  # figsize passed to single-panel figures
COLORS = [
    '#2196F3',
    '#FF9800',
    '#4CAF50',
    '#E91E63',
    '#9C27B0',
]

# --- Global look & feel: style sheet first, then the palette --------------------
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette(COLORS)

# --- Reproducibility ------------------------------------------------------------
np.random.seed(42)

print('All imports loaded successfully.')

ModuleNotFoundError: No module named 'seaborn'

## Load Data

In [None]:
# =============================================================================
# Load trip data from raw-data/YYYY/MM.json
# =============================================================================
# `records` collects one (year, month, trip) tuple per trip; year and month are
# parsed from the directory name and the file stem respectively.
records = []
for year_dir in sorted(raw_dir.iterdir()):
    # Only all-digit directory names are treated as year folders. Stray files
    # and other folders (e.g. .ipynb_checkpoints) are skipped instead of
    # aborting the whole load with a ValueError from int().
    if not (year_dir.is_dir() and year_dir.name.isdecimal()):
        continue
    year = int(year_dir.name)
    for json_path in sorted(year_dir.glob("*.json")):
        # Same guard for files: ignore JSON files whose stem is not a month number.
        if not json_path.stem.isdecimal():
            continue
        month = int(json_path.stem)
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)
        # A file is either a bare list of trips or an object holding the list
        # under "data" or "trips"; an object with neither key yields no trips.
        trips = data if isinstance(data, list) else data.get("data", data.get("trips", []))
        records.extend((year, month, t) for t in trips)

print(f"Loaded {len(records)} trips from {len({(y, m) for y, m, _ in records})} month(s).")
if records:
    # Show the field names of the first trip as a quick schema check.
    _, _, sample = records[0]
    print("Sample trip keys:", list(sample.keys()))

In [None]:
# =============================================================================
# Convert to DataFrame for EDA
# =============================================================================
# DataFrame column -> key looked up on each raw trip dict. Missing keys become
# None via dict.get. Column order follows this mapping, after year and month.
trip_key_by_column = {
    "duration": "duration",
    "start_station_id": "start_station_id",
    "start_station_name": "start_station_name",
    "end_station_id": "end_station_id",
    "end_station_name": "end_station_name",
    "start_lat": "start_station_latitude",
    "start_lon": "start_station_longitude",
    "end_lat": "end_station_latitude",
    "end_lon": "end_station_longitude",
}

rows = [
    {
        "year": trip_year,
        "month": trip_month,
        **{column: trip.get(key) for column, key in trip_key_by_column.items()},
    }
    for trip_year, trip_month, trip in records
]

df = pd.DataFrame(rows)
print(f"DataFrame shape: {df.shape}")
print("\nFirst 5 rows:")
df.head()

## Dataset Info & Missing Values

In [None]:
# =============================================================================
# Dataset info and summary statistics
# =============================================================================
print("=== Dataset Info ===")
df.info()  # prints its own report (columns, non-null counts, dtypes)

print("\n=== Summary Statistics (numeric) ===")
# Leave the describe() table as the cell's last expression so it is rendered
# as rich output rather than printed.
numeric_summary = df.describe()
numeric_summary

In [None]:
# =============================================================================
# Missing values per column
# =============================================================================
# Build the boolean null mask once and reuse it for the counts and the heatmap.
null_mask = df.isnull()
missing_per_column = null_mask.sum()

print("=== Missing values per column ===")
print(missing_per_column)
print(f"\nTotal missing: {missing_per_column.sum()}")

# One heatmap row per trip, one column per field; row labels are suppressed.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    null_mask,
    cbar=False,
    yticklabels=False,
    cmap='YlOrRd',
    ax=ax,
)
ax.set_title('Missing Value Pattern (red = missing)', fontsize=14)
plt.tight_layout()
plt.show()

## Duration Distribution

In [None]:
# =============================================================================
# Filter to rows with valid duration for analysis
# =============================================================================
# Drop trips without a duration, then add the duration divided by 60 as
# `duration_min`. `df` itself is left untouched.
df_dur = (
    df.dropna(subset=['duration'])
      .assign(duration_min=lambda trips: trips['duration'] / 60)
)

shortest_s = df_dur['duration'].min()
longest_s = df_dur['duration'].max()
print(f"Trips with valid duration: {len(df_dur):,}")
print(f"Duration range: {shortest_s:.0f}s - {longest_s:.0f}s")

In [None]:
# =============================================================================
# Outlier detection: duration (index scatter, box plot and IQR fences)
# =============================================================================
duration_minutes = df_dur['duration_min']

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax_scatter, ax_box = axes

# Left panel: every trip plotted against its position in the frame.
ax_scatter.scatter(range(len(df_dur)), duration_minutes, alpha=0.3, s=5, c=COLORS[0])
ax_scatter.set_xlabel('Observation Index')
ax_scatter.set_ylabel('Duration (minutes)')
ax_scatter.set_title('Trip Duration: All Observations')

# Right panel: box plot of the same column.
df_dur.boxplot(column='duration_min', ax=ax_box)
ax_box.set_title('Box Plot of Trip Duration')
ax_box.set_ylabel('Duration (minutes)')
plt.tight_layout()
plt.show()

# Tukey fences: values more than 1.5 * IQR outside the quartiles are flagged.
Q1, Q3 = (duration_minutes.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
is_outlier = duration_minutes.lt(lower) | duration_minutes.gt(upper)
outliers = df_dur[is_outlier]
outlier_share = 100 * len(outliers) / len(df_dur)
print(f"IQR = {IQR:.1f} min, Fences: [{lower:.1f}, {upper:.1f}] min")
print(f"Potential outliers: {len(outliers):,} ({outlier_share:.1f}%)")

In [None]:
# =============================================================================
# Histogram of trip duration (trips longer than 60 min left out for visibility)
# =============================================================================
within_hour = df_dur['duration_min'] <= 60
dur_capped = df_dur.loc[within_hour, 'duration_min']

fig, ax = plt.subplots(figsize=FIGSIZE)
ax.hist(
    dur_capped,
    bins=50,
    color=COLORS[0],
    edgecolor='white',
    alpha=0.8,
)
ax.set_xlabel('Duration (minutes)')
ax.set_ylabel('Count')
ax.set_title('Distribution of Trip Duration (capped at 60 min)')
plt.tight_layout()
plt.show()

# The figures below are computed on ALL valid trips, not only on the capped
# subset drawn in the histogram.
all_minutes = df_dur['duration_min']
print(f"Mean: {all_minutes.mean():.1f} min")
print(f"Median: {all_minutes.median():.1f} min")
print(f"Std: {all_minutes.std():.1f} min")

## Temporal Patterns

In [None]:
# =============================================================================
# Average duration and trip count by month
# =============================================================================
# One row per (year, month) with the mean duration in minutes and the number of
# trips, plus a zero-padded "YYYY-MM" label used for the x-axis ticks.
by_month = (
    df_dur.groupby(['year', 'month'])
    .agg(
        avg_duration_min=('duration_min', 'mean'),
        trip_count=('duration', 'count'),
    )
    .reset_index()
    .assign(
        year_month=lambda g: g['year'].astype(str) + '-' + g['month'].astype(str).str.zfill(2)
    )
)

bar_positions = range(len(by_month))
tick_positions = range(0, len(by_month), 6)   # label every 6th month
tick_labels = by_month['year_month'].iloc[::6]

fig, axes = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

# (axis, column to plot, bar colour, y-axis label, panel title)
panels = [
    (axes[0], 'avg_duration_min', COLORS[0], 'Avg Duration (min)', 'Average Trip Duration by Month'),
    (axes[1], 'trip_count', COLORS[1], 'Trip Count', 'Trip Count by Month'),
]
for panel_ax, column, bar_color, y_label, panel_title in panels:
    panel_ax.bar(bar_positions, by_month[column], color=bar_color, alpha=0.8)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.set_xticks(tick_positions)
    panel_ax.set_xticklabels(tick_labels)

# Only the bottom panel carries the x-axis label.
axes[1].set_xlabel('Year-Month')
plt.tight_layout()
plt.show()

## Categorical Exploration

In [None]:
# =============================================================================
# Top start stations by trip count
# =============================================================================
# value_counts() sorts descending, so the first ten entries are the busiest.
station_counts = df_dur['start_station_name'].value_counts()
top_start = station_counts.head(10)

print("=== Top 10 start stations ===")
print(top_start)

fig, ax = plt.subplots(figsize=(10, 6))
top_start.plot.barh(ax=ax, color=COLORS[0], alpha=0.8)
ax.set_xlabel('Trip Count')
ax.set_title('Top 10 Start Stations')
ax.invert_yaxis()  # put the busiest station at the top of the chart
plt.tight_layout()
plt.show()

## Statistical Moments & Write Output

In [None]:
# =============================================================================
# Four statistical moments of trip duration
# =============================================================================
dur = df_dur['duration_min']

# Computed with pandas defaults: var() is the sample variance (ddof=1) and
# kurtosis() is excess (Fisher) kurtosis, i.e. 0 for a normal distribution.
moment_mean = dur.mean()
moment_variance = dur.var()
moment_skewness = dur.skew()
moment_kurtosis = dur.kurtosis()

print("=== Four Moments of Trip Duration (minutes) ===")
print(f"Mean:     {moment_mean:.2f}")
print(f"Variance: {moment_variance:.2f}")
print(f"Skewness: {moment_skewness:.2f}")
print(f"Kurtosis: {moment_kurtosis:.2f}")

In [None]:
# =============================================================================
# Write EDA summary to prepared-data with execution metadata
# =============================================================================
# Values are cast to built-in int/float so the summary holds plain Python
# numbers rather than numpy scalars.
duration_stats = {
    "mean_min": float(dur.mean()),
    "median_min": float(dur.median()),
    "std_min": float(dur.std()),
    "skewness": float(dur.skew()),
    "kurtosis": float(dur.kurtosis()),
}

monthly_columns = ['year', 'month', 'avg_duration_min', 'trip_count']
avg_by_month = [
    {
        "year": int(month_row.year),
        "month": int(month_row.month),
        "avg_duration_min": float(month_row.avg_duration_min),
        "trip_count": int(month_row.trip_count),
    }
    for month_row in by_month[monthly_columns].itertuples(index=False)
]

eda_summary = {
    "n_trips": len(df_dur),
    "n_months": len(by_month),
    "duration_stats": duration_stats,
    "avg_by_month": avg_by_month,
}

write_with_execution_metadata(out_path, eda_summary, _pipeline_start_time)
print(f"Wrote EDA summary to {out_path}")