# Bike analysis



### Set up

In [17]:
import pandas as pd
import json
from pathlib import Path
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import re

# Shared look-and-feel: three accent colours used by the charts below, plus
# the Plotly template installed as the default for new figures.
COL1 = "#1E88E5"
COL2 = "#D81B60"
COL3 = "#7CB342"

TEMPLATE = "presentation"
pio.templates.default = TEMPLATE

def bump_fonts(fig, base=20):
    """Resize a figure's fonts around one base size and return the figure.

    Args:
        fig: Figure-like object exposing ``update_layout``, ``update_xaxes``
            and ``update_yaxes``.
        base: Size used for body text, legend and axis titles. The figure
            title is two units larger and tick labels two units smaller.

    Returns:
        The same ``fig`` object that was passed in, so the call can be
        chained (``bump_fonts(fig).show()``).
    """
    title_size = base + 2
    tick_size = base - 2

    fig.update_layout(
        font={"size": base},
        title_font={"size": title_size},
        legend_font={"size": base},
    )
    # Both axes get identical treatment; x first, then y.
    for update_axes in (fig.update_xaxes, fig.update_yaxes):
        update_axes(title_font={"size": base}, tickfont={"size": tick_size})
    return fig

# First number in a string.  The first alternative recognises comma-grouped
# thousands ("1,254", "12,000.5"); the second is the plain integer/decimal
# form ("649", ".5", "82.6").  The (?!\d) guard keeps "1,2345" from being
# read as a grouped number.
_NUMBER_RE = re.compile(
    r"[-+]?(?:\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d*\.?\d+)"
)


def to_float(txt, unit=None):
    """Extract the first number found in ``txt`` and optionally convert it.

    Args:
        txt: Any scalar; it is stringified before searching. Missing values
            (as judged by ``pd.isna``) yield ``None``.
        unit: Optional conversion key. ``"in_to_mm"`` multiplies the number
            by 25.4 and ``"lbft_to_Nm"`` by 1.35582. Any other value leaves
            the number unchanged.

    Returns:
        The parsed (and possibly converted) value as ``float``, or ``None``
        when ``txt`` is missing or contains no number.

    Comma-grouped thousands such as ``"1,254 cc"`` are read as 1254.0; a
    comma that is not followed by exactly three digits is not treated as
    part of the number (``"1,5"`` still yields 1.0).
    """
    if pd.isna(txt):
        return None

    match = _NUMBER_RE.search(str(txt))
    if match is None:
        return None

    # Thousands separators are the only commas the pattern can capture.
    num = float(match.group().replace(",", ""))

    if unit == "in_to_mm":
        num *= 25.4
    elif unit == "lbft_to_Nm":
        num *= 1.35582
    return num

# File I/O
JSON_PATH = Path("data/raw.json")

# 1. load JSON
with JSON_PATH.open("r", encoding="utf-8") as f:
    data = json.load(f)

# 2. normalize structure: a list is used as-is, a dict contributes its
#    values, and any other top-level JSON type is rejected.
if not isinstance(data, (list, dict)):
    raise ValueError("JSON must be list or dict")
if isinstance(data, dict):
    data = list(data.values())

# 3. to DataFrame
df = pd.json_normalize(data)

# 4. initial cleaning
df.columns = df.columns.str.strip()
# Drop spreadsheet-style "Unnamed: N" columns and columns with an empty name.
df = df.loc[:, ~df.columns.str.fullmatch(r"Unnamed:.*|^$")]
df = df.dropna(axis=1, how="all")
df = df.rename(columns={
    "Power HP": "Power_HP",
    "Price as new Euros": "Price_EUR",
})


def _torque_to_nm(txt):
    """Return the torque in newton-metres parsed from a free-text value.

    A figure explicitly labelled "Nm" (also "N·m", "N-m", "N m") anywhere in
    the text is returned unchanged.  Otherwise the first number is treated
    as lb-ft and converted, which is what this cell previously did for every
    value regardless of its stated unit.
    """
    if pd.isna(txt):
        return None
    nm_match = re.search(
        r"([-+]?\d*\.?\d+)\s*N\W?m\b", str(txt), flags=re.IGNORECASE
    )
    if nm_match:
        return float(nm_match.group(1))
    return to_float(txt, unit="lbft_to_Nm")


# 5. numeric conversions
# Each derived column is only created when its source column exists, so the
# column-presence guards used by the plotting cells are meaningful.
# NOTE(review): displacement, seat height and fuel capacity take the first
# number verbatim — assumes the source is already in cc / mm / litres; confirm.
_NUMERIC_SOURCES = {
    "Displacement_cc": ("Displacement", to_float),
    "Torque_Nm": ("Torque", _torque_to_nm),
    "Seat_height_mm": ("Seat height", to_float),
    "Fuel_capacity_l": ("Fuel capacity", to_float),
}
for _target, (_source, _parser) in _NUMERIC_SOURCES.items():
    if _source in df.columns:
        df[_target] = df[_source].apply(_parser)

### First diagrams

In [18]:
# A) category pie
# One row per category with its number of models.
cat_counts = df["Category"].value_counts().reset_index()
cat_counts.columns = ["Category", "count"]

fig1 = px.pie(
    cat_counts,
    values="count",
    names="Category",
    hole=0.35,
    title="<b>Models by Category</b>",
    color_discrete_sequence=[COL1, COL2, COL3],
)
bump_fonts(fig1)
fig1.show()

# B) displacement by category
# Bucket edges are right-inclusive (right=True): exactly 250 cc lands in
# the first bucket, exactly 500 cc in the second, and so on.
labels = ["0–250", "251–500", "501–1000", "1001–2000", ">2000"]
bins = [0, 250, 500, 1000, 2000, np.inf]
df["Disp_cat"] = pd.cut(
    df["Displacement_cc"],
    bins=bins,
    labels=labels,
    right=True,
)

# Count models per bucket, then force the rows into bucket order.
bucket_sizes = df["Disp_cat"].value_counts()
disp_counts = bucket_sizes.reindex(labels).reset_index()
disp_counts.columns = ["Displacement Range (cc)", "count"]

fig2 = px.bar(
    disp_counts,
    y="count",
    x="Displacement Range (cc)",
    color_discrete_sequence=[COL1],
    title="<b>Count of Models by Displacement Range</b>",
)
# Keep the x axis in bucket order as well.
fig2.update_xaxes(categoryarray=labels, categoryorder="array")
bump_fonts(fig2)
fig2.show()


# 6-A) Scatter: Displacement vs Power, coloured by Torque
# Guard on every column the figure reads (x, y and colour), not only on
# power and torque.
if {"Displacement_cc", "Power_HP", "Torque_Nm"}.issubset(df.columns):
    fig_scatter = px.scatter(
        df,
        x="Displacement_cc", y="Power_HP",
        color="Torque_Nm",
        # Hover titles are optional: without a "Model" column fall back to
        # Plotly's default hover instead of failing on the missing column.
        hover_name="Model" if "Model" in df.columns else None,
        title="<b>Scatter: Displacement vs Power (color=Torque)</b>",
        color_continuous_scale=px.colors.sequential.Viridis,
        template=TEMPLATE
    )
    bump_fonts(fig_scatter).show()

# 6-B) Heatmap: correlation matrix of numeric specs
num = ["Displacement_cc", "Torque_Nm", "Seat_height_mm", "Fuel_capacity_l", "Power_HP"]
available = [c for c in num if c in df.columns]
if len(available) >= 2:
    # Coerce to numeric first so a column holding text or None becomes NaN.
    # Otherwise corr() either raises or silently drops that column,
    # depending on the pandas version.
    corr = df[available].apply(pd.to_numeric, errors="coerce").corr()
    fig_heat = go.Figure(
        go.Heatmap(
            z=corr.values,
            # Take the labels from the matrix itself so they can never get
            # out of step with the rows/columns of z.
            x=list(corr.columns), y=list(corr.index),
            colorscale="Blues"
        )
    )
    fig_heat.update_layout(title="<b>Heatmap: Numeric Specs Correlation</b>", template=TEMPLATE)
    bump_fonts(fig_heat).show()

# 6-C) Boxplot: Seat height by Category
# Skipped entirely when no seat-height column is present.
if "Seat_height_mm" in df.columns:
    box_options = {
        "x": "Category",
        "y": "Seat_height_mm",
        "title": "<b>Boxplot: Seat Height per Category</b>",
        "color_discrete_sequence": [COL2],
        "template": TEMPLATE,
    }
    fig_box = px.box(df, **box_options)
    bump_fonts(fig_box)
    fig_box.show()
