# Dataset Analisys

> to start, let's import all libraries we'll use, as well as our dataset


### Import libraries

In [170]:
import pandas as pd
import json
from pathlib import Path
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import re
import matplotlib.pyplot as plt

### Import dataset from JSON

In [171]:
JSON_PATH = Path("data/raw.json")

# 1. Open JSON
with open(JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# 2. Proceed only if it's a list of records
if isinstance(data, dict):
    data = list(data.values())
elif not isinstance(data, list):
    raise ValueError("The JSON file seems to be empty or not a list of records.")

# 3. Convert to DataFrame, normalizing
df = pd.json_normalize(data)

# 4. Remove empty columns, or columns with all NaN values
df.columns = [col.strip() for col in df.columns]
df = df.loc[:, ~df.columns.str.fullmatch(r"Unnamed:.*|^$")]
df = df.dropna(axis=1, how="all")

### General info about the dataset

In [172]:
print(f"Il dataframe ha {df.shape[1]} colonne.")
print(f"Il dataframe ha {df.shape[0]} righe.")

# Rename some columns
df = df.rename(columns={
    "Power HP": "Power_HP",
})

print(f"La moto più vecchia è del {df['Year'].min()}")
print(f"La moto più recente è del {df['Year'].max()}.")

display(df.columns)

Il dataframe ha 77 colonne.
Il dataframe ha 38624 righe.
La moto più vecchia è del 1894
La moto più recente è del 2021.


Index(['Model', 'Year', 'Category', 'Rating', 'Displacement', 'Engine type',
       'Torque', 'Bore x stroke', 'Fuel system', 'Fuel control',
       'Cooling system', 'Gearbox', 'Transmission type,final drive',
       'Frame type', 'Rake (fork angle)', 'Trail', 'Front suspension',
       'Rear suspension', 'Rear wheel travel', 'Front tyre', 'Rear tyre',
       'Front brakes', 'Diameter', 'Rear brakes', 'Seat height',
       'Ground clearance', 'Wheelbase', 'Fuel capacity', 'Color options',
       'Starter', 'Comments', 'Insurance costs', 'Finance options',
       'Parts finder', 'Ask questions', 'Related bikes', 'Price as new',
       'Engine details', 'Power', 'Compression', 'Valves per cylinder',
       'Ignition', 'Lubrication system', 'Clutch', 'Driveline',
       'Fuel consumption', 'Greenhouse gases', 'Emission details',
       'Exhaust system', 'Front wheel travel', 'Wheels', 'Dry weight',
       'Weight incl. oil, gas, etc', 'Power/weight ratio', 'Overall height',
       'Overa

### Particular columns to analyze

In [173]:
def show_col(col):
    return df[col].head(5)

# Useless columns, to drop
display(show_col("Insurance costs"))

display(show_col("Ask questions"))


# Interesting columns, perhaps to analyze
display(show_col("Greenhouse gases"))



0    Compare US insurance quotes from the nation's ...
1    Compare US insurance quotes from the nation's ...
2    Compare US insurance quotes from the nation's ...
3    Compare US insurance quotes from the nation's ...
4    Compare US insurance quotes from the nation's ...
Name: Insurance costs, dtype: object

0    Join the 21 AJP PR7 discussion group or the ge...
1    Join the 21 Aprilia Dorsoduro 900 discussion g...
2    Join the 21 Aprilia RS 125 discussion group or...
3    Join the 21 Aprilia RS 125 GP Replica  discuss...
4    Join the 21 Aprilia RS 50 discussion group or ...
Name: Ask questions, dtype: object

0                                                 NaN
1    129.9 CO2 g/km. (CO2 - Carbon dioxide emission) 
2                                                 NaN
3     62.9 CO2 g/km. (CO2 - Carbon dioxide emission) 
4     57.8 CO2 g/km. (CO2 - Carbon dioxide emission) 
Name: Greenhouse gases, dtype: object

### Define useful functions

In [174]:
# Convert to numeric
def to_float(txt, unit=None):
    if pd.isna(txt): return None
    m = re.search(r"[-+]?\d*\.?\d+", str(txt))
    if not m: return None
    num = float(m.group())
    if unit=="in_to_mm":   num *= 25.4
    if unit=="lbft_to_Nm": num *= 1.35582
    return num

# We use this function to make the fonts bigger, as well as changing the background color
def set_theme(fig, font_size=20, bg_color="#141415", font_color="white", grid_color="#333333"):
    fig.update_layout(
        paper_bgcolor=bg_color,
        plot_bgcolor=bg_color,
        font=dict(size=font_size, color=font_color),
        xaxis=dict(gridcolor=grid_color, zerolinecolor=grid_color),
        yaxis=dict(gridcolor=grid_color, zerolinecolor=grid_color),
        legend=dict(bgcolor="rgba(0,0,0,0)")  # Transparent legend background
    )
    return fig

# Convert Price to numeric
def parse_price(value):
    if pd.isna(value):
        return np.nan
    
    # Search for currency and amount
    # Example: "Euro 9990,00" o "US$ 9990.00"
    match = re.search(r'(Euro|US\$)\s*([\d,\.]+)', str(value))
    if not match:
        return np.nan
    
    currency, amount_str = match.groups()
    
    # Convert to float and handle commas and periods
    # Example: "9990,00" -> 9990.00
    amount = float(amount_str.replace(',', '').replace('.', ''))
    
    # Handle different currencies, we use a fixed conversion rate for simplicity
    if currency == 'Euro':
        return amount * 1.1 
    elif currency == 'US$':
        return amount
    else:
        return np.nan
    
# Extract the numeric part of the Power column
def extract_hp(value):
    match = re.search(r'\d+', str(value))
    return int(match.group()) if match else None


### Cleaning of dataset and inizializations

In [175]:
def _num(s, patt=r"([\d\.]+)"):
    """estrae la prima parte numerica da una stringa"""
    return (s.str.extract(patt, expand=False)
              .str.replace(",", "", regex=False)
              .astype(float))

# Convert certain columns to numeric
df["Displacement_cc"]   = df["Displacement"].apply(to_float)
df["Torque_Nm"]         = df["Torque"].apply(lambda x: to_float(x, unit="lbft_to_Nm") or to_float(x))
df["Seat_height_mm"]    = df["Seat height"].apply(to_float)
df["Fuel_capacity_l"]   = df["Fuel capacity"].apply(to_float)
df["Year"]   = df["Year"].apply(to_float)
df["Power_hp"]        = _num(df["Power"])
df["Fuel_l_100"]      = _num(df["Fuel consumption"])
df["CO2_g_km"]        = _num(df["Greenhouse gases"])
df["Price_raw"]       = _num(df["Price as new"].fillna("0"))


df["Price"] = df["Price as new"].apply(parse_price)
df["HP"] = df["Power"].apply(extract_hp)

# palette & template
TEMPLATE = "presentation"
pio.templates.default = TEMPLATE



## Charts
 > Let's finnally see some charts

### Models by category
 > Let's see the most common type of motorcycle, leaving out those that appear less in the dataset


In [176]:
# Group by most used, leave the others in "Other"
df_grouped = df.groupby("Category").size().reset_index(name="count")

df_top_types = df_grouped[df_grouped["count"] >= 1500].sort_values(by="count", ascending=False)
other_types = df_grouped[df_grouped["count"] < 1500].sum()["count"]

# Create a new DataFrame with the top types and the "Other" category
df_grouped = pd.concat([df_top_types, pd.DataFrame([{'Category': 'Other', 'count': other_types}])], ignore_index=True)

# Concat the top types and the "Other" category
df_pie = pd.concat(
    [df_top_types, pd.DataFrame([{"Category": "Other", "count": other_types}])],
    ignore_index=True
)

# Sort the DataFrame by count in descending order
ordered_categories = list(df_top_types["Category"]) + ["Other"]
df_pie["Category"] = pd.Categorical(
    df_pie["Category"], categories=ordered_categories, ordered=True
)
gray_seq = ["gray"] + ["gray"] + ["gray"] + ["gray"] + ["white"]
color_seq = px.colors.qualitative.Plotly[:5] + gray_seq  # bianco per Other


# Create the pie chart
fig1 = px.pie(
    df_pie,
    names="Category",
    values="count",
    title="<b>Models by Type of bike</b>",
    hole=0.35,
    color_discrete_sequence=color_seq,
    category_orders={"Category": ordered_categories},
)

# Rotate the pie chart, make it counterclockwise, don't atutomatically sort
fig1.update_traces(
    direction="counterclockwise",
    sort=False,
    rotation=-61
)


set_theme(fig1).show()

### Counts of bikes per cc
 > Let's figure out how many bikes were produced per cc category

In [177]:
bins   = [0, 125, 400, 700, 1000, 2000, np.inf]
labels = ["0–125 (A1)", "125–400 (A-lim)", "400–700 (A)", 
          "700–1000 (A)", "1000-2000 (A)", "2000+ (A)"]

# A new column with the categories in which every model falls
df["Disp_cat"] = pd.cut(df["Displacement_cc"], bins=bins, labels=labels, right=True)

# For each category, we calculate the count and the mode            
count_df  = (df["Disp_cat"]
             .value_counts()
             .reindex(labels)
             .rename("Count")
             .reset_index()
             .rename(columns={"Disp_cat": "Displacement Range (cc)"}))

# Calulates the mode for each category
mode_df = (
    df.groupby("Disp_cat", observed=False)["Displacement_cc"]
    .agg(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
    .reindex(labels)
    .reset_index()
    .rename(columns={
        "Disp_cat": "Displacement Range (cc)",
        "Displacement_cc": "mode_cc"
    })
)

# Merge the two DataFrames
disp_summary = count_df.merge(mode_df, on="Displacement Range (cc)")
# Display the mode in a more readable format
disp_summary["mode_text"] = "Mode: " + disp_summary["mode_cc"].round(0).astype(int).astype(str)


fig2 = px.bar(
    disp_summary,
    x="Displacement Range (cc)",
    y="Count",
    #This will show the mode on the bar
    text="mode_text",

    title="<b>Count of Models by CC</b>",
    color_discrete_sequence=["#987434"],
    height=650
)

fig2.update_xaxes(categoryorder="array", categoryarray=labels)

fig2.update_traces(
    texttemplate="%{text}", 
    textposition="inside",
    insidetextanchor="middle", 
    textfont=dict(color="white", size=20)
)



set_theme(fig2).show()


### Price related to a lot of stuff

In [178]:

# Keep only the columns we need for this scatter
plot_df = df.dropna(subset=["Year", "Displacement_cc", "HP", "Price"])

# There's an error in the data, we need to remove the outliers
plot_df = plot_df[plot_df["HP"] < 400]

plot_df = plot_df[plot_df["Year"] > 1980]
plot_df = plot_df[plot_df["Displacement_cc"] < 3000]


fig3 = px.scatter(
    plot_df,
    x="Year",
    y="Displacement_cc",
    size="Price",
    color="HP",
    hover_name="Model", # when hovering, show the model, really cute
    color_continuous_scale="YlOrRd",
    range_color=[0, 200],
    opacity=0.6,
    height=700,
    title="Motorcycle Prices by Displacement and Year",
    labels={
        "Year": "Year",
        "Displacement_cc": "Displacement (cc)",
        "Price": "Price (scaled size)",
        "HP": "Horsepower"
    }
)
# Remove the borders from every point
fig3.update_traces(marker=dict(line=dict(width=0)))

# In this for we add the horizontal lines, these are the limits for the A1, A-lim, A and A2 licenses

fig3.update_yaxes(
    tickvals=[125, 250, 600, 1000, 2000],
    title="Displacement (cc)"
)

# Asse x con tick fissi (opzionale)
fig3.update_xaxes(
    tickvals=[1980, 1990, 2000, 2010, 2015, 2020],
    title="Year"
)

fig3.update_layout(
    coloraxis_colorbar=dict(title="Horsepower (HP)"),
)
set_theme(fig3)
fig3.show()


# riportare i prezzi tenendo conto dell'inflazione

#scatter plot su CC e HP, 
# magari scartare il tempo, interessarsi meglio sul prezzo


### New ones

In [205]:

# efficienza “km con 1 litro” – più alto ⇒ più verde
df["Fuel_km_l"] = 100 / df["Fuel_l_100"]

# 2. PREZZI ADEGUATI ALL’INFLAZIONE
cpi = {
    2019: 103.1, 2020: 104.2, 2021: 109.0, 2022: 119.0, 2023: 124.5, 2024: 128.0
}
current_cpi = max(cpi.values())  
df["Price_adj"] = (
    df
    .dropna(subset=["Year", "Price_raw"])
    .apply(lambda r: r.Price_raw * current_cpi / cpi.get(int(r.Year), current_cpi),
            axis=1)
) # ── 1. Colonne “green” / inflazione  ─────────────────────────────────────────
df["Fuel_l_100"] = df["Fuel consumption"].str.extract(r"([\d\.]+)").astype(float)
df["CO2_g_km"]   = df["Greenhouse gases"].str.extract(r"([\d\.]+)").astype(float)
df["Fuel_km_l"]  = 100 / df["Fuel_l_100"]

# Scatter “verde” (niente prezzi / inflazione)
fig_scatter = px.scatter(
    df.dropna(subset=["Displacement_cc", "HP", "Fuel_km_l"]),
    x="Displacement_cc",
    y="HP",
    color="Fuel_km_l",
    color_continuous_scale="Greens",
    hover_data=["Model", "Year", "Fuel_l_100", "Fuel_km_l"],
    title="<b>Cilindrata vs Potenza</b><br><sup>Tinta = km/l (più verde ⇒ più efficiente)</sup>",
    height=600
)
fig_scatter.update_traces(marker=dict(size=9, line=dict(width=0.5, color="black")))
fig_scatter.update_xaxes(title="Cilindrata (cc)")
fig_scatter.update_yaxes(title="Potenza (HP)")
set_theme(fig_scatter).show()

In [193]:
# ── Box CO₂ per categoria – con margini ampi ────────────────────────────────
fig_box = px.box(
    df.dropna(subset=["CO2_g_km"]),
    x="Category",
    y="CO2_g_km",
    color="Category",
    title="<b>Impronta CO₂ per segmento</b>",
    height=650,               # più alto, spazio extra sotto
    points=False
)

fig_box.update_yaxes(title="g CO₂ / km")

# margini più larghi (b = bottom)
fig_box.update_layout(
    margin=dict(t=80, r=40, l=90, b=340)
)

set_theme(fig_box).show()

In [196]:
# ───────── 4. Scatter Top-Speed vs HP (dimensione = peso) ────────────────────
fig_speed = px.scatter(
    df.dropna(subset=["Top_speed_kmh","HP","Weight_kg"]),
    x="HP", y="Top_speed_kmh",
    size="Weight_kg", size_max=18,
    color="Category",
    hover_data=["Model","Year","Weight_kg"],
    title="<b>Velocità massima vs Cavalli</b><br><sup>Bolla = peso</sup>",
    height=600
)
fig_speed.update_xaxes(title="Potenza (HP)")
fig_speed.update_yaxes(title="Top speed (km/h)")
set_theme(fig_speed).show()

In [204]:
# ── Box-plot 0-100 km/h – senza Scooter / Allround / Sport ──────────────────
if "Zero100_s" not in df.columns:
    df["Zero100_s"] = df["0-100 km/h (0-62 mph)"].str.extract(r"([\d\.]+)").astype(float)

# Filtra le categorie da escludere
mask = ~df["Category"].str.contains(r"^(Scooter|Allround|Sport|Trial|Touring|Minibike, sport)$", case=False, na=False)
box_df = df[mask].dropna(subset=["Zero100_s"])

fig_box2 = px.box(
    box_df,
    x="Category",
    y="Zero100_s",
    color="Category",
    title="<b>0–100 km/h (s) per segmento</b>",
    points=False,
    height=650          # più alto
)

fig_box2.update_yaxes(title="Secondi (più basso = più scattante)")

fig_box2.update_layout(margin=dict(t=80, r=60, l=60, b=180))

set_theme(fig_box2).show()


This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.



In [186]:
# ───────────────────────────────────────────────
# 3 ─ Line-plot  ▸ trend medio annuo CO₂
# ───────────────────────────────────────────────
avg_co2 = (df.dropna(subset=["Year","CO2_g_km"])
             .groupby("Year", as_index=False)["CO2_g_km"]
             .mean())

fig_line = px.line(
    avg_co2, x="Year", y="CO2_g_km", markers=True,
    title="<b>Media annua delle emissioni CO₂</b>",
    height=500
)
fig_line.update_yaxes(title="g CO₂ / km")
set_theme(fig_line).show()