# Dataset Analisys

> to start, let's import all libraries we'll use, as well as our dataset


### Import libraries

In [1]:
import pandas as pd
import json
from pathlib import Path
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import re
import matplotlib.pyplot as plt

### Import dataset from JSON

In [2]:
JSON_PATH = Path("data/raw.json")

# 1. Open JSON
with open(JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# 2. Proceed only if it's a list of records
if isinstance(data, dict):
    data = list(data.values())
elif not isinstance(data, list):
    raise ValueError("The JSON file seems to be empty or not a list of records.")

# 3. Convert to DataFrame, normalizing
df = pd.json_normalize(data)

# 4. Remove empty columns, or columns with all NaN values
df.columns = [col.strip() for col in df.columns]
df = df.loc[:, ~df.columns.str.fullmatch(r"Unnamed:.*|^$")]
df = df.dropna(axis=1, how="all")

### General info about the dataset

In [3]:
print(f"Il dataframe ha {df.shape[1]} colonne.")
print(f"Il dataframe ha {df.shape[0]} righe.")

# Rename some columns
df = df.rename(columns={
    "Power HP": "Power_HP",
})

print(f"La moto più vecchia è del {df['Year'].min()}")
print(f"La moto più recente è del {df['Year'].max()}.")

display(df.columns)

Il dataframe ha 77 colonne.
Il dataframe ha 38624 righe.
La moto più vecchia è del 1894
La moto più recente è del 2021.


Index(['Model', 'Year', 'Category', 'Rating', 'Displacement', 'Engine type',
       'Torque', 'Bore x stroke', 'Fuel system', 'Fuel control',
       'Cooling system', 'Gearbox', 'Transmission type,final drive',
       'Frame type', 'Rake (fork angle)', 'Trail', 'Front suspension',
       'Rear suspension', 'Rear wheel travel', 'Front tyre', 'Rear tyre',
       'Front brakes', 'Diameter', 'Rear brakes', 'Seat height',
       'Ground clearance', 'Wheelbase', 'Fuel capacity', 'Color options',
       'Starter', 'Comments', 'Insurance costs', 'Finance options',
       'Parts finder', 'Ask questions', 'Related bikes', 'Price as new',
       'Engine details', 'Power', 'Compression', 'Valves per cylinder',
       'Ignition', 'Lubrication system', 'Clutch', 'Driveline',
       'Fuel consumption', 'Greenhouse gases', 'Emission details',
       'Exhaust system', 'Front wheel travel', 'Wheels', 'Dry weight',
       'Weight incl. oil, gas, etc', 'Power/weight ratio', 'Overall height',
       'Overa

### Define useful functions

In [None]:
# Convert to numeric
def to_float(txt, unit=None):
    if pd.isna(txt): return None
    m = re.search(r"[-+]?\d*\.?\d+", str(txt))
    if not m: return None
    num = float(m.group())
    if unit=="in_to_mm":   num *= 25.4
    if unit=="lbft_to_Nm": num *= 1.35582
    return num

# We use this function to make the fonts bigger, as well as changing the background color
def set_theme(fig, font_size=20, bg_color="#141415", font_color="white", grid_color="#333333"):
    fig.update_layout(
        paper_bgcolor=bg_color,
        plot_bgcolor=bg_color,
        font=dict(size=font_size, color=font_color),
        xaxis=dict(gridcolor=grid_color, zerolinecolor=grid_color),
        yaxis=dict(gridcolor=grid_color, zerolinecolor=grid_color),
        legend=dict(bgcolor="rgba(0,0,0,0)")  # Transparent legend background
    )
    return fig

# Convert Price to numeric
def parse_price(value):
    if pd.isna(value):
        return np.nan
    
    # Search for currency and amount
    # Example: "Euro 9990,00" o "US$ 9990.00"
    match = re.search(r'(Euro|US\$)\s*([\d,\.]+)', str(value))
    if not match:
        return np.nan
    
    currency, amount_str = match.groups()
    
    # Convert to float and handle commas and periods
    # Example: "9990,00" -> 9990.00
    amount = float(amount_str.replace(',', '').replace('.', ''))
    
    # Handle different currencies, we use a fixed conversion rate for simplicity
    if currency == 'Euro':
        return amount * 1.1 
    elif currency == 'US$':
        return amount
    else:
        return np.nan
    
# Extract the numeric part of the Power column
def extract_hp(value):
    match = re.search(r'\d+', str(value))
    return int(match.group()) if match else None
  
# Extract the float part of the rating column
def extract_rating(text):
    if isinstance(text, str):
        match = re.search(r"([0-9]+(?:\.[0-9]+)?)", text)
        if match:
            return float(match.group(1))
    return np.nan


# Tassi di inflazione cumulativi rispetto al 2021 (fonte: dati ISTAT/BCE, valori indicativi)
inflation_factors = {
    2000: 1.37, 2001: 1.34, 2002: 1.31, 2003: 1.28, 2004: 1.25, 2005: 1.22,
    2006: 1.19, 2007: 1.16, 2008: 1.13, 2009: 1.12, 2010: 1.10, 2011: 1.08,
    2012: 1.06, 2013: 1.05, 2014: 1.04, 2015: 1.03, 2016: 1.03, 2017: 1.02,
    2018: 1.01, 2019: 1.01, 2020: 1.00, 2021: 1.00
}
# We adjust the prices for inflation using the dictionary above
def adjust_price_for_inflation(row):
    year = int(row["Year"])
    if year in inflation_factors:
        return row["Price"] * inflation_factors[year]
    else:
        return row["Price"]



def show_col(col):
    # Returns head(5) of a column, but ignores NaNs
    return df[df[col].notna()].sample(5)[col]



### Particular columns to analyze

In [5]:
# Useless columns, to drop

print("\nUSELESS COLUMNS\n\n")

display(show_col("Insurance costs"))

display(show_col("Ask questions"))

display(show_col("Rating"))

display(show_col("Comments"))

display(show_col("Modifications compared to previous model"))

df.drop(columns=["Insurance costs", "Ask questions", "Comments"], inplace=True)



# Interesting columns, perhaps to analyze
print("\nUSEFUL COLUMNS\n\n")

display(show_col("Greenhouse gases"))
# injection or carbuettor
display(show_col("Fuel system"))

display(show_col("Model"))



USELESS COLUMNS




23001    Compare US insurance quotes from the nation's ...
19572    Compare US insurance quotes from the nation's ...
6404     Compare US insurance quotes from the nation's ...
25221    Compare US insurance quotes from the nation's ...
13961    Compare US insurance quotes from the nation's ...
Name: Insurance costs, dtype: object

36069    Join the 60 AJS Model 16 350MS discussion grou...
3355     Join the 20 Triumph Tiger 800 XCX discussion g...
37648    Join the 40 Crocker V-Twin discussion group or...
12322    Join the 13 Ducati Monster 795 discussion grou...
35431    Join the 65 Allstate SR 250 discussion group o...
Name: Ask questions, dtype: object

33293     3.4  Check out the detailed rating of off-roa...
27934     3.5  Check out the detailed rating of racing ...
25789     3.9  See the detailed rating of touring capab...
23145     3.7  View the detailed rating of value for mo...
5135     Do you know this bike?Click here to rate it. W...
Name: Rating, dtype: object

5825                                     Traction control.
1138                                       Sold in Brazil.
5551                       Traction Control. Ride-by-wire.
6101     Traction Control, Slide Control, Wheel Lift Co...
12733    Sold in Japan only. Optional ABS brakes adds 4...
Name: Comments, dtype: object

38532    The 1913 motorsycle was home-built from a comp...
22885    The bike presents a new kick starter that help...
20780    Technical Modifications 2008:\r\nNew fork sett...
32062    differnces to the 500 model:\r\ncylinders; gas...
13457                  New, high-grip, durable tyre design
Name: Modifications compared to previous model, dtype: object


USEFUL COLUMNS




6816     127.6 CO2 g/km. (CO2 - Carbon dioxide emission) 
23033    116.0 CO2 g/km. (CO2 - Carbon dioxide emission) 
5179      97.4 CO2 g/km. (CO2 - Carbon dioxide emission) 
4849      44.3 CO2 g/km. (CO2 - Carbon dioxide emission) 
3882      99.8 CO2 g/km. (CO2 - Carbon dioxide emission) 
Name: Greenhouse gases, dtype: object

18383                       Carburettor. Dell´Orto PHBN 16
23861                                       Injection. EFI
18387                               Carburettor. Dell´Orto
6246     Injection. Bimota by Athena/Electronic Fuel In...
36818     Carburettor. SU MC2 carburettor with air cleaner
Name: Fuel system, dtype: object

19149                             Blata Origami B2 Victory
9756     Harley-Davidson Sportster Forty-Eight Dark Custom
1966                                         Italika AT110
18064                                     Kymco Yager 200i
2859                                     Raybar  Fuego 200
Name: Model, dtype: object

### Cleaning of dataset and inizializations

In [19]:

# Convert certain columns to numeric
df["Displacement_cc"]   = df["Displacement"].apply(to_float)
df["Torque_Nm"]         = df["Torque"].apply(lambda x: to_float(x, unit="lbft_to_Nm") or to_float(x))
df["Seat_height_mm"]    = df["Seat height"].apply(to_float)
df["Fuel_capacity_l"]   = df["Fuel capacity"].apply(to_float)
df["Year"]   = df["Year"].apply(to_float)

# Show only the first word of each model, this is the manufacturer
df["Manufacturer"] = df["Model"].apply(lambda x: re.sub(r"\s.*", "", x) if isinstance(x, str) else x)

df["Price"] = df["Price as new"].apply(parse_price)
df["HP"] = df["Power"].apply(extract_hp)

df["Rating"] = df["Rating"].apply(extract_rating)

# palette & template
TEMPLATE = "presentation"
pio.templates.default = TEMPLATE


display(df["Rating"].sample(10))



12641   NaN
5280    NaN
23683   NaN
22392   NaN
13778   NaN
17195   NaN
14359   NaN
18546   NaN
17575   NaN
26534   NaN
Name: Rating, dtype: float64

### Analysis of duplicates


In [7]:
emission_cols = [
    "Emission details",        # ← contiene "Euro 4", "Euro 5", ecc.
    "Greenhouse gases",        # ← valore numerico g/km
    "Fuel consumption"         # ← utile in confronto alle emissioni
]
performance_cols = [
    "Power",                   # può differire se depotenziata
    "Torque",                  # idem
    "Weight incl. oil, gas, etc", # peso a pieno carico
    "Dry weight",              # a secco
    "Power/weight ratio",      # calcolato ma utile per confronto
    "Top speed",               # a volte limitata
]
mechanical_cols = [
    "Engine type",
    "Displacement",
    "Fuel control", 
    "Clutch", 
    "Exhaust system"
]
important_cols = [
    "Manufacturer", "Model", "Category", "Year"
] + emission_cols + performance_cols + mechanical_cols


# Trova i duplicati solo basati su Model, Manufacturer, Category, Year
dups = df.duplicated(subset=["Manufacturer", "Model", "Category", "Year"], keep=False)

# Seleziona solo le righe duplicate
df_dups = df[dups]

# Conta il numero di versioni uniche in base a un sottoinsieme tecnico
df_unique_tech = df_dups.drop_duplicates(subset=important_cols)

print(f"Duplicati totali: {df_dups.shape[0]}")
print(f"Unici in base a info tecniche: {df_unique_tech.shape[0]}")

print(f"Righe originali: {df.shape[0]}")
df = df.drop_duplicates(subset=important_cols, keep="first")

print(f"Righe nuove: {df.shape[0]}")




Duplicati totali: 6063
Unici in base a info tecniche: 3010
Righe originali: 38624
Righe nuove: 35571


## Charts
 > Let's finnally see some charts

### Models by category
 > Let's see the most common type of motorcycle, leaving out those that appear less in the dataset


In [8]:
# Group by most used, leave the others in "Other"
df_grouped = df.groupby("Category").size().reset_index(name="count")


df_top_types = df_grouped[df_grouped["count"] >= 1500].sort_values(by="count", ascending=False)
other_types = df_grouped[df_grouped["count"] < 1500].sum()["count"]

# Create a new DataFrame with the top types and the "Other" category
df_grouped = pd.concat([df_top_types, pd.DataFrame([{'Category': 'Other', 'count': other_types}])], ignore_index=True)

# Concat the top types and the "Other" category
df_pie = pd.concat(
    [df_top_types, pd.DataFrame([{"Category": "Other", "count": other_types}])],
    ignore_index=True
)

# Sort the DataFrame by count in descending order
ordered_categories = list(df_top_types["Category"]) + ["Other"]
df_pie["Category"] = pd.Categorical(
    df_pie["Category"], categories=ordered_categories, ordered=True
)
gray_seq = ["gray"] + ["gray"] + ["gray"] + ["white"]
color_seq = px.colors.qualitative.Plotly[:6] + gray_seq  # bianco per Other


# Create the pie chart
fig1 = px.pie(
    df_pie,
    names="Category",
    values="count",
    title="<b>Models by Type of bike</b>",
    hole=0.35,
    color_discrete_sequence=color_seq,
    category_orders={"Category": ordered_categories},
)

# Rotate the pie chart, make it counterclockwise, don't atutomatically sort
fig1.update_traces(
    direction="counterclockwise",
    sort=False,
    rotation=-61
)


set_theme(fig1).show()

### How many models were created each year
 > Let's see for each category of bike, how many bikes were created

In [9]:
# Seleziona le 6 categorie più presenti
top6 = df_grouped.sort_values("count", ascending=False).head(7)["Category"].tolist()

# Filtra il dataframe originale per le sole 6 categorie
df_top6 = df[df["Category"].isin(top6)]

# Filtra le righe con Year >= 1980
df_top6 = df_top6[df_top6["Year"] >= 1980]
df_top6 = df_top6[df_top6["Year"] < 2021]

# Raggruppa per anno e categoria, conta le moto
year_cat_counts = (
    df_top6.groupby(["Year", "Category"])
    .size()
    .reset_index(name="count")
)

# Grafico a linee
fig = px.line(
    year_cat_counts,
    x="Year",
    y="count",
    color="Category",
    title="<b>Models by Year</b>",
    markers=True,
    color_discrete_sequence=color_seq,
    category_orders={"Category": ordered_categories},
)
set_theme(fig).show()

# 757 000 unità nel 1980 a 217 000 nel 1993
# dal 2020, le normative euro 5 diventano obbligatorie su ogni nuovo modello
#  Per non perdere omologazioni, i costruttori registrano a raffica versioni Euro 4 “final edition” 
#  Ma alcune omologano già anche Euro 5 nello stesso anno, avendo due versioni

dup_cols = ["Manufacturer","Model","Category","Year", "Greenhouse gases"]
dups = df.duplicated(subset=dup_cols, keep=False)


#### What happened in 1993 and '94? Most importantly what happened in 2020, is it something in our dataset? 
 > Thanks to the duplicates analysis we can confidently say that our dataset had an error, but still in 2020 there are lots of new models created

### Counts of bikes per cc
 > Let's figure out how many bikes were produced per cc category

In [10]:
bins   = [0, 125, 400, 700, 1000, 2000, np.inf]
labels = ["0–125 (A1)", "125–400 (A-lim)", "400–700 (A)", 
          "700–1000 (A)", "1000-2000 (A)", "2000+ (A)"]

# A new column with the categories in which every model falls
df["Disp_cat"] = pd.cut(df["Displacement_cc"], bins=bins, labels=labels, right=True)

# For each category, we calculate the count and the mode            
count_df  = (df["Disp_cat"]
             .value_counts()
             .reindex(labels)
             .rename("Count")
             .reset_index()
             .rename(columns={"Disp_cat": "Displacement Range (cc)"}))

# Calulates the mode for each category
mode_df = (
    df.groupby("Disp_cat", observed=False)["Displacement_cc"]
    .agg(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
    .reindex(labels)
    .reset_index()
    .rename(columns={
        "Disp_cat": "Displacement Range (cc)",
        "Displacement_cc": "mode_cc"
    })
)

# Merge the two DataFrames
disp_summary = count_df.merge(mode_df, on="Displacement Range (cc)")
# Display the mode in a more readable format
disp_summary["mode_text"] = "Mode: " + disp_summary["mode_cc"].round(0).astype(int).astype(str)


fig2 = px.bar(
    disp_summary,
    x="Displacement Range (cc)",
    y="Count",
    #This will show the mode on the bar
    text="mode_text",

    title="<b>Count of Models by CC</b>",
    color_discrete_sequence=["#987434"],
    height=650
)

fig2.update_xaxes(categoryorder="array", categoryarray=labels)

fig2.update_traces(
    texttemplate="%{text}", 
    textposition="inside",
    insidetextanchor="middle", 
    textfont=dict(color="white", size=20)
)



set_theme(fig2).show()


### Price related to a lot of stuff

In [11]:

# Keep only the columns we need for this scatter
plot_df = df.dropna(subset=["Year", "Displacement_cc", "HP", "Price"])

# There's an error in the data, we need to remove the outliers
plot_df = plot_df[plot_df["HP"] < 400]

plot_df = plot_df[plot_df["Year"] > 2000]
plot_df = plot_df[plot_df["Displacement_cc"] < 3000]


plot_df["Price_adj"] = plot_df.apply(adjust_price_for_inflation, axis=1)


fig3 = px.scatter(
    plot_df,
    x="Year",
    y="Displacement_cc",
    size="Price_adj",
    color="HP",
    hover_name="Model", # when hovering, show the model, really cute
    color_continuous_scale="YlOrRd",
    range_color=[0, 200],
    opacity=0.6,
    height=700,
    title="Motorcycle Prices by Displacement and Year",
    labels={
        "Year": "Year",
        "Displacement_cc": "Displacement (cc)",
        "Price": "Price (scaled size)",
        "HP": "Horsepower"
    }
)
# Remove the borders from every point
fig3.update_traces(marker=dict(line=dict(width=0)))

# In this for we add the horizontal lines, these are the limits for the A1, A-lim, A and A2 licenses

fig3.update_yaxes(
    tickvals=[125, 250, 600, 1000, 2000],
    title="Displacement (cc)"
)

# Asse x con tick fissi (opzionale)
fig3.update_xaxes(
    tickvals=[1980, 1990, 2000, 2010, 2015, 2020],
    title="Year"
)

fig3.update_layout(
    coloraxis_colorbar=dict(title="Horsepower (HP)"),
)
set_theme(fig3)
fig3.show()


# riportare i prezzi tenendo conto dell'inflazione

#scatter plot su CC e HP, 
# magari scartare il tempo, interessarsi meglio sul prezzo


### Green fuel, electric, how do they compare