# Analysis of the dataset of discovered exoplanets (2021) - Group D (Teggi, Verdolin)

Link to the dataset: https://www.kaggle.com/datasets/shivamb/all-exoplanets-dataset?resource=download

## 0. Data import, filtering and preparation for analysis

In [11]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go

# Read the raw exoplanet catalogue from disk
dataset_path = "data.csv"
exoplanet_data = pd.read_csv(dataset_path)

# Keep only the 15 columns (out of the original 23) used in this analysis
selected_columns = [
    # --- planet / system identification ---
    "Planet Name",                   # string: name of the planet
    "Planet Host",                   # string: name of the star system
    "Num Stars",                     # int: number of stars in that star system
    "Num Planets",                   # int: number of planets discovered in that star system
    # --- discovery information ---
    "Discovery Method",              # string: detection method used
    "Discovery Year",                # int: year of discovery
    "Discovery Facility",            # string: observatory
    # --- planet properties ---
    "Orbital Period Days",           # double: orbital period around the star [Earth days]
    "Orbit Semi-Major Axis",         # double: half of the longest diameter of the orbit [AU, 1 AU = 149 597 870 700 m]
    "Mass",                          # double: mass of the planet [Earth masses]
    # --- host star properties ---
    "Stellar Effective Temperature", # double: effective surface temperature of the star [Kelvin]
    "Stellar Radius",                # double: radius of the star [solar radii]
    "Stellar Mass",                  # double: mass of the star [solar masses]
    "Stellar Surface Gravity",       # double: surface gravity of the star [log g]
    "Distance"                       # double: distance from our system [parsec]
]
filtered_data = exoplanet_data.loc[:, selected_columns]

# Preview the first five rows
filtered_data.head()
    

Unnamed: 0,Planet Name,Planet Host,Num Stars,Num Planets,Discovery Method,Discovery Year,Discovery Facility,Orbital Period Days,Orbit Semi-Major Axis,Mass,Stellar Effective Temperature,Stellar Radius,Stellar Mass,Stellar Surface Gravity,Distance
0,11 Com b,11 Com,2,1,Radial Velocity,2007,Xinglong Station,326.03,1.29,6165.6,4742.0,19.0,2.7,2.31,93.1846
1,11 UMi b,11 UMi,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,516.21997,1.53,4684.8142,4213.0,29.79,2.78,1.93,125.321
2,14 And b,14 And,1,1,Radial Velocity,2008,Okayama Astrophysical Observatory,185.84,0.83,1525.5,4813.0,11.0,2.2,2.63,75.4392
3,14 Her b,14 Her,1,2,Radial Velocity,2002,W. M. Keck Observatory,1773.40002,2.93,1481.0878,5338.0,0.93,0.9,4.45,17.9323
4,16 Cyg B b,16 Cyg B,3,1,Radial Velocity,1996,Multiple Observatories,798.5,1.66,565.7374,5750.0,1.13,1.08,4.36,21.1397


In [12]:
# Sanity check: (rows, columns) of the column-filtered frame
filtered_data.shape

(4575, 15)

## 1. Basic analysis

### Number of Discovered Planets per Year

In [28]:
# Number of planets discovered in each year, in chronological order
planets_per_year = filtered_data['Discovery Year'].value_counts().sort_index()

# Bar chart with one bar per discovery year
discovery_year_histogram = px.bar(
    planets_per_year,
    x=planets_per_year.index,
    y=planets_per_year.values,
    labels={'x': 'Discovery Year', 'y': 'Number of Planets Discovered'},
    title='Number of Exoplanets Discovered Per Year',
    text_auto=True  # print each count on its bar
)

discovery_year_histogram.update_layout(
    yaxis=dict(
        type="log",  # yearly counts differ by orders of magnitude
        title="Number of Planets Discovered (Log Scale)"
    ),
    xaxis=dict(
        title="Discovery Year",
        # "array" mode is required for tickvals/ticktext to take effect; with
        # tickmode="linear" plotly places ticks from tick0/dtick and ignores both,
        # so the "label every 5 years" rule below would never be applied.
        tickmode="array",
        tickvals=list(planets_per_year.index),  # one tick for every year that has data
        ticktext=[
            str(int(year)) if year % 5 == 0 else ''  # only label years divisible by 5
            for year in planets_per_year.index
        ],
        ticks="outside",  # tick marks are not drawn unless a position is given
        ticklen=5,        # tick mark length
        tickwidth=1,      # tick mark thickness
        showline=True,    # draw the axis line
        showgrid=False    # no vertical grid lines
    )
)

# Render the chart
discovery_year_histogram.show()


### Discovery Methods used

In [14]:
from plotly.subplots import make_subplots

# Keep only the discovery methods that account for at least 3 planets
method_counts = filtered_data["Discovery Method"].value_counts()
filtered_methods = method_counts.loc[method_counts.ge(3)].index

# Restrict the planet table to those methods
filtered_discovery_data = filtered_data.loc[
    filtered_data["Discovery Method"].isin(filtered_methods)
]

# Recount after filtering, sorted from the least to the most used method
method_counts_filtered = filtered_discovery_data["Discovery Method"].value_counts().sort_values()

# Two side-by-side panels that share the same Y axis (the discovery method)
fig = make_subplots(
    rows=1,
    cols=2,
    shared_yaxes=True,
    horizontal_spacing=0.15,
    subplot_titles=("Number of Planets Discovered by Method", "Distance Distribution by Method")
)

# Left panel: horizontal bars with the number of planets per method
fig.add_trace(
    go.Bar(
        name="Number of Planets",
        orientation='h',
        x=method_counts_filtered.values,
        y=method_counts_filtered.index,
        text=method_counts_filtered.values,  # show the actual count on each bar
        textposition='auto'                  # let plotly decide where the text fits
    ),
    row=1,
    col=1
)

# Right panel: horizontal box plot of the distances, grouped by method
fig.add_trace(
    go.Box(
        name="Distance Distribution",
        orientation='h',
        x=filtered_discovery_data["Distance"],
        y=filtered_discovery_data["Discovery Method"],
        boxmean=True  # also mark the mean inside each box
    ),
    row=1,
    col=2
)

# Axis configuration: log-scaled counts on the left, distances on the right
fig.update_xaxes(type="log", title_text="Number of Planets (Log Scale)", row=1, col=1)
fig.update_xaxes(title_text="Distance [pc]", row=1, col=2)
fig.update_yaxes(title_text="Discovery Method", row=1, col=1)

# Figure-wide settings
fig.update_layout(
    title_text="Discovery Methods Analysis: Number of Planets and Distance Distribution",
    template="plotly_white",
    height=600,
    showlegend=False
)


fig.show()


### Discovery Methods and Distances

In [15]:
# Mean distance and number of available distance measurements per discovery method.
# "count" is used instead of "size": "size" also counts rows whose Distance is
# missing, while the mean ignores them, so a method could pass the minimum-entries
# filter with fewer real measurements than required (or with none at all).
average_distance_by_method = (
    filtered_data.groupby("Discovery Method")
    .agg(Average_Distance=("Distance", "mean"), Count=("Distance", "count"))
    .reset_index()
)

# Keep only the methods backed by at least 5 distance measurements
average_distance_by_method = average_distance_by_method[average_distance_by_method["Count"] >= 5]

# Sort by mean distance (closest first) and keep at most the first 10 methods
average_distance_by_method = (
    average_distance_by_method.sort_values(by="Average_Distance", ascending=True).head(10)
)

# Horizontal bar chart of the mean distance per method
fig = px.bar(
    average_distance_by_method,
    y="Discovery Method",
    x="Average_Distance",
    title="Average distance based on Discovery method (Min. 5 Entries)",
    labels={
        "Average_Distance": "Average Distance [pc]",
        "Discovery Method": "Discovery Method"
    }
)
# Render the chart
fig.show()

# TODO: this cell is flagged for removal


### Top 9 Discovery Facilities by Number of Planets Discovered

In [16]:
# Planets discovered by each facility, most productive first
facility_counts = filtered_data['Discovery Facility'].value_counts()

# Split the ranking: the ten most productive facilities vs. everything else
top_10_facilities = facility_counts.iloc[:10]
others_count = facility_counts.iloc[10:].sum()

# "Multiple Observatories" is not a single facility: fold it into "Others"
if "Multiple Observatories" in top_10_facilities.index:
    others_count += top_10_facilities["Multiple Observatories"]
    top_10_facilities = top_10_facilities.drop("Multiple Observatories")

# Ranking with the aggregated "Others" entry appended
top_10_facilities_with_others = pd.concat(
    [top_10_facilities, pd.Series([others_count], index=["Others"])]
)

# Sort the named facilities in ascending order, leaving "Others" aside ...
top_10_facilities_sorted = top_10_facilities_with_others.drop("Others").sort_values()

# ... and put "Others" back as the very first entry
top_10_facilities_sorted = pd.concat(
    [pd.Series([others_count], index=["Others"]), top_10_facilities_sorted]
)

# Horizontal bar chart of the ranking
facility_histogram = px.bar(
    top_10_facilities_sorted,
    x=top_10_facilities_sorted.values,
    y=top_10_facilities_sorted.index,
    title='Top Discovery Facilities by Number of Planets Discovered',
    labels={'x': 'Number of Planets Discovered', 'y': 'Discovery Facility'},
    text_auto=True  # print each count on its bar
)

# Axis titles and light theme
facility_histogram.update_layout(
    template="plotly_white",
    xaxis_title="Number of Planets Discovered",
    yaxis_title="Discovery Facility"
)

# Facilities treated as space-based; every other named facility is treated as ground-based
custom_space_facilities = ["Kepler", "K2", "Transiting Exoplanet Survey Satellite (TESS)"]

# One color per bar: gray for "Others", blue for space-based, red for the rest
bar_colors = [
    'gray' if facility == "Others"
    else ('#636EFA' if facility in custom_space_facilities else 'red')
    for facility in top_10_facilities_sorted.index
]
facility_histogram.update_traces(marker_color=bar_colors)

# Custom legend: empty marker traces that only contribute a legend entry each
facility_histogram.add_traces([
    go.Scatter(
        x=[None], y=[None],
        mode="markers",
        marker=dict(size=10, color=legend_color),
        name=legend_name
    )
    for legend_name, legend_color in (
        ("Space Observatories", '#636EFA'),
        ("Ground-based Observatories", 'red'),
    )
])

# Render the chart
facility_histogram.show()


### Number of Stars and Planets per System

In [17]:
# filtered_data has one row per PLANET, so counting "Num Stars" / "Num Planets"
# directly would count a system with k planets k times. Keep one row per host
# system first so that the frequencies really are "number of systems".
systems = filtered_data.drop_duplicates(subset="Planet Host")

# Number of systems for each star multiplicity
stars_per_system = systems['Num Stars'].value_counts().sort_index()

# Number of systems for each planet multiplicity
planets_per_system = systems['Num Planets'].value_counts().sort_index()

# Figure with two side-by-side bar charts
fig = sp.make_subplots(
    rows=1, cols=2,
    subplot_titles=['Number of Stars per System', 'Number of Planets per System']
)

# Left panel: distribution of the number of stars per system
fig.add_trace(
    go.Bar(
        x=stars_per_system.index,
        y=stars_per_system.values,
        name="Stars per System"
    ),
    row=1, col=1
)

# Right panel: distribution of the number of planets per system
fig.add_trace(
    go.Bar(
        x=planets_per_system.index,
        y=planets_per_system.values,
        name="Planets per System"
    ),
    row=1, col=2
)

# Titles for the figure and for both pairs of axes
fig.update_layout(
    title_text="Distribution of the Number of Stars and Planets per System",
    xaxis_title="Number of Stars",
    yaxis_title="Frequency (Log Scale)",
    xaxis2_title="Number of Planets",
    yaxis2=dict(title="Frequency (Log Scale)"),
    showlegend=False,  # the subplot titles already identify each panel
)

# Log-scaled Y axis on both panels
fig.update_yaxes(type="log", row=1, col=1)  # left panel
fig.update_yaxes(type="log", row=1, col=2)  # right panel

# Show a tick for every integer value on the X axis of the right panel
fig.update_xaxes(tickmode="linear", row=1, col=2)

# Render the chart
fig.show()


### Distances

## 2. Analysis about Discovery methods

In [18]:
# Keep only the discovery methods with at least 10 entries
method_counts = filtered_data["Discovery Method"].value_counts()
filtered_methods = method_counts.loc[method_counts.ge(10)].index
filtered_data_filtered = filtered_data.loc[
    filtered_data["Discovery Method"].isin(filtered_methods)
]


### Mass

In [19]:
# Planet mass against host-star mass, one color per discovery method.
# Both axes are logarithmic to cope with the spread of the data.
fig1 = px.scatter(
    data_frame=filtered_data_filtered,
    x="Stellar Mass",
    y="Mass",
    color="Discovery Method",
    log_x=True,
    log_y=True,
    title="Stellar Mass vs Planet Mass",
    template="plotly_white",
    labels={
        "Stellar Mass": "Stellar Mass (Solar Masses)",
        "Mass": "Planet Mass (Earth Mass)"
    }
)
fig1.show()


### Radius

In [20]:
# Orbit size against host-star radius, one color per discovery method.
# The orbit semi-major axis stands in for the planet radius, which is not
# among the selected columns.
fig2 = px.scatter(
    data_frame=filtered_data_filtered,
    x="Stellar Radius",
    y="Orbit Semi-Major Axis",
    color="Discovery Method",
    log_x=True,
    log_y=True,
    title="Stellar Radius vs Planet Orbit Semi-Major Axis",
    template="plotly_white",
    labels={
        "Stellar Radius": "Stellar Radius (Solar Radii)",
        "Orbit Semi-Major Axis": "Planet Orbit Semi-Major Axis (AU)"
    }
)
fig2.show()


### Relationship Between Stellar Surface Gravity and Stellar Mass by Discovery Method

In [21]:
# Stellar mass against stellar surface gravity, one color per discovery method.
# Both axes stay linear (the plotly express default).
fig3 = px.scatter(
    data_frame=filtered_data_filtered,
    x="Stellar Surface Gravity",
    y="Stellar Mass",
    color="Discovery Method",
    title="Stellar Surface Gravity vs Stellar Mass",
    template="plotly_white",
    labels={
        "Stellar Surface Gravity": "Stellar Surface Gravity (log g)",
        "Stellar Mass": "Stellar Mass (Solar Masses)"
    }
)
fig3.show()


### Most Used Discovery Method and Observatory per Year

In [22]:
# Number of planets discovered per (year, discovery method)
discovery_counts = (
    filtered_data.groupby(["Discovery Year", "Discovery Method"])["Planet Name"]
    .count()
    .reset_index(name="Count")
)

# For every year, keep the row of the method with the highest count
most_used_method_per_year = (
    discovery_counts.loc[discovery_counts.groupby("Discovery Year")["Count"].idxmax()]
)

# Number of planets discovered per (year, discovery facility)
observatory_counts = (
    filtered_data.groupby(["Discovery Year", "Discovery Facility"])["Planet Name"]
    .count()
    .reset_index(name="Count")
)

# For every year, keep the row of the facility with the highest count
most_used_observatory_per_year = (
    observatory_counts.loc[observatory_counts.groupby("Discovery Year")["Count"].idxmax()]
)

# Combine both results: one row per year with its top method and top facility
most_used_combined = pd.merge(
    most_used_method_per_year,
    most_used_observatory_per_year[["Discovery Year", "Discovery Facility"]],
    on="Discovery Year"
)

# Chronological order
most_used_combined = most_used_combined.sort_values(by="Discovery Year")

# Per-row bar label "<method> (<facility>)".
# It must be a column of the plotted frame: px.bar(color=...) creates one trace
# per discovery method, and update_traces(text=<whole Series>) would hand the
# same full list of labels to every trace, so bars would show other years' labels.
most_used_combined = most_used_combined.assign(
    Label=(
        most_used_combined["Discovery Method"].astype(str)
        + " ("
        + most_used_combined["Discovery Facility"].astype(str)
        + ")"
    )
)

# Bar chart: height = count of the top method, color = method, text = label
fig = px.bar(
    most_used_combined,
    x="Discovery Year",
    y="Count",
    color="Discovery Method",
    text="Label",
    labels={
        "Count": "Number of Planets Discovered",
        "Discovery Year": "Year",
        "Discovery Method": "Discovery Method",
    },
    title="Most Used Discovery Method and Observatory Per Year",
    template="plotly_white",
)

# Place the labels above the bars
fig.update_traces(textposition="outside")

# Log-scaled Y axis
fig.update_layout(
    yaxis=dict(
        type="log",
        title="Number of Planets Discovered (Log Scale)"
    )
)

# Render the chart
fig.show()


### Correlation between Stellar Temperature and discovery method

In [23]:
# Keep only the discovery methods with at least 10 entries
method_counts = filtered_data["Discovery Method"].value_counts()
filtered_methods = method_counts.loc[method_counts.ge(10)].index
filtered_data_filtered = filtered_data.loc[
    filtered_data["Discovery Method"].isin(filtered_methods)
]

# Horizontal box plot: one box per discovery method, temperature on the X axis
fig1 = px.box(
    data_frame=filtered_data_filtered,
    x="Stellar Effective Temperature",
    y="Discovery Method",
    title="Stellar Effective Temperature Distribution by Discovery Method (Filtered)",
    template="plotly_white",
    labels={
        "Stellar Effective Temperature": "Stellar Effective Temperature (K)",
        "Discovery Method": "Method"
    }
)

# No legend: the Y axis already names each method
fig1.update_layout(showlegend=False)

# Render the chart
fig1.show()


In [24]:
# Keep only the discovery methods with at least 10 discoveries overall
method_totals = filtered_data["Discovery Method"].value_counts()
valid_discovery_methods = method_totals.loc[method_totals.ge(10)].index
filtered_discovery_data = filtered_data.loc[
    filtered_data["Discovery Method"].isin(valid_discovery_methods)
]

# Number of planets discovered per (year, discovery method)
discovery_trend_filtered = (
    filtered_discovery_data
    .groupby(["Discovery Year", "Discovery Method"])["Planet Name"]
    .count()
    .reset_index(name="Count")
)

# One line per discovery method, showing its yearly number of discoveries
fig4 = px.line(
    data_frame=discovery_trend_filtered,
    x="Discovery Year",
    y="Count",
    color="Discovery Method",
    title="Trend of Discovery Methods Over Time (Filtered)",
    template="plotly_white",
    labels={
        "Discovery Year": "Year",
        "Count": "Number of Planets Discovered"
    }
)

# Log-scaled Y axis
fig4.update_yaxes(type="log", title_text="Number of Planets Discovered (Log Scale)")

# Render the chart
fig4.show()


In [25]:
# Orbital period of every planet against the surface gravity of its host star
fig = px.scatter(
    data_frame=filtered_data,
    x="Stellar Surface Gravity",
    y="Orbital Period Days",
    title="Orbital Period vs Stellar Surface Gravity",
    template="plotly_white",
    labels={
        "Stellar Surface Gravity": "Stellar Surface Gravity (log g)",
        "Orbital Period Days": "Orbital Period (Days)"
    }
)

# Log-scaled Y axis, with a title that says so
fig.update_yaxes(type="log", title_text="Orbital Period (Days, Log Scale)")

fig.show()


In [35]:
# Stellar surface gravity against stellar mass; the marker color encodes the stellar radius
fig = px.scatter(
    data_frame=filtered_data,
    x="Stellar Mass",
    y="Stellar Surface Gravity",
    color="Stellar Radius",
    color_continuous_scale="Jet",  # high-contrast color scale
    title="Stellar Surface Gravity vs Stellar Mass (with Stellar Radius)",
    template="plotly_white",
    labels={
        "Stellar Mass": "Stellar Mass (Solar Masses)",
        "Stellar Surface Gravity": "Stellar Surface Gravity (log g)",
        "Stellar Radius": "Stellar Radius (Solar Radii)"
    }
)

# Fixed viewing window: mass in [0, 5], log g in [0, 6]
fig.update_xaxes(range=[0, 5], title_text="Stellar Mass (Solar Masses)")
fig.update_yaxes(range=[0, 6], title_text="Stellar Surface Gravity (log g)")

# Pin the color range to the smallest and largest stellar radius in the data
fig.update_coloraxes(
    colorbar_title="Stellar Radius (Solar Radii)",
    cmin=filtered_data["Stellar Radius"].min(),
    cmax=filtered_data["Stellar Radius"].max()
)

# Render the chart
fig.show()


In [27]:
# TODO: continue the analysis

In [31]:
import pandas as pd

# Load the dataset (adjust the path if the CSV lives elsewhere)
df = pd.read_csv('data.csv')

# Year to inspect. The original comments and variable name said 1989 while the
# filter actually used 1994; the name and comments now follow the code.
target_year = 1994

# Planets whose discovery year equals the target year
planets_in_year = df[df['Discovery Year'] == target_year]

# Show the names of the planets discovered in that year
print(planets_in_year['Planet Name'])


4162    PSR B1257+12 b
Name: Planet Name, dtype: object
