# 🚀 Houston, we have liftoff! 🛰️

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Folder holding the input CSV files. Set the EXOPLANET_DATA_DIR environment variable to
# read from a different location; the fallback is the original author's local folder.
DATA_DIR = Path(os.environ.get(
    "EXOPLANET_DATA_DIR",
    r"C:\Users\datan\OneDrive\Desktop\Capstone\Exoplanets\data",
))
eu = pd.read_csv(DATA_DIR / "everything_exoplanet_eu.csv")

In [None]:
eu.head(4)

# Cleaning 🧹

In [None]:
eu.isnull().sum()

In [None]:
eu.columns

In [None]:
# Drop every column this analysis does not use. The result is bound back to `eu`
# instead of mutating the frame in place.
eu = eu.drop(
    columns=[
        'mass_error_min', 'mass_error_max',
        'mass_sini', 'mass_sini_error_min', 'mass_sini_error_max',
        'radius_error_min', 'radius_error_max',
        'orbital_period_error_min', 'orbital_period_error_max',
        'semi_major_axis_error_min', 'semi_major_axis_error_max',
        'eccentricity_error_min', 'eccentricity_error_max',
        'inclination_error_min', 'inclination_error_max',
        'angular_distance', 'updated',
        'omega', 'omega_error_min', 'omega_error_max',
        'tperi', 'tperi_error_min', 'tperi_error_max',
        'tconj', 'tconj_error_min', 'tconj_error_max',
        'tzero_tr', 'tzero_tr_error_min', 'tzero_tr_error_max',
        'tzero_tr_sec', 'tzero_tr_sec_error_min', 'tzero_tr_sec_error_max',
        'lambda_angle', 'lambda_angle_error_min', 'lambda_angle_error_max',
        'impact_parameter', 'impact_parameter_error_min', 'impact_parameter_error_max',
        'tzero_vr', 'tzero_vr_error_min', 'tzero_vr_error_max',
        'k', 'k_error_min', 'k_error_max',
        'temp_calculated', 'temp_calculated_error_min', 'temp_calculated_error_max',
        'temp_measured', 'hot_point_lon',
        'geometric_albedo', 'geometric_albedo_error_min', 'geometric_albedo_error_max',
        'log_g', 'publication',
        'mass_measurement_type', 'radius_measurement_type',
        'alternate_names', 'molecules',
        'star_name', 'ra', 'dec',
        'mag_v', 'mag_i', 'mag_j', 'mag_h', 'mag_k',
        'star_distance_error_min', 'star_distance_error_max',
        'star_metallicity', 'star_metallicity_error_min', 'star_metallicity_error_max',
        'star_mass_error_min', 'star_mass_error_max',
        'star_radius_error_min', 'star_radius_error_max',
        'star_age', 'star_age_error_min', 'star_age_error_max',
        'star_teff', 'star_teff_error_min', 'star_teff_error_max',
        'star_detected_disc', 'star_magnetic_field', 'star_alternate_names',
    ]
)

In [None]:
eu.info()

In [None]:
eu.isnull().sum()

In [None]:
# Fill every missing value in every column with 0.
# NOTE(review): this covers text columns and physical measurements alike, so a 0 in
# any column after this point may be a placeholder rather than a real value.
eu = eu.fillna(0)

In [None]:
eu.isnull().sum()

In [None]:
# cleaning the star_sp_type column

In [None]:
eu['star_sp_type'].value_counts()

In [None]:
# Missing spectral types were already replaced by the numeric placeholder 0 when all nulls
# were filled, so a plain fillna('') finds nothing and astype(str) would turn the
# placeholder into the literal text '0'. Keep only genuine strings and blank out
# everything else (0 or NaN) so those rows end up labelled 'Unknown'.
eu['star_sp_type'] = (
    eu['star_sp_type']
    .where(eu['star_sp_type'].map(lambda value: isinstance(value, str)), '')
    .astype(str)
)

# defining the function to extract the first letter of the star type
def simplify_star_type(star_sp_type):
    """Return the first character of a spectral-type string, or 'Unknown'.

    Parameters
    ----------
    star_sp_type : str or any
        Spectral type text such as 'G2 V'. Surrounding whitespace is ignored.

    Returns
    -------
    str
        The first non-blank character of the input. 'Unknown' is returned when
        the input is empty, whitespace only, or not a string at all (for example
        None, NaN, or a numeric placeholder).
    """
    if not isinstance(star_sp_type, str):
        return 'Unknown'
    trimmed = star_sp_type.strip()
    return trimmed[0] if trimmed else 'Unknown'

# One-letter star class derived from the spectral-type text
eu['star_type'] = eu['star_sp_type'].map(simplify_star_type)

In [None]:
eu['star_type'].value_counts()

In [None]:
desired_star_types = ['O', 'B', 'A', 'F', 'G', 'K', 'M'] # Morgan-Keenan system simplified

# .copy() makes the filtered frame independent of the frame it was sliced from, so
# adding columns to it later does not trigger pandas' SettingWithCopyWarning.
eu = eu[eu['star_type'].isin(desired_star_types)].copy()

In [None]:
eu['star_type'].value_counts()

In [None]:
eu.info()

In [None]:
# Fixing the detection_type column to focus on the most effective ones
main_types = ['Radial Velocity', 'Microlensing', 'Primary Transit', 'Imaging', 'Astrometry']

# making the function to keep only the main detection types and remove everything after the first comma
def filter_main_types(detection_type):
    """Map a raw detection-type value onto one of `main_types`, else 'Other'.

    Only the text before the first comma is considered, so a value listing several
    methods is classified by the first one listed. Input that has already been cut
    at the first comma gives the same result as before.

    Parameters
    ----------
    detection_type : str or any
        Raw detection-type text, possibly a comma-separated list.

    Returns
    -------
    str
        The first entry of `main_types` contained in the first listed method, or
        'Other' when none matches or the input is not a string (for example a
        numeric placeholder for a missing value).
    """
    if not isinstance(detection_type, str):
        return 'Other'
    first_listed = detection_type.split(',')[0].strip()
    for main_type in main_types:
        if main_type in first_listed:
            return main_type
    return 'Other'

# Step 1: keep the text before the first comma. Step 2: collapse it onto the main types.
eu['detection_type_filtered'] = eu['detection_type'].map(lambda raw: raw.split(',')[0].strip())
eu['detection_type_final'] = eu['detection_type_filtered'].map(filter_main_types)

# Side-by-side view of the raw value and the category it was mapped to
print(eu[['detection_type', 'detection_type_final']])

In [None]:
eu['detection_type_final'].value_counts() # need to plot everything but 'other'

In [None]:
# removing rows with 'Other' in the detection_type column
# .copy() detaches the result from the frame it was sliced from, so later in-place
# drops and column assignments do not trigger SettingWithCopyWarning.
eu = eu[eu['detection_type_final'] != 'Other'].copy()

In [None]:
eu['detection_type_final'].value_counts() # need to plot everything but 'other'

In [None]:
# removing any unnecessary columns
# Reassigning the result (instead of inplace=True) avoids mutating a frame that was
# produced by boolean filtering, which pandas flags with SettingWithCopyWarning.
eu = eu.drop(columns=['star_sp_type', 'detection_type', 'detection_type_filtered'])

In [None]:
# fixing my year column

In [None]:
eu['discovered'].value_counts().head()

In [None]:
# Store the discovery year as a whole number
eu['discovered'] = eu['discovered'].astype('int64')

In [None]:
eu['discovered'].dtype

In [None]:
eu['discovered'].value_counts()

In [None]:
# Keep discoveries from the year 2000 onwards, leaving out 2025.
# The placeholder year 0 is already excluded by the 2000 cut-off.
eu = eu[eu['discovered'].ge(2000) & eu['discovered'].ne(2025)]

In [None]:
eu['discovered'].value_counts()

In [None]:
# Give the columns clearer names (old name -> new name)
eu = eu.rename(
    columns={
        'name': 'planet_name',
        'mass': 'planet_mass',
        'radius': 'planet_radius',
        'detection_type_final': 'detection_type',
        'discovered': 'year',
    }
)

In [None]:
eu.head(4) # now I have something to work with! woo hoo!

In [None]:
# Saving the DataFrame to a CSV file to use in PowerBI
# eu.to_csv('pandasClean_exoplanetEU.csv', index=False)

# Adding Columns 🏛️

In [None]:
# Classifying exoplanets into categories based on their characteristics, such as: terrestrial planets, gas giants, and super-earths

In [None]:
# Missing masses were filled with 0 during cleaning, so a mass of 0 means "not measured".
# Requiring mass > 0 for the lightest class lets those rows fall through to the
# 'Unknown' default instead of being counted as 'Terrestrial'.
# NOTE(review): the thresholds are presumably in Jupiter masses — confirm the catalogue's unit.
conditions = [
    (eu['planet_mass'] > 0) & (eu['planet_mass'] < 0.1),
    (eu['planet_mass'] >= 0.1) & (eu['planet_mass'] < 0.5),
    (eu['planet_mass'] >= 0.5)
]
choices = ['Terrestrial', 'Super-Earth', 'Gas Giant']
eu['planet_type'] = np.select(conditions, choices, default='Unknown')

In [None]:
eu.head(4)

In [None]:
# This categorizes exoplanets into 'Low Inclination', 'Medium Inclination', and 'High Inclination' based on their orbital inclinations.

In [None]:
# Missing inclinations were filled with 0 during cleaning, so they cannot be told apart
# from a measured 0. Treat 0 as "not measured" and let it fall through to 'Unknown';
# otherwise every planet without an inclination is counted as 'Low Inclination'.
# NOTE(review): a genuinely measured inclination of exactly 0 also becomes 'Unknown'.
conditions_inclination = [
    (eu['inclination'] > 0) & (eu['inclination'] < 10),
    (eu['inclination'] >= 10) & (eu['inclination'] < 30),
    (eu['inclination'] >= 30)
]
choices_inclination = ['Low Inclination', 'Medium Inclination', 'High Inclination']
eu['inclination_category'] = np.select(conditions_inclination, choices_inclination, default='Unknown')

In [None]:
# Proportion of exoplanets in each inclination category

In [None]:
eu.head(4)

In [None]:
# Share of planets in each inclination category, expressed as a percentage
inclination_distribution = eu['inclination_category'].value_counts(normalize=True).mul(100)
print(inclination_distribution)

In [None]:
# Now attempting to calculate the habitable zone for each star

In [None]:
def habitable_zone(star_luminosity, inner_coeff=0.75, outer_coeff=1.75):
    """Estimate the inner and outer habitable-zone bounds for a star.

    Each bound is a coefficient multiplied by the square root of the luminosity.

    Parameters
    ----------
    star_luminosity : float or array-like
        Stellar luminosity. Anything that supports ``**`` and ``*`` works, so a
        whole pandas Series or NumPy array can be passed at once.
        NOTE(review): presumably in solar luminosities, giving bounds in AU — confirm.
    inner_coeff : float, default 0.75
        Multiplier for the inner bound.
    outer_coeff : float, default 1.75
        Multiplier for the outer bound.

    Returns
    -------
    tuple
        ``(inner_bound, outer_bound)``.
    """
    root_luminosity = star_luminosity ** 0.5
    return inner_coeff * root_luminosity, outer_coeff * root_luminosity

# this is assuming star_luminosity is related to star_mass... I will double check this if I use this.
eu['star_luminosity'] = eu['star_mass'] ** 3.5
# habitable_zone only uses ** and *, so it can take the whole column at once. This also
# works on an empty frame, where unpacking zip(*...) would fail.
eu['hz_inner'], eu['hz_outer'] = habitable_zone(eu['star_luminosity'])

# exoplanets in the habitable zone
# Missing values were filled with 0 during cleaning. A star mass of 0 gives
# hz_inner = hz_outer = 0, so a planet whose semi_major_axis is also 0 would pass the
# range test. Both measurements must therefore be positive before a planet is flagged.
eu['in_habitable_zone'] = (
    (eu['star_mass'] > 0)
    & (eu['semi_major_axis'] > 0)
    & (eu['semi_major_axis'] >= eu['hz_inner'])
    & (eu['semi_major_axis'] <= eu['hz_outer'])
)
habitable_exoplanets = eu[eu['in_habitable_zone']]
print(habitable_exoplanets[['planet_name', 'semi_major_axis', 'star_type']])

In [None]:
eu.head(4)

In [None]:
# Saving the DataFrame to a CSV file to use in PowerBI
# eu.to_csv('pandasClean_exoplanetEU_02.csv', index=False)

# Outer Worlds: Analyzing the Discovery and Characteristics of Exoplanets

Working the Data Questions 👨🏻‍🚀 

# #1. What are some of the most common detection methods and their effectiveness?

In [None]:
# Planets per detection method, plus the overall total
detection_counts = eu['detection_type'].value_counts()
total_detections = detection_counts.sum()

print(detection_counts)
print('')
print('Total number of detected planets')
print(total_detections)

# Horizontal bars: one per detection method
detection_counts.plot.barh()
plt.title('Common Detection Methods')
plt.xlabel('Number of Exoplanets')
plt.ylabel('');

How have the methods for detecting exoplanets evolved over time?

In [None]:
# Rows are years, columns are detection methods, values are planets found (0 where none)
detection_method_trend = (
    eu.groupby(['year', 'detection_type'])
    .size()
    .unstack()
    .fillna(0)
)

# One line per detection method
detection_method_trend.plot.line(stacked=False)
plt.title('Detection Methods Over Time')
plt.xlabel('Year')
plt.ylabel('')
plt.legend(title='Detection Type');

This scatter plot provides a comprehensive view of the relationships between planet mass, star distance, detection method, and planet radius. It allows us to explore how detection methods vary with exoplanet characteristics and the environments in which they are found.

In [None]:
palette = sns.color_palette(n_colors=5)

# Planet mass against star distance, coloured by detection method and sized by planet radius
g = sns.relplot(
    data=eu,
    x="planet_mass", y="star_distance",
    hue="detection_type", size="planet_radius",
    sizes=(10, 200), palette=palette,
    height=6, aspect=1.5,
)

g.fig.suptitle("Exoplanet Mass and Distance: Detection Method and Radius Insights", fontsize=16, fontweight='bold')
g.set_axis_labels("Planet Mass", "Star Distance")

# Log-log axes with faint minor grid lines and no left/bottom spines
g.set(xscale="log", yscale="log")
g.ax.xaxis.grid(True, "minor", linewidth=.25)
g.ax.yaxis.grid(True, "minor", linewidth=.25)
g.despine(left=True, bottom=True);

Success of Detection Methods

In [None]:
# Grouping by detection method and calculate average characteristics
# Missing masses and radii were filled with 0 during cleaning. Blank those placeholders
# out (where() turns non-positive values into NaN) so mean() skips them instead of
# averaging zeros in.
detection_method_stats = (
    eu[['planet_mass', 'planet_radius']]
    .where(eu[['planet_mass', 'planet_radius']] > 0)
    .groupby(eu['detection_type'])
    .mean()
)
print(detection_method_stats)

# #2. What are the most common types of exoplanets and some key characteristics discovered so far?

In [None]:
# Planets per planet type, plus the overall total. The counts get their own name so the
# detection-method counts held in detection_counts are not overwritten.
planet_type_counts = eu['planet_type'].value_counts()
print(planet_type_counts)
print('')
print('Total number of detected planets')
print(planet_type_counts.sum())

# Horizontal bars: the x axis carries the counts and the y axis the planet types
planet_type_counts.plot(kind='barh')
plt.xlabel('Number of Exoplanets Discovered')
plt.ylabel('Planet Type')
plt.title('Common Planet Types');

In [None]:
# Restrict to discoveries made from 2014 through 2024 (both ends included)
eu_filtered = eu[eu['year'].between(2014, 2024)]

# Rows are years, columns are planet types, values are planets found (0 where none)
planet_discovery_counts = (
    eu_filtered.groupby(['year', 'planet_type'])
    .size()
    .unstack()
    .fillna(0)
)

# Grouped (not stacked) bars, one cluster per year
planet_discovery_counts.plot.bar(stacked=False, figsize=(12, 8))
plt.title('Number of Planets Discovered Each Year by Planet Type (2014-2024)')
plt.xlabel('')
plt.ylabel('')
plt.xticks(rotation = 0, fontsize = 12)
plt.legend(title='Planet Type');

In [None]:
# Line plot of average exoplanet mass by discovery year
# Missing masses were filled with 0 during cleaning; leave those rows out so the yearly
# mean reflects measured masses only.
avg_mass_by_year = eu[eu['planet_mass'] > 0].groupby('year')['planet_mass'].mean()
plt.plot(avg_mass_by_year.index, avg_mass_by_year.values)
plt.yscale('log')
plt.xlabel('')
plt.ylabel('Average Exoplanet Mass')
plt.title('Average Exoplanet Mass by Discovery Year');

What is the range of orbital periods and semi-major axes of exoplanets?

In [None]:
# Orbital period plotted against semi-major axis, with both axes on a log scale
plt.scatter(x=eu['semi_major_axis'], y=eu['orbital_period'])
plt.title('Orbital Period vs. Semi-Major Axis')
plt.xlabel('Semi-Major Axis (AU)')
plt.ylabel('Orbital Period (days)')
plt.xscale('log')
plt.yscale('log');

In [None]:
# Missing values were filled with 0 during cleaning. Summarise only positive (measured)
# values so the placeholder does not distort the minimum, mean and quartiles.
orbital_period_summary = eu.loc[eu['orbital_period'] > 0, 'orbital_period'].describe()
semi_major_axis_summary = eu.loc[eu['semi_major_axis'] > 0, 'semi_major_axis'].describe()

print('Orbital Period Summary:')
print(orbital_period_summary)
print('\nSemi-Major Axis Summary:')
print(semi_major_axis_summary)

# #3. What are the most common types of stars hosting exoplanets?

In [None]:
# Planets per host-star class; the same tally is printed and then plotted
star_type_counts = eu['star_type'].value_counts()
print(star_type_counts)

star_type_counts.plot.bar()
plt.title('Star Types Hosting Exoplanets')
plt.xlabel('Star Type')
plt.ylabel('Number of Exoplanets Discovered')
plt.xticks(rotation = 0, fontsize = 12);

In [None]:
# Rows are years, columns are host-star classes, values are planets found (0 where none)
type_discoveries = eu.groupby(['year', 'star_type']).size().unstack().fillna(0)
type_discoveries.plot(kind='barh', stacked=True)
# Each counted row is a planet, so the axis shows exoplanets rather than stars
plt.xlabel('Number of Exoplanets Discovered')
plt.ylabel('')
plt.title('Discovery of Exoplanet Star Types per Year')
plt.legend(title='Star Type');

In [None]:
# Summary statistics for star characteristics
# Missing star masses and radii were filled with 0 during cleaning, and star_luminosity
# is star_mass ** 3.5, so it is 0 wherever the mass was missing. Summarise positive
# values only so the placeholder does not distort the statistics.
star_mass_summary = eu.loc[eu['star_mass'] > 0, 'star_mass'].describe()
star_radius_summary = eu.loc[eu['star_radius'] > 0, 'star_radius'].describe()
star_luminosity_summary = eu.loc[eu['star_luminosity'] > 0, 'star_luminosity'].describe()

print('Star Mass Summary:')
print(star_mass_summary)
print('\nStar Radius Summary:')
print(star_radius_summary)
print('\nStar Luminosity Summary:')
print(star_luminosity_summary)

Scatterplotting to give a comprehensive view of the relationships between star mass, distance, detection method, and star radius.

In [None]:
palette = sns.color_palette(n_colors=5)

# Star mass against star distance, coloured by detection method and sized by star radius
g = sns.relplot(
    data=eu,
    x="star_mass", y="star_distance",
    hue="detection_type", size="star_radius",
    sizes=(10, 200), palette=palette,
    height=6, aspect=1.5,
)

g.fig.suptitle("Distribution of Exoplanets: Star Mass vs. Star Distance", fontsize=16, fontweight='bold')
g.set_axis_labels("Star Mass", "Star Distance")

# Log-log axes with faint minor grid lines and no left/bottom spines
g.set(xscale="log", yscale="log")
g.ax.xaxis.grid(True, "minor", linewidth=.25)
g.ax.yaxis.grid(True, "minor", linewidth=.25)
g.despine(left=True, bottom=True);

# #4. What factors determine Habitability?

Habitable Zone Workings 🌎

In [None]:
# How many planets fall inside vs. outside the estimated habitable zone
in_habitable_zone_counts = eu['in_habitable_zone'].value_counts()
print(in_habitable_zone_counts)

# Plot the tally computed above. Recomputing it under the name star_type_counts would
# overwrite the star-type counts with unrelated data.
in_habitable_zone_counts.plot(kind='bar')
plt.xticks(rotation = 0, fontsize = 12)
plt.xlabel('In Habitable Zone')
plt.ylabel('')
plt.title('Exoplanets Discovered in H-Zone');

In [None]:
# Rows are years, columns are the in_habitable_zone flag, values are planets found (0 where none)
type_discoveries = (
    eu.groupby(['year', 'in_habitable_zone'])
    .size()
    .unstack()
    .fillna(0)
)

# Stacked horizontal bars, one per year
type_discoveries.plot.barh(stacked=True)
plt.title('Number of Discovered Exoplanets in the H-Zone')
plt.xlabel('')
plt.ylabel('')
plt.legend(title='In Habitable Zone');

Bringing in Our Solar System 👽

In [None]:
# Folder holding the solar-system CSV. Set the EXOPLANET_DATA_DIR environment variable
# to read from a different location; the fallback is the original author's local folder.
sol_data_dir = Path(os.environ.get(
    "EXOPLANET_DATA_DIR",
    r"C:\Users\datan\OneDrive\Desktop\Capstone\Exoplanets\data",
))
sol = pd.read_csv(sol_data_dir / "solar_system_nasa_scrape_github.csv")

In [None]:
sol.head(9)

In [None]:
sol.columns