# **Exoplanet Data Cleaning**

In [19]:
# Importing
import numpy as np
import pandas as pd

In [20]:
exoplanets = pd.read_csv("Updated ExoplanetData.csv")

In [21]:
exoplanets.columns

Index(['planet_name', 'host_star_name', 'num_of_stars', 'num_of_planets',
       'orbital_period_day', 'orbit_semi_major_axis_au',
       'planet_rad_earth_rad', 'planet_rad_jup_rad', 'planet_mass_earth_mass',
       'planet_mass_jup_mass', 'Eccentricity', 'planet_equil_temp_k',
       'spectral_type', 'stellar_effective_temp_k', 'stellar_rad_solar_rad',
       'stellar_mass_solar_mass', 'distance_parsec', 'stellar_metallicity',
       'stellar_metallicity_ratio', 'stellar_surf_grav', 'declination_deg',
       'discovery_method', 'discovery_year', 'last_update_date',
       'ref_pub_date', 'solution_type', 'discovery_facility'],
      dtype='object')

In [22]:
subset = exoplanets.iloc[:5, :]
subset

Unnamed: 0,planet_name,host_star_name,num_of_stars,num_of_planets,orbital_period_day,orbit_semi_major_axis_au,planet_rad_earth_rad,planet_rad_jup_rad,planet_mass_earth_mass,planet_mass_jup_mass,...,stellar_metallicity,stellar_metallicity_ratio,stellar_surf_grav,declination_deg,discovery_method,discovery_year,last_update_date,ref_pub_date,solution_type,discovery_facility
0,11 Com b,11 Com,2,1,,1.21,,,5434.7,17.1,...,,,,17.793252,Radial Velocity,2007,7/23/2014,2011-08,Published Confirmed,Xinglong Station
1,11 Com b,11 Com,2,1,326.03,1.29,,,6165.6,19.4,...,-0.35,[Fe/H],2.31,17.793252,Radial Velocity,2007,5/14/2014,2008-01,Published Confirmed,Xinglong Station
2,11 Com b,11 Com,2,1,323.21,1.178,,,4914.89849,15.464,...,-0.26,[Fe/H],2.45,17.793252,Radial Velocity,2007,9/19/2023,2023-08,Published Confirmed,Xinglong Station
3,11 UMi b,11 UMi,1,1,516.21997,1.53,,,4684.8142,14.74,...,-0.02,[Fe/H],1.93,71.823943,Radial Velocity,2009,9/4/2018,2017-03,Published Confirmed,Thueringer Landessternwarte Tautenburg
4,11 UMi b,11 UMi,1,1,,1.51,,,3432.4,10.8,...,,,,71.823943,Radial Velocity,2009,4/25/2018,2011-08,Published Confirmed,Thueringer Landessternwarte Tautenburg


In [23]:
exoplanets.shape

(35964, 27)

In [24]:
exoplanets.describe

<bound method NDFrame.describe of       planet_name host_star_name  num_of_stars  num_of_planets  \
0        11 Com b         11 Com             2               1   
1        11 Com b         11 Com             2               1   
2        11 Com b         11 Com             2               1   
3        11 UMi b         11 UMi             1               1   
4        11 UMi b         11 UMi             1               1   
...           ...            ...           ...             ...   
35959   ups And d        ups And             2               3   
35960   ups Leo b        ups Leo             1               1   
35961    xi Aql b         xi Aql             1               1   
35962    xi Aql b         xi Aql             1               1   
35963    xi Aql b         xi Aql             1               1   

       orbital_period_day  orbit_semi_major_axis_au  planet_rad_earth_rad  \
0                     NaN                     1.210                   NaN   
1               326

In [53]:
exoplanets_copy = exoplanets.copy()

def flag_outliers_zscore(series, threshold=3):
    mean = series.mean()
    std_dev = series.std()
    
    # Check if standard deviation is lower than the threshold
    if std_dev <= 0.5:
        return pd.Series([False] * len(series), index=series.index)
    
    z_scores = (series - mean) / std_dev
    return abs(z_scores) > threshold

# Function to flag outliers for a specific column using z-score method within each group
def flag_outliers_for_column_zscore(df, column_name, threshold=3):
    # Group the DataFrame by the "planet_name" column
    grouped = df.groupby("planet_name")
    
    # Apply the outlier detection function to each group
    for group_name, group_data in grouped:
        outlier_flags = flag_outliers_zscore(group_data[column_name], threshold)
        # Update the corresponding rows in the original DataFrame with outlier flags
        df.loc[group_data.index, column_name + '_outlier'] = outlier_flags

# Get numerical columns in the DataFrame
numerical_columns = exoplanets_copy.select_dtypes(include=[np.number]).columns

# Apply the outlier detection using z-score method within each group for each numerical column
for column_name in numerical_columns:
    flag_outliers_for_column_zscore(exoplanets_copy, column_name)

# Display the DataFrame with outlier flags
print(exoplanets_copy)


      planet_name host_star_name  num_of_stars  num_of_planets  \
0        11 Com b         11 Com             2               1   
1        11 Com b         11 Com             2               1   
2        11 Com b         11 Com             2               1   
3        11 UMi b         11 UMi             1               1   
4        11 UMi b         11 UMi             1               1   
...           ...            ...           ...             ...   
35959   ups And d        ups And             2               3   
35960   ups Leo b        ups Leo             1               1   
35961    xi Aql b         xi Aql             1               1   
35962    xi Aql b         xi Aql             1               1   
35963    xi Aql b         xi Aql             1               1   

       orbital_period_day  orbit_semi_major_axis_au  planet_rad_earth_rad  \
0                     NaN                     1.210                   NaN   
1               326.03000                     1.290  

In [54]:
exoplanets_copy.to_csv('cleaned_exoplanets_z_score.csv', index=False)