In [9]:
# keplar cleaning script

import pandas as pd

# Read the Kepler dataset, skipping comment lines
df = pd.read_csv("Kepler Objects of Interest.csv", comment="#")

print(f"Original dataset size: {len(df)} rows")
print(f"koi_disposition value counts:")
print(df["koi_disposition"].value_counts())

# remove all candidate rows from koi_disposition
df_filtered = df[df["koi_disposition"] != "CANDIDATE"]

KEPLAR_COLUMNS_TO_DROP = [
    "ra",
    "koi_steff",
    "koi_steff_err1",
    "koi_steff_err2",
    "koi_slogg",
    "koi_slogg_err1",
    "koi_slogg_err2",
    "koi_srad_err1",
    "koi_kepmag",
    "koi_kepmag",
    "koi_tce_plnt_num",
    "koi_insol_err1",
    "koi_insol_err2",
    "koi_insol",
    "koi_teq",
    "koi_prad_err2",
    "koi_prad_err1",
    "koi_prad",
    "koi_depth_err1",
    "koi_duration_err2",
    "koi_duration_err1",
    "koi_impact_err2",
    "koi_impact_err1",
    "koi_time0bk_err2",
    "koi_time0bk_err1",
    "koi_time0bk",
    "koi_period_err1",
    "koi_period_err1",
]

# drop koi_disposition column
df_filtered = df_filtered.drop(
    columns=["koi_pdisposition", "koi_teq_err1", "koi_teq_err2"]
)
df_filtered = df_filtered.drop(
    columns=["kepler_name", "kepid", "kepoi_name", "kepler_name"]
)
df_filtered = df_filtered.drop(columns=["koi_tce_delivname"])
# if any of the columns in KEPLAR_COLUMNS_TO_DROP do not exist in df_filtered, ignore the error
df_filtered = df_filtered.drop(columns=KEPLAR_COLUMNS_TO_DROP, errors='ignore')

print(f"\nFiltered dataset size: {len(df_filtered)} rows")
print(f"Rows removed: {len(df) - len(df_filtered)}")

# Save to CSV
output_file = "data_without_candidates/Kepler Objects of Interest - Filtered.csv"
df_filtered.to_csv(output_file, index=False)
print(f"\nFiltered data saved to '{output_file}'")

Original dataset size: 9564 rows
koi_disposition value counts:
koi_disposition
FALSE POSITIVE    4839
CONFIRMED         2746
CANDIDATE         1979
Name: count, dtype: int64

Filtered dataset size: 7585 rows
Rows removed: 1979

Filtered data saved to 'data_without_candidates/Kepler Objects of Interest - Filtered.csv'


In [4]:
# tess cleaning script

import pandas as pd

# Read the TESS dataset, skipping comment lines
df = pd.read_csv('tess data.csv', comment='#')

print(f"Original dataset size: {len(df)} rows")
print(f"tfopwg_disp value counts:")
print(df['tfopwg_disp'].value_counts())


# ESS Follow-up Observing Program Working Group (TOPWG) Dispostion: APC=ambiguous planetary candidate CP=confirmed planet :O. FA=false alarm :O. FP=false positive KP=known planet :O. PC=planetary candidate :O. 08h08m42 77c -48/48m10 12c _4 406+0 

# in tfopwg_disp, remove all PC, ie remove all candidates
df_filtered = df[df['tfopwg_disp'] != 'PC']
df_filtered = df[df['tfopwg_disp'] != 'APC']

# drop tfopwg_disp column
df_filtered = df_filtered.drop(columns=['toi', 'tid', 'pl_insolerr1', 'pl_insolerr2', 'pl_insollim', 'pl_eqterr1', 'pl_eqterr2', 'pl_eqtlim'])
df_filtered = df_filtered.drop(columns=['rastr', 'decstr', 'toi_created', 'rowupdate'])

print(f"\nFiltered dataset size: {len(df_filtered)} rows")
print(f"Rows removed: {len(df) - len(df_filtered)}")

# Save to CSV
output_file = 'data_without_candidates/TESS Objects of Interest - Filtered.csv'
df_filtered.to_csv(output_file, index=False)
print(f"\nFiltered data saved to '{output_file}'")

Original dataset size: 7703 rows
tfopwg_disp value counts:
tfopwg_disp
PC     4679
FP     1197
CP      684
KP      583
APC     462
FA       98
Name: count, dtype: int64

Filtered dataset size: 7241 rows
Rows removed: 462

Filtered data saved to 'data_without_candidates/TESS Objects of Interest - Filtered.csv'


In [5]:
# k2 cleaning script

import pandas as pd

# Read the TESS dataset, skipping comment lines
df = pd.read_csv('k2 planets and candidates.csv', comment='#')

print(f"Original dataset size: {len(df)} rows")
print(f"disposition value counts:")
print(df['disposition'].value_counts())

# remove all candidate rows from disposition
df_filtered = df[df['disposition'] != 'CANDIDATE']
# drop disposition column
df_filtered = df_filtered.drop(columns=['pl_name', 'hostname', 'default_flag'])

print(f"\nFiltered dataset size: {len(df_filtered)} rows")
print(f"Rows removed: {len(df) - len(df_filtered)}")

# Save to CSV
output_file = 'data_without_candidates/K2 Planets and Candidates - Filtered.csv'
df_filtered.to_csv(output_file, index=False)
print(f"\nFiltered data saved to '{output_file}'")

Original dataset size: 4004 rows
disposition value counts:
disposition
CONFIRMED         2315
CANDIDATE         1374
FALSE POSITIVE     293
REFUTED             22
Name: count, dtype: int64

Filtered dataset size: 2630 rows
Rows removed: 1374

Filtered data saved to 'data_without_candidates/K2 Planets and Candidates - Filtered.csv'


In [6]:
# k2 candidates extraction

import pandas as pd

# Read the K2 dataset, skipping comment lines
df = pd.read_csv('k2 planets and candidates.csv', comment='#')

print(f"Original dataset size: {len(df)} rows")

# keep only candidate rows from disposition
df_candidates = df[df['disposition'] == 'CANDIDATE']
# drop disposition column
df_candidates = df_candidates.drop(columns=['pl_name', 'hostname', 'default_flag'])

print(f"Candidates dataset size: {len(df_candidates)} rows")

# Save to CSV
output_file = 'data_with_candidates/K2 Planets and Candidates - Candidates.csv'
df_candidates.to_csv(output_file, index=False)
print(f"\nCandidates data saved to '{output_file}'")

Original dataset size: 4004 rows
Candidates dataset size: 1374 rows

Candidates data saved to 'data_with_candidates/K2 Planets and Candidates - Candidates.csv'


In [7]:
# tess candidates extraction

import pandas as pd

# Read the TESS dataset, skipping comment lines
df = pd.read_csv('tess data.csv', comment='#')

print(f"Original dataset size: {len(df)} rows")

# in tfopwg_disp, keep only PC (planetary candidates) and APC (ambiguous planetary candidates)
df_candidates = df[df['tfopwg_disp'].isin(['PC', 'APC'])]

# drop tfopwg_disp column
df_candidates = df_candidates.drop(columns=['toi', 'tid', ])
df_candidates = df_candidates.drop(columns=['rastr', 'decstr', 'toi_created', 'rowupdate'])
print(f"Candidates dataset size: {len(df_candidates)} rows")

# Save to CSV
output_file = 'data_with_candidates/TESS Objects of Interest - Candidates.csv'
df_candidates.to_csv(output_file, index=False)
print(f"\nCandidates data saved to '{output_file}'")

Original dataset size: 7703 rows
Candidates dataset size: 5141 rows

Candidates data saved to 'data_with_candidates/TESS Objects of Interest - Candidates.csv'


In [8]:
# keplar candidates extraction

import pandas as pd

# Read the Kepler dataset, skipping comment lines
df = pd.read_csv('Kepler Objects of Interest.csv', comment='#')

print(f"Original dataset size: {len(df)} rows")

# keep only candidate rows from koi_disposition
df_candidates = df[df['koi_disposition'] == 'CANDIDATE']
# drop koi_disposition column
# drop koi_disposition column
df_candidates = df_candidates.drop(columns=['koi_pdisposition', 'koi_teq_err1', 'koi_teq_err2'])
df_candidates = df_candidates.drop(columns=['kepler_name', 'kepid', 'kepoi_name', 'kepler_name'])
df_candidates = df_candidates.drop(columns=['koi_tce_delivname'])

print(f"Candidates dataset size: {len(df_candidates)} rows")

# Save to CSV
output_file = 'data_with_candidates/Kepler Objects of Interest - Candidates.csv'
df_candidates.to_csv(output_file, index=False)
print(f"\nCandidates data saved to '{output_file}'")

Original dataset size: 9564 rows
Candidates dataset size: 1979 rows

Candidates data saved to 'data_with_candidates/Kepler Objects of Interest - Candidates.csv'
