In [19]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Lift the column limit so wide frames are shown without column truncation.
pd.options.display.max_columns = None

In [20]:
# Read the raw CSV export into `df`; later cells build on this frame.
file_path = '../data/raw/sbdb_query_results.csv'
df = pd.read_csv(file_path)

# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own: wrapping it in print() appends a stray "None".
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39852 entries, 0 to 39851
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pha     39722 non-null  object 
 1   H       39849 non-null  float64
 2   e       39852 non-null  float64
 3   a       39852 non-null  float64
 4   q       39852 non-null  float64
 5   i       39852 non-null  float64
 6   moid    39722 non-null  float64
dtypes: float64(6), object(1)
memory usage: 2.1+ MB
None
  pha      H       e      a      q      i    moid
0   N  10.38  0.2228  1.458  1.133  10.83  0.1480
1   N  15.59  0.5466  2.637  1.195  11.57  0.2010
2   N  13.81  0.5712  2.474  1.061   9.40  0.0797
3   N   9.18  0.5332  2.665  1.244  26.68  0.3430
4   N  17.37  0.4346  1.920  1.085  11.87  0.1080


In [21]:
# Mapping from the raw 'pha' flag to its integer encoding.
PHA_CODES = {'Y': 1, 'N': 0}


def encode_pha(labels: pd.Series) -> pd.Series:
    """Encode a 'pha' column as int64: 'Y' -> 1, 'N' -> 0, missing -> 0.

    The function is idempotent: a column that already holds only 0/1 is
    returned unchanged (as int64), so re-running the cell does not wipe the
    column the way a second plain ``.map({'Y': 1, 'N': 0})`` would.

    Parameters
    ----------
    labels : pd.Series
        Raw 'Y'/'N' flags (possibly with NaN), or already-encoded 0/1 values.

    Returns
    -------
    pd.Series
        int64 Series with the same index as ``labels``.

    Raises
    ------
    ValueError
        If a value other than 'Y', 'N' or NaN is present. ``Series.map``
        would otherwise turn such values into NaN without any warning.
    """
    # Already encoded (e.g. the cell was executed twice): nothing left to do.
    if labels.isin(list(PHA_CODES.values())).all():
        return labels.astype('int64')

    # NOTE(review): missing labels are treated as 'N' (not hazardous), i.e. a
    # label is imputed. The raw info() output reports the same non-null count
    # for 'pha' and 'moid', so these may be the rows dropped later for missing
    # values anyway - confirm before relying on the class balance below.
    filled = labels.fillna('N')

    unexpected = filled[~filled.isin(list(PHA_CODES))].unique()
    if len(unexpected) > 0:
        raise ValueError(f"Unexpected values in 'pha': {unexpected.tolist()}")

    return filled.map(PHA_CODES).astype('int64')


# 1. Check unique values
print("Values in 'pha' before cleaning:", df['pha'].unique())

# 2-3. Show what the column looks like once NaN is replaced with 'N'
print("Values in 'pha' after fillna():", df['pha'].fillna('N').unique())

# 4. Convert 'Y'/'N' to 1/0 (safe to re-run, fails loudly on unknown labels)
df['pha'] = encode_pha(df['pha'])

# 5. Show result
print(df.head())

# 6. Check class balance
print("\nClass balance:")
print(df['pha'].value_counts())

Values in 'pha' before cleaning: ['N' 'Y' nan]
Values in 'pha' after fillna(): ['N' 'Y']
   pha      H       e      a      q      i    moid
0    0  10.38  0.2228  1.458  1.133  10.83  0.1480
1    0  15.59  0.5466  2.637  1.195  11.57  0.2010
2    0  13.81  0.5712  2.474  1.061   9.40  0.0797
3    0   9.18  0.5332  2.665  1.244  26.68  0.3430
4    0  17.37  0.4346  1.920  1.085  11.87  0.1080

Class balance:
pha
0    37338
1     2514
Name: count, dtype: int64


In [22]:
# Count the NaNs in each column before any row is removed.
nan_per_column = df.isna().sum()
print("Missing values BEFORE cleaning (H, moid, etc.):")
print(nan_per_column)

# Build a new frame without any row that has a NaN in at least one column;
# `df` itself is left untouched.
rows_before = len(df)
print(f"\nSize before removing NaN: {rows_before}")
df_cleaned = df.dropna(axis=0, how='any')
rows_after = len(df_cleaned)
print(f"Size after removing NaN: {rows_after}")

# Recount on the cleaned frame: every column is expected to report 0.
print("\nMissing values AFTER cleaning:")
print(df_cleaned.isna().sum())


Missing values BEFORE cleaning (H, moid, etc.):
pha       0
H         3
e         0
a         0
q         0
i         0
moid    130
dtype: int64

Size before removing NaN: 39852
Size after removing NaN: 39719

Missing values AFTER cleaning:
pha     0
H       0
e       0
a       0
q       0
i       0
moid    0
dtype: int64


In [23]:
# Summarise the cleaned frame. info() prints its report itself and returns
# None, so it is not wrapped in print() (that would emit an extra "None").
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39719 entries, 0 to 39851
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pha     39719 non-null  int64  
 1   H       39719 non-null  float64
 2   e       39719 non-null  float64
 3   a       39719 non-null  float64
 4   q       39719 non-null  float64
 5   i       39719 non-null  float64
 6   moid    39719 non-null  float64
dtypes: float64(6), int64(1)
memory usage: 2.4 MB
None


In [24]:
# Save cleaned file to 'processed' folder
output_path = '../data/processed/asteroids_cleaned.csv'

# to_csv() does not create missing parent folders, so make sure the target
# directory exists first (e.g. on a fresh checkout); no-op when it is there.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written out as a column of the CSV.
df_cleaned.to_csv(output_path, index=False)

print(f"Cleaned data saved to {output_path}")

Cleaned data saved to ../data/processed/asteroids_cleaned.csv
