# Explore ionosphere data

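The goal of this notebook is to explore the Ionosphere data set from the UCI Machine Learning Repository.
The following steps are taken:
1. Load the data
2. Explore the data
3. Generate a profiling report on the data
4. Drop the constant feature and save the processed data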


### Load the data

In [None]:
import os

from loguru import logger
from ucimlrepo import fetch_ucirepo
from ydata_profiling import ProfileReport

In [None]:
# Download the Ionosphere dataset (UCI ML Repository id 52). A failure is
# logged and then re-raised so the notebook stops here instead of continuing
# without data.
try:
    ionosphere = fetch_ucirepo(id=52)
    logger.info("Dataset loaded successfully!")
except Exception as e:
    logger.error(f"Error loading dataset: {e}")
    # Bare `raise` re-raises the active exception without rebinding it.
    raise

# Features are used as delivered by ucimlrepo; the "Class" target column is
# converted to a boolean mask that is True where the label equals "g".
X = ionosphere.data.features
y = ionosphere.data.targets["Class"] == "g"




### Quick data exploration

In [None]:
# Report the dimensions of the feature matrix and of the label vector.
for shape_message in (
    f"Size of the design matrix X: {X.shape} ",
    f"Size of the labels y: {y.shape} ",
):
    logger.info(shape_message)


In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

In [None]:
# Preview the first five boolean labels.
y.head(n=5)

In [None]:
# Count how many samples fall in each of the two label values and log the tally.
class_counts = y.value_counts()
logger.info(f"Class distribution: {class_counts}")

### Create a profile report


In [None]:
# Build a profiling report of the feature matrix and write it as HTML.
# The output path is relative to the current working directory.
profile = ProfileReport(X, title="Profiling Report")
profile.to_file("report_data.html")

We notice from the report that the `Attribute2` column is constant and hence not useful for the classification task.

In [None]:
# Remove the constant `Attribute2` feature. errors="ignore" keeps this cell
# re-runnable: on a second execution the column is already gone and the drop
# becomes a no-op instead of raising KeyError.
X = X.drop(columns=["Attribute2"], errors="ignore")
logger.info(f"Size of the design matrix X: {X.shape} ")

### Save the processed data

In [None]:
output_dir = "../data/processed"
# exist_ok=True creates the directory (and any missing parents) when absent
# and does nothing when it already exists, so no separate existence check
# is needed.
os.makedirs(output_dir, exist_ok=True)
# Write features and labels as CSV files without the index column.
X.to_csv(os.path.join(output_dir, "X.csv"), index=False)
y.to_csv(os.path.join(output_dir, "y.csv"), index=False)
logger.info(f"Data saved to {output_dir}")
