# Data Preprocessing for Classification

## Import libraries

In [12]:
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import pickle

## Load the dataset

In [13]:
## USER SELECT DATASET
# Options: "iris", "wine", "breast_cancer"
selected_dataset = "wine"
###########################################


# Dispatch table: each supported dataset name maps to its scikit-learn loader.
dataset_loaders = {
    "iris": load_iris,
    "wine": load_wine,
    "breast_cancer": load_breast_cancer,
}

# Reject unknown names before attempting to load anything.
if selected_dataset not in dataset_loaders:
    raise ValueError("Invalid dataset selected. Choose from 'iris', 'wine', or 'breast_cancer'.")

# as_frame=True asks the loader for pandas objects rather than NumPy arrays.
data = dataset_loaders[selected_dataset](as_frame=True)

# Feature table, class targets, and the class names of the chosen dataset.
X, y = data.data, data.target
label_names = data.target_names

## Handle missing values

In [None]:
## USER change to handle NaN: "remove" or "impute"
handle_missing = "remove"
###################################################


# The option is only consulted (and validated) when X actually contains NaNs.
if X.isnull().any().any():
    print("Found NaN values.")
    if handle_missing == "remove":
        print("Removing rows with NaN values...")
        X = X.dropna()
        # Keep y aligned with the surviving rows; .loc makes the label-based
        # lookup explicit instead of relying on Series.__getitem__ heuristics.
        y = y.loc[X.index]
    elif handle_missing == "impute":
        print("Imputing missing values with mean...")
        from sklearn.impute import SimpleImputer
        imputer = SimpleImputer(strategy='mean')
        # fit_transform returns a bare array, so restore both the column names
        # and the original row index. Without index=X.index the new frame gets a
        # fresh 0..n-1 index and may no longer line up with y.
        X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
    else:
        raise ValueError("Invalid option for 'handle_missing'. Choose 'remove' or 'impute'.")
else:
    print("No missing values found.")

## Data overview

In [None]:
# Quick look at the feature table: first rows, schema, and overall size.
print("Dataset Overview:")
display(X.head())
print("_"*100)
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
X.info()
print("_"*100)
print(f"\nNumber of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")
print("_"*100)


# Count samples per class. Reindexing over every class id (the positions of
# label_names, as the loop below already assumes) makes a class with no
# remaining samples report 0 instead of raising KeyError.
label_count = y.value_counts().reindex(range(len(label_names)), fill_value=0)
print("\nLabels and their counts:")
for i, label in enumerate(label_names):
    print(f"{label}: {label_count.loc[i]}")


## Standardize dataset

In [5]:
# Fit the scaler on the feature table, then apply it to get the standardized matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

## Save data

In [11]:
# Bundle what the follow-up notebooks need: standardized features for PCA
# (notebook 2) and the unscaled features for CV (notebook 3).
preprocessed_data = dict(
    X_scaled=X_scaled,        # For PCA
    X_raw=X,                  # For CV
    y=y,
    label_names=label_names,
)

# Serialize the bundle to disk in binary mode.
with open('preprocessed_data.pkl', 'wb') as pkl_file:
    pickle.dump(preprocessed_data, pkl_file)