# Data Preprocessing for Classification

## Import libraries

In [12]:
import pickle

import pandas as pd
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load the dataset

In [13]:
# Choose which dataset to preprocess.
# Supported keys: "iris", "wine", "breast_cancer"
selected_dataset = "wine"  # Edit this value to switch datasets

# Map every supported key to its loader function; only the chosen loader is called.
dataset_loaders = {
    "iris": load_iris,
    "wine": load_wine,
    "breast_cancer": load_breast_cancer,
}

# Reject unknown keys before attempting to load anything.
if selected_dataset not in dataset_loaders:
    raise ValueError("Invalid dataset selected. Choose from 'iris', 'wine', or 'breast_cancer'.")

# as_frame=True is requested so the features/target support the pandas
# operations used further down (isnull, dropna, value_counts, ...).
data = dataset_loaders[selected_dataset](as_frame=True)

# Feature table, target vector, and class names of the loaded dataset.
X, y = data.data, data.target
label_names = data.target_names

## Handle missing values

In [3]:
# How rows containing NaN are treated:
#   "remove" - drop every row that has at least one NaN
#   "impute" - replace each NaN with the mean of its column
handle_missing = "remove"


if X.isnull().any().any():
    print("Found NaN values.")
    if handle_missing == "remove":
        print("Removing rows with NaN values...")
        X = X.dropna()
        # Keep only the targets whose feature rows survived; .loc makes the
        # label-based alignment with X's index explicit.
        y = y.loc[X.index]
    elif handle_missing == "impute":
        print("Imputing missing values with mean...")
        imputer = SimpleImputer(strategy='mean')
        # fit_transform returns a bare array: restore the column names AND the
        # original index so X stays row-aligned with y.
        X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
    else:
        raise ValueError("Invalid option for 'handle_missing'. Choose 'remove' or 'impute'.")
else:
    print("No missing values found.")

No missing values found.


## Data overview

In [10]:
# Show the first rows, the column summary, and the overall shape of the features.
print("Dataset Overview:")
display(X.head())
print("_"*100)
print("\nDataset Info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the summary.
X.info()
print("_"*100)
print(f"\nNumber of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")
print("_"*100)


# Number of samples per class, ordered by the integer class code.
label_count = y.value_counts().sort_index()
print("\nLabels and their counts:")
for i, label in enumerate(label_names):
    # A class may have no samples left (e.g. after rows were dropped), in
    # which case it is missing from value_counts(); report it as 0.
    print(f"{label}: {label_count.get(i, 0)}")


Dataset Overview:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


____________________________________________________________________________________________________

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB
None
____________________________________________________________________________________________________

Number of samples: 150
Number of features: 4
____________________________________________________________________________________________________

Labels and their counts:
setosa: 50
versicolor: 50
virginica: 50


## Standardize dataset

In [5]:
# Fit the scaler on the feature table first ...
scaler = StandardScaler()
scaler.fit(X)
# ... then produce the standardized copy; X itself is left untouched.
X_scaled = scaler.transform(X)

## Save data

In [11]:
# Bundle both feature variants together with the targets and class names:
# the standardized features are intended for PCA, the raw ones for CV.
preprocessed_data = dict(
    X_scaled=X_scaled,        # standardized features (for PCA)
    X_raw=X,                  # unscaled features (for CV)
    y=y,
    label_names=label_names,
)

# Serialize the bundle to a pickle file in the current working directory.
output_path = 'preprocessed_data.pkl'
with open(output_path, 'wb') as pkl_file:
    pickle.dump(preprocessed_data, pkl_file)