<a href="https://colab.research.google.com/github/LuisKolb/viz2-2023S/blob/main/feature_space_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
import itertools
import os

## IRIS

In [26]:
# dataset file generation

# base_dir = 'src/resources/iris'
base_dir = "files_out/iris/"

# Load the Iris dataset
iris = load_iris()

for combination in list(itertools.combinations(enumerate(iris.feature_names), 2)):
    print(combination[0], combination[1])

    # Select the features to use for classification
    X = iris.data[
        :, [combination[0][0], combination[1][0]]
    ]  # sepal length and petal width
    y = iris.target

    # print(f'min(X0)={min(X[:,0])}   ; max(X0)={max(X[:,0])} ;   step(X0)={(max(X[:,0])-min(X[:,0]))/100}')

    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Train a neural network classifier with a logistic output layer
    clf = MLPClassifier(
        hidden_layer_sizes=(7,),
        max_iter=5000,
        random_state=42,
        activation="logistic",
        solver="adam",
        alpha=0.0001,
        batch_size="auto",
        learning_rate="constant",
        learning_rate_init=0.001,
        power_t=0.5,
        momentum=0.9,
        nesterovs_momentum=True,
        early_stopping=False,
        validation_fraction=0.1,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-08,
        n_iter_no_change=10,
        tol=0.0001,
        verbose=False,
        warm_start=False,
    )
    clf.fit(X_train, y_train)

    # Test classifier
    # Use the trained classifier to make predictions on the test dataset
    y_pred = clf.predict(np.concatenate((X_train, X_test), axis=0))

    # Compare the predicted labels with the true labels
    correct_samples = sum(y_pred == np.concatenate((y_train, y_test), axis=0))
    incorrect_samples = sum(y_pred != np.concatenate((y_train, y_test), axis=0))

    # Print the number of correct and incorrect identified samples
    print(f"Correct samples: {correct_samples}")
    print(f"Incorrect samples: {incorrect_samples}")

    # Generate a grid of feature combinations ()
    grid_size = 60  # controls plot "resolution"
    X0_range = np.arange(
        min(X[:, 0]), max(X[:, 0]), (max(X[:, 0]) - min(X[:, 0])) / grid_size
    )
    X1_range = np.arange(
        min(X[:, 1]), max(X[:, 1]), (max(X[:, 1]) - min(X[:, 1])) / grid_size
    )
    X0, X1 = np.meshgrid(X0_range, X1_range)
    X_grid = np.column_stack((X0.ravel(), X1.ravel()))


    # Predict the class probabilities for each feature combination
    y_grid_prob = clf.predict_proba(X_grid)

    if not os.path.exists(base_dir):
      os.makedirs(base_dir)
    
    for i, class_name in enumerate(iris.target_names):
        # save format: X0 X1 <probability of class 1> <probability of class 2> <probability of class 3>
        X0_X1_classprobs = np.column_stack((X_grid, y_grid_prob))
        np.savetxt(
            fname=f"{base_dir}/{combination[0][1]}_vs_{combination[1][1]}.txt",
            X=X0_X1_classprobs,
            delimiter=",",
            header=f"{combination[0][1]},{combination[1][1]},{','.join(iris.target_names)}",
            comments=""
        )

(0, 'sepal length (cm)') (1, 'sepal width (cm)')
Correct samples: 111
Incorrect samples: 39
(0, 'sepal length (cm)') (2, 'petal length (cm)')
Correct samples: 143
Incorrect samples: 7
(0, 'sepal length (cm)') (3, 'petal width (cm)')
Correct samples: 144
Incorrect samples: 6
(1, 'sepal width (cm)') (2, 'petal length (cm)')
Correct samples: 140
Incorrect samples: 10
(1, 'sepal width (cm)') (3, 'petal width (cm)')
Correct samples: 143
Incorrect samples: 7
(2, 'petal length (cm)') (3, 'petal width (cm)')
Correct samples: 144
Incorrect samples: 6


## palmerpenguins

In [54]:
# dataset file generation

# base_dir = 'src/resources/penguins'
base_dir = "files_out/penguins/"

# Load the penguins dataset
penguins = pd.read_csv("data/penguins.csv")
penguins.dropna(inplace=True)
# display(penguins.isna().sum())
penguins.feature_names = ["bill_length_mm","bill_depth_mm","flipper_length_mm","body_mass_g"]
penguins.data = penguins[penguins.feature_names]

penguins.target_names = ["Torgersen", "Biscoe", "Dream"]
penguins.target = pd.factorize(penguins["island"])


for combination in list(itertools.combinations(enumerate(penguins.feature_names), 2)):
    print([combination[0][1], combination[1][1]])

    X1 = combination[0][1]
    
    X2 = combination[1][1]
    # Select the features to use for classification
    X = penguins[[X1, X2]] 
    y = penguins.target[0]

    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Train a neural network classifier with a logistic output layer
    clf = MLPClassifier(
        hidden_layer_sizes=(5,),
        max_iter=1000,
        random_state=42,
        activation="logistic",
        solver="adam",
        alpha=0.0001,
        batch_size="auto",
        learning_rate="constant",
        learning_rate_init=0.001,
        power_t=0.5,
        momentum=0.9,
        nesterovs_momentum=True,
        early_stopping=False,
        validation_fraction=0.1,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-08,
        n_iter_no_change=10,
        tol=0.0001,
        verbose=False,
        warm_start=False,
    )
    clf.fit(X_train.values, y_train)

    # Test classifier
    # Use the trained classifier to make predictions on the test dataset
    y_pred = clf.predict(np.concatenate((X_train, X_test), axis=0))

    # Compare the predicted labels with the true labels
    correct_samples = sum(y_pred == np.concatenate((y_train, y_test), axis=0))
    incorrect_samples = sum(y_pred != np.concatenate((y_train, y_test), axis=0))

    # Print the number of correct and incorrect identified samples
    print(f"Correct samples: {correct_samples}")
    print(f"Incorrect samples: {incorrect_samples}")

    # Generate a grid of feature combinations ()
    grid_size = 20  # controls plot "resolution"
    X0_range = np.arange(
        X.min()[0], X.max()[0], (X.max()[0] - X.min()[0]) / grid_size
    )
    X1_range = np.arange(
        X.min()[1], X.max()[1], (X.max()[1] - X.min()[1]) / grid_size
    )
    X0, X1 = np.meshgrid(X0_range, X1_range)
    X_grid = np.column_stack((X0.ravel(), X1.ravel()))


    # Predict the class probabilities for each feature combination
    y_grid_prob = clf.predict_proba(X_grid)


    if not os.path.exists(base_dir):
      os.makedirs(base_dir)
    
    for i, class_name in enumerate(penguins.target_names):
        # save format: X0 X1 <probability of class 1> <probability of class 2> <probability of class 3>
        X0_X1_classprobs = np.column_stack((X_grid, y_grid_prob))
        np.savetxt(
            fname=f"{base_dir}/{combination[0][1]}_vs_{combination[1][1]}.txt",
            X=X0_X1_classprobs,
            delimiter=",",
            header=f"{combination[0][1]},{combination[1][1]},{','.join(penguins.target_names)}",
            comments=""
        )

  penguins.feature_names = ["bill_length_mm","bill_depth_mm","flipper_length_mm","body_mass_g"]
  penguins.data = penguins[penguins.feature_names]
  penguins.target_names = ["Torgersen", "Biscoe", "Dream"]
  penguins.target = pd.factorize(penguins["species"])


['bill_length_mm', 'bill_depth_mm']




Correct samples: 322
Incorrect samples: 11
['bill_length_mm', 'flipper_length_mm']
Correct samples: 246
Incorrect samples: 87
['bill_length_mm', 'body_mass_g']
Correct samples: 146
Incorrect samples: 187
['bill_depth_mm', 'flipper_length_mm']
Correct samples: 265
Incorrect samples: 68
['bill_depth_mm', 'body_mass_g']
Correct samples: 146
Incorrect samples: 187
['flipper_length_mm', 'body_mass_g']
Correct samples: 146
Incorrect samples: 187
