In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fetch the Iris dataset; the returned object is kept because later cells
# read its feature/target names as well.
data = load_iris()
X = data.data    # feature matrix
y = data.target  # integer class labels

In [3]:
# Human-readable labels for the feature columns and for the class ids
feature_names, target_names = data.feature_names, data.target_names

print("Feature Names:", feature_names)
print("Target Names:", target_names)

Feature Names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target Names: ['setosa' 'versicolor' 'virginica']


In [4]:
# Dimensions of the raw feature matrix (rows, columns)
X.shape

(150, 4)

In [5]:
# Peek at the first five rows of the unscaled features
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [7]:
# Peek at the first twenty class labels
y[:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [8]:
# Standardize the features: learn the per-column statistics, then apply them.
# NOTE(review): the statistics are computed over all rows of X, including
# rows that are later held out for testing.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [9]:
# f_classif Feature Selection
# Univariate selection: keep the two features that score highest under
# f_classif (the ANOVA F-statistic between each feature and the class label).
# NOTE(review): the selector is fitted on every row, i.e. before the
# train/test split that the notebook performs in a later cell.
selector = SelectKBest(score_func=f_classif, k=2)
X_selected = selector.fit_transform(X_scaled, y)
selected_features = [feature_names[i] for i in selector.get_support(indices=True)]

# The ranking criterion is the ANOVA F-score, not a Z-score.
print(f"Selected features based on ANOVA F-score: {selected_features}")

Selected features based on Z-score: ['petal length (cm)', 'petal width (cm)']


In [10]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Function to train and evaluate the model
def train_evaluate(X_train, X_test, y_train, y_test, DR_name):
    """Fit a random forest on the training split and print test-set metrics.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_test, y_test
        Features and labels the fitted classifier is scored against.
    DR_name
        Label prefixed to the printed accuracy and classification report.

    Returns
    -------
    None
        The accuracy and the classification report are printed, not returned.
    """
    # Fixed seed so repeated runs build the same forest
    clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # Compute both metrics before printing anything
    acc = accuracy_score(y_test, predictions)
    report_text = classification_report(y_test, predictions, digits=4)

    print(f"\n{DR_name} - Accuracy: {acc}\n")
    print(f"{DR_name} - Classification Report:\n{report_text}\n")


In [12]:
# Baseline run: random forest on the two f_classif-selected features
train_evaluate(
    X_train,
    X_test,
    y_train,
    y_test,
    DR_name="f_classif Feature Selection",
)


f_classif Feature Selection - Accuracy: 1.0

f_classif Feature Selection - Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        19
           1     1.0000    1.0000    1.0000        13
           2     1.0000    1.0000    1.0000        13

    accuracy                         1.0000        45
   macro avg     1.0000    1.0000    1.0000        45
weighted avg     1.0000    1.0000    1.0000        45




### Key Differences:

| **Feature**                  | **PCA**                                          | **LDA**                                             | **t-SNE**                              |
| ---------------------------- | ------------------------------------------------ | --------------------------------------------------- | -------------------------------------- |
| **Type**                     | Unsupervised                                     | Supervised                                          | Unsupervised                           |
| **Objective**                | Maximize variance                                | Maximize class separation                           | Preserve local structures              |
| **Assumptions**              | Linear relationships                             | Linear and Gaussian                                 | Non-linear relationships               |
| **Dimensionality**           | Reduces based on variance                        | Reduces based on class separability                 | Reduces for visualization purposes     |
| **Computational Complexity** | Relatively fast                                  | Faster than t-SNE                                   | Computationally expensive              |
| **Best for**                 | Exploratory data analysis and feature extraction | Classification tasks, preserving class separability | Visualizing high-dimensional data      |
| **Scale Sensitivity**        | Sensitive (need to scale data)                   | Scale-invariant in theory (scaling optional)        | Sensitive (often needs careful tuning) |

### Summary:

* **PCA** is ideal when you want to retain the largest variance without regard to class structure, typically for feature extraction.
* **LDA** is best for reducing dimensions while maintaining class separability, making it ideal for classification tasks.
* **t-SNE** is most useful for visualizing high-dimensional data in lower dimensions, especially when you need to understand clustering or relationships between data points.

Each technique has its own strengths and is suited to different types of data and goals.


In [13]:
# Split the scaled features once, *before* fitting any reducer, so that the
# PCA and LDA projections are learned from training rows only. Fitting them on
# every row (LDA with every label in particular) leaks test information into
# the features and makes the reported accuracy optimistic.
# The same test_size / random_state as the rest of the notebook is used, so
# the train/test membership is unchanged.
# NOTE(review): X_scaled itself was standardized using all rows upstream.
X_train_scaled, X_test_scaled, y_train_dr, y_test_dr = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Apply PCA: fit on the training rows, then project the held-out rows
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
y_train_pca, y_test_pca = y_train_dr, y_test_dr
X_pca = pca.transform(X_scaled)  # projection of all rows, kept for inspection/plots

# Train and evaluate the model on PCA features
train_evaluate(X_train_pca, X_test_pca, y_train_pca, y_test_pca, "PCA")

# Apply LDA: supervised, so it must only ever see the training labels
lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train_scaled, y_train_dr)
X_test_lda = lda.transform(X_test_scaled)
y_train_lda, y_test_lda = y_train_dr, y_test_dr
X_lda = lda.transform(X_scaled)  # projection of all rows, kept for inspection/plots

# Train and evaluate the model on LDA features
train_evaluate(X_train_lda, X_test_lda, y_train_lda, y_test_lda, "LDA")

# Apply t-SNE
# TSNE exposes fit_transform only (it cannot project unseen rows), so the
# embedding has to be computed on all rows and split afterwards. Its score is
# therefore not a like-for-like comparison with PCA/LDA above.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)
X_train_tsne, X_test_tsne, y_train_tsne, y_test_tsne = train_test_split(
    X_tsne, y, test_size=0.3, random_state=42
)
# Train and evaluate the model on t-SNE features
train_evaluate(X_train_tsne, X_test_tsne, y_train_tsne, y_test_tsne, "t-SNE")


PCA - Accuracy: 0.9555555555555556

PCA - Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        19
           1     0.9231    0.9231    0.9231        13
           2     0.9231    0.9231    0.9231        13

    accuracy                         0.9556        45
   macro avg     0.9487    0.9487    0.9487        45
weighted avg     0.9556    0.9556    0.9556        45



LDA - Accuracy: 1.0

LDA - Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        19
           1     1.0000    1.0000    1.0000        13
           2     1.0000    1.0000    1.0000        13

    accuracy                         1.0000        45
   macro avg     1.0000    1.0000    1.0000        45
weighted avg     1.0000    1.0000    1.0000        45



t-SNE - Accuracy: 0.9777777777777777

t-SNE - Classification Report:
              precision    recall  f1-score   sup