In [None]:
import os

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap.umap_ as umap
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.manifold import TSNE
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, auc, roc_auc_score,
    adjusted_rand_score, normalized_mutual_info_score,
    fowlkes_mallows_score, silhouette_score,
    calinski_harabasz_score, davies_bouldin_score
)
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV, learning_curve, validation_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.svm import SVC


## 1. Preprocessing

In [None]:
# Root folder of the extracted UCI HAR archive. This is the only place the
# machine-specific location appears; edit it to run the notebook elsewhere.
HAR_DIR = r"D:\Coding\DSAA2011_project\human+activity+recognition+using+smartphones\UCI HAR Dataset\UCI HAR Dataset"


def _read_har_table(split, stem):
    """Read ``<HAR_DIR>/<split>/<stem>_<split>.txt`` as a headerless table.

    Parameters
    ----------
    split : str
        Sub-folder name, ``"train"`` or ``"test"``.
    stem : str
        File prefix, ``"X"`` for features or ``"y"`` for labels.

    Returns
    -------
    pandas.DataFrame
        Whitespace-delimited contents of the file, integer column names.
    """
    path = os.path.join(HAR_DIR, split, f"{stem}_{split}.txt")
    return pd.read_csv(path, sep=r'\s+', header=None)


X_train = _read_har_table("train", "X")
# squeeze() turns the single-column label frame into a Series
y_train = _read_har_table("train", "y").squeeze()
X_test = _read_har_table("test", "X")
y_test = _read_har_table("test", "y").squeeze()

# Fit the scaler on the training split only, then reuse its statistics on the
# test split so no test information leaks into the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



**Applied Methods:**
We read the UCI HAR dataset and used StandardScaler to standardize every feature to zero mean and unit variance. The scaler is fitted on the training split only and then applied unchanged to the test split.

**Observed Patterns or Insights:**
The dataset has 561 continuous features and clear labels. There are no missing values, and the data is ready for further analysis like classification or clustering.

## 2. Visualization

In [None]:
# Three 2-D embeddings of the standardized training features, drawn side by
# side and coloured by the activity label. Every reducer gets the same seed so
# a re-run reproduces the figure (PCA only uses it when a randomized solver is
# selected; it is ignored otherwise).
methods = {
    "PCA": PCA(n_components=2, random_state=42),
    "t-SNE": TSNE(n_components=2, perplexity=30, random_state=42),
    "UMAP": umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
}

fig, axes = plt.subplots(1, len(methods), figsize=(18, 5))
for ax, (name, model) in zip(axes, methods.items()):
    X_vis = model.fit_transform(X_train_scaled)
    points = ax.scatter(X_vis[:, 0], X_vis[:, 1], c=y_train, cmap='tab10', s=10)
    ax.set_title(name)
    ax.set_xlabel("Dim 1")
    ax.set_ylabel("Dim 2")

# All panels share the same colour coding, so one figure-level legend built
# from the last scatter is enough to decode the colours.
handles, class_ids = points.legend_elements()
fig.legend(handles, class_ids, title="Activity", loc="center right")

# Leave a strip on the right for the legend.
fig.tight_layout(rect=(0, 0, 0.95, 1))
plt.show()


**Applied Methods:**
We used StandardScaler to normalize the data, then applied PCA, t-SNE, and UMAP to reduce the data to 2D for visualization.

**Observed Patterns or Insights:**
PCA results were scattered with no clear clusters. t-SNE showed clear groupings by label. UMAP also formed tight clusters and kept some global structure.

## 3. Clustering

In [None]:
# Clusterers compared in this section, keyed by the name used in tables and
# plot titles. The three partitioning methods are asked for the same number of
# groups (presumably one per activity class - confirm); DBSCAN derives its
# cluster count from the density parameters instead.
_n_groups = 6
_seed = 42
algorithms = dict(
    KMeans=KMeans(n_clusters=_n_groups, random_state=_seed),
    GMM=GaussianMixture(n_components=_n_groups, random_state=_seed),
    Agglomerative=AgglomerativeClustering(n_clusters=_n_groups, linkage='ward'),
    DBSCAN=DBSCAN(eps=0.5, min_samples=3),
)

def purity_score(y_train, y_pred):
    """Return the purity of a clustering with respect to reference labels.

    Every cluster is credited with the size of its largest reference class;
    purity is the summed credit divided by the number of samples.

    The contingency table is built directly with NumPy, so the reference
    labels and the cluster ids may be of different kinds (for example string
    activity names against integer cluster ids).

    Parameters
    ----------
    y_train : array-like of shape (n_samples,)
        Reference (ground-truth) labels.
    y_pred : array-like of shape (n_samples,)
        Cluster assignments. A noise id such as -1 is counted as an ordinary
        cluster.

    Returns
    -------
    float
        Purity in (0, 1], or NaN when there are no samples.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of samples.
    """
    true_labels = np.asarray(y_train).ravel()
    cluster_ids = np.asarray(y_pred).ravel()
    if true_labels.size != cluster_ids.size:
        raise ValueError(
            "y_train and y_pred must have the same number of samples "
            f"(got {true_labels.size} and {cluster_ids.size})"
        )
    if true_labels.size == 0:
        # Nothing to score; avoid a 0/0 division.
        return float('nan')

    # Map both label sets to dense 0..k-1 codes, then count co-occurrences.
    true_classes, true_codes = np.unique(true_labels, return_inverse=True)
    clusters, cluster_codes = np.unique(cluster_ids, return_inverse=True)
    contingency = np.zeros((true_classes.size, clusters.size), dtype=np.int64)
    np.add.at(contingency, (true_codes, cluster_codes), 1)

    # Column-wise maximum = size of the majority class inside each cluster.
    return contingency.max(axis=0).sum() / true_labels.size

def evaluate_clustering(y_train, X_train, labels):
    """Score one clustering with external and internal quality indices.

    Parameters
    ----------
    y_train : array-like
        Reference labels the clustering is compared against.
    X_train : array-like
        Feature matrix the clustering was computed on; used by the internal
        indices.
    labels : array-like
        Cluster assignment per sample.

    Returns
    -------
    dict
        Keys ``ARI``, ``NMI``, ``FMI``, ``Purity``, ``Silhouette``, ``CH`` and
        ``DBI``. The last three are NaN when the clustering has a single
        distinct label or contains the label ``-1``.
    """
    distinct = set(labels)
    # Internal indices are only computed for a clustering with more than one
    # label and without the -1 label.
    valid = len(distinct) > 1 and -1 not in distinct

    scores = {
        'ARI': adjusted_rand_score(y_train, labels),
        'NMI': normalized_mutual_info_score(y_train, labels),
        'FMI': fowlkes_mallows_score(y_train, labels),
        'Purity': purity_score(y_train, labels),
    }

    internal_indices = (
        ('Silhouette', silhouette_score),
        ('CH', calinski_harabasz_score),
        ('DBI', davies_bouldin_score),
    )
    for key, index_fn in internal_indices:
        scores[key] = index_fn(X_train, labels) if valid else np.nan
    return scores

# --- Fit every clusterer and score it against the activity labels ----------
# NOTE(review): the clusterers run on X_train, not X_train_scaled - confirm
# that clustering the unscaled features is intended.
results = {}
cluster_labels = {}

for name, algo in algorithms.items():
    print(f"Clustering with {name}...")
    # Every estimator in `algorithms` (GaussianMixture included) implements
    # fit_predict, so no per-model branching is needed.
    labels = algo.fit_predict(X_train)
    cluster_labels[name] = labels
    results[name] = evaluate_clustering(y_train, X_train, labels)

results_df = pd.DataFrame(results).T
print("\n=== Clustering Evaluation Results ===")
print(results_df)

results_df.to_csv("clustering_evaluation_summary.csv")

# --- One shared t-SNE embedding, recoloured by each method's assignment ----
tsne = TSNE(n_components=2, perplexity=40, random_state=42)
X_tsne = tsne.fit_transform(X_train)

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

for i, (name, labels) in enumerate(cluster_labels.items()):
    axes[i].scatter(X_tsne[:, 0], X_tsne[:, 1], c=labels, cmap='tab10', s=10)
    axes[i].set_title(f"{name} Clustering")
    axes[i].set_xticks([])
    axes[i].set_yticks([])

plt.tight_layout()
plt.show()

# --- Ensemble of the two methods with the best mean external score ---------
external_scores = results_df[['ARI', 'NMI', 'FMI', 'Purity']].mean(axis=1)
top2 = external_scores.sort_values(ascending=False).head(2).index.tolist()
print(f"Top 2 clustering methods selected for ensemble: {top2}")

# Cluster ids are arbitrary names, not magnitudes: id 0 is no "closer" to id 1
# than to id 5. One-hot encode each method's assignment so KMeans measures
# agreement between assignments instead of distances between id numbers.
ensemble_features = np.hstack([
    pd.get_dummies(pd.Series(cluster_labels[method])).to_numpy(dtype=float)
    for method in top2
])

# KMeans cannot form more clusters than there are distinct rows.
n_ensemble_clusters = min(6, len(np.unique(ensemble_features, axis=0)))
ensemble_kmeans = KMeans(n_clusters=n_ensemble_clusters, random_state=42)
ensemble_labels = ensemble_kmeans.fit_predict(ensemble_features)

cluster_labels['Ensemble'] = ensemble_labels
results['Ensemble'] = evaluate_clustering(y_train, X_train, ensemble_labels)

results_df = pd.DataFrame(results).T
print("\n=== Updated Clustering Evaluation Results (with Ensemble) ===")
print(results_df)
results_df.to_csv("clustering_evaluation_with_ensemble.csv")

# --- Reference labels vs. ensemble assignment on the same embedding --------
for title, colouring in (("Ground Truth Labels", y_train),
                         ("Ensemble Clustering", ensemble_labels)):
    plt.figure(figsize=(6, 5))
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=colouring, cmap='tab10', s=10)
    plt.title(title)
    plt.xticks([])
    plt.yticks([])
    plt.grid(True)
    plt.tight_layout()
    plt.show()


### Clustering Methods

- **K-Means**: Fast and simple; works well for spherical clusters; commonly used as a baseline.  
- **GMM**: Probabilistic model; supports soft clustering; good for overlapping clusters.  
- **Agglomerative Clustering**: Hierarchical approach; captures structure; doesn’t rely on initial centroids.  
- **DBSCAN**: Density-based; no need to set cluster number; can detect noise.


### Evaluation Metrics

**External Metrics**:

- **ARI**: Adjusted similarity to ground truth.  
- **NMI**: Measures shared information.  
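- **FMI**, **Purity**: FMI is the geometric mean of pairwise precision and recall; Purity is the fraction of samples that belong to the majority true class of their cluster.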

**Internal Metrics**:

- **Silhouette Score**: Cluster separation and cohesion.  
- **Calinski-Harabasz Index (CH)**: Ratio of between- to within-cluster variance.  
- **Davies-Bouldin Index (DBI)**: Average similarity between each cluster and its most similar cluster (lower is better).


### Results & Insights

- **GMM**: Best performance overall; closely matches true labels.  
- **K-Means**: Fast and stable; decent results, but weaker on internal metrics.  
- **Agglomerative**: Moderate results; affected by high dimensionality.  
- **DBSCAN**: Poor performance; struggled with structure and density variation.


### Conclusion

**GMM** and **K-Means** are the most suitable clustering methods for this dataset.


## 4. Model Training
- Logistic Regression 
- SVM
- Random Forest 

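Each of the following cells reloads the data from the raw text files (relative paths, unscaled features) instead of reusing the variables created in Section 1.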

In [None]:
# Load the feature matrices and integer labels from paths relative to the
# working directory.
X_train = np.loadtxt('train/X_train.txt')
X_test = np.loadtxt('test/X_test.txt')

y_train = np.loadtxt('train/y_train.txt').astype(int)
y_test = np.loadtxt('test/y_test.txt').astype(int)

# Train logistic regression model.
# `multi_class` is intentionally not passed: the parameter is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a problem with more
# than two classes is fitted as a multinomial model by default.
clf = LogisticRegression(max_iter=200, solver='lbfgs')
clf.fit(X_train, y_train)

# Predictions on both splits, so train and test scores can be compared
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

# Accuracy
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Logistic Regression Train Accuracy: {train_accuracy:.4f}')
print(f'Logistic Regression Test Accuracy: {test_accuracy:.4f}')

# Precision (support-weighted average over classes)
train_precision = precision_score(y_train, y_pred_train, average='weighted')
test_precision = precision_score(y_test, y_pred, average='weighted')
print(f'Logistic Regression Train Precision: {train_precision:.4f}')
print(f'Logistic Regression Test Precision: {test_precision:.4f}')

# Recall (support-weighted average over classes)
train_recall = recall_score(y_train, y_pred_train, average='weighted')
test_recall = recall_score(y_test, y_pred, average='weighted')
print(f'Logistic Regression Train Recall: {train_recall:.4f}')
print(f'Logistic Regression Test Recall: {test_recall:.4f}')

# F1 (support-weighted average over classes)
train_f1 = f1_score(y_train, y_pred_train, average='weighted')
test_f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Logistic Regression Train F1 Score: {train_f1:.4f}')
print(f'Logistic Regression Test F1 Score: {test_f1:.4f}')

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Grouped bars: every metric on the training split next to the test split
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
train_scores = [train_accuracy, train_precision, train_recall, train_f1]
test_scores = [test_accuracy, test_precision, test_recall, test_f1]

x = np.arange(len(metrics))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x - width/2, train_scores, width, label='Train', color='skyblue')
ax.bar(x + width/2, test_scores, width, label='Test', color='orange')

ax.set_ylim(0, 1)
ax.set_ylabel('Score')
ax.set_title('Logistic Regression Performance Metrics')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend(loc='lower right')
ax.grid(axis='y', alpha=0.3)

# Print the numeric value just above each bar
for pos, (train_val, test_val) in enumerate(zip(train_scores, test_scores)):
    ax.text(pos - width/2, train_val + 0.01, f'{train_val:.4f}', ha='center', va='bottom', fontsize=9)
    ax.text(pos + width/2, test_val + 0.01, f'{test_val:.4f}', ha='center', va='bottom', fontsize=9)

fig.tight_layout()
fig.savefig('logistic_performance_metrics.png')
plt.show()

# Accuracy only, as a compact two-bar figure
accuracy_pair = [train_accuracy, test_accuracy]
fig, ax = plt.subplots(figsize=(5, 4))
ax.bar(['Train', 'Test'], accuracy_pair, color=['skyblue', 'orange'])
ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_title('Logistic Regression Accuracy')
for pos, value in enumerate(accuracy_pair):
    ax.text(pos, value + 0.01, f'{value:.4f}', ha='center', fontsize=12)
fig.tight_layout()
plt.show()

# Confusion matrix visualization
# Fix the label order explicitly so the tick labels show the real class ids
# rather than the default 0..n-1 positions.
class_ids = np.unique(np.concatenate([y_test, y_pred]))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Draw on an axes we create ourselves: ConfusionMatrixDisplay.plot() opens a
# new figure when no axes is passed, which would leave an empty one behind.
fig, ax = plt.subplots(figsize=(10, 8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_ids)
disp.plot(cmap=plt.cm.Blues, ax=ax)
ax.set_title("Confusion Matrix")
fig.tight_layout()
fig.savefig('logistic_confusion_matrix.png')
plt.show()


# ROC and AUC visualization
print("\nGenerating ROC and AUC visualization...")

# The columns of predict_proba follow clf.classes_, so the labels are
# binarized against that same array. This keeps column i of both matrices on
# the same class even if the class ids are not exactly 1..n.
class_ids = clf.classes_
n_classes = len(class_ids)

y_test_bin = label_binarize(y_test, classes=class_ids)
y_score = clf.predict_proba(X_test)
fpr = dict()
tpr = dict()
roc_auc = dict()
plt.figure(figsize=(8, 6))
# One-vs-rest curve per class
for i, class_id in enumerate(class_ids):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    plt.plot(fpr[i], tpr[i], lw=2, label=f'Class {class_id} (AUC = {roc_auc[i]:.2f})')
# Micro-average: pool every (sample, class) decision into one binary problem
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
plt.plot(fpr["micro"], tpr["micro"],
         label=f'micro-average ROC curve (AUC = {roc_auc["micro"]:.2f})',
         color='deeppink', linestyle=':', linewidth=4)
# Diagonal reference line for comparison
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi-class ROC Curve (One-vs-Rest)')
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig('logistic_multiclass_roc_auc.png')
plt.close()
print("Multi-class ROC and AUC visualization saved as 'logistic_multiclass_roc_auc.png'")


# Learning curve: accuracy on the fitted folds and on the held-out folds as a
# growing share of the training data is used (5-fold CV, ten sizes).
train_sizes, train_scores, test_scores = learning_curve(
    clf, X_train, y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5, scoring='accuracy', n_jobs=-1,
    shuffle=True, random_state=42,
)
# Average over the CV folds (one row per training size)
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
ax.set_xlabel('Training Examples')
ax.set_ylabel('Score')
ax.set_title('Learning Curve (Logistic Regression)')
ax.legend(loc='best')
ax.grid()
fig.tight_layout()
fig.savefig('logistic_learning_curve.png')
plt.close(fig)

# Visualize decision boundary with PCA dimensionality reduction
print("\nGenerating decision boundary visualization...")

# Load activity labels: the second whitespace-separated field of every
# non-empty line is taken as the activity name.
labels = []
with open('activity_labels.txt', 'r') as f:
    for line in f:
        fields = line.split()
        if len(fields) >= 2:  # skip blank or malformed lines
            labels.append(fields[1])

# Use a subset of test data for visualization. A seeded generator makes the
# subset, and therefore the figure, reproducible across runs.
rng = np.random.default_rng(42)
sample_size = min(1000, X_test.shape[0])
indices = rng.choice(X_test.shape[0], size=sample_size, replace=False)
X_test_sample = X_test[indices]
y_test_sample = y_test[indices]

# Apply PCA to reduce dimensions to 2D
pca = PCA(n_components=2)
X_test_pca = pca.fit_transform(X_test_sample)

# Binary demo problem for the plot: the most frequent class in the sample
# versus all other classes.
unique_classes, class_counts = np.unique(y_test_sample, return_counts=True)
most_common_class = unique_classes[np.argmax(class_counts)]
y_binary = (y_test_sample == most_common_class).astype(int)

# Train logistic regression on the 2D projection (illustration only)
lr_2d = LogisticRegression(max_iter=200, solver='lbfgs')
lr_2d.fit(X_test_pca, y_binary)

# Create a mesh grid covering the projected points plus a margin of 1
h = 0.02  # Step size
x_min, x_max = X_test_pca[:, 0].min() - 1, X_test_pca[:, 0].max() + 1
y_min, y_max = X_test_pca[:, 1].min() - 1, X_test_pca[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Probability of the positive (most common) class at each grid point
Z = lr_2d.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)

# Plot the probability surface
plt.figure(figsize=(12, 10))
plt.contourf(xx, yy, Z, levels=np.linspace(0, 1, 11), cmap=plt.cm.RdBu, alpha=0.8)
# Name the actual positive class; "Class 1" would read as activity id 1.
plt.colorbar(label=f'Probability of Class {most_common_class}')

# Decision boundary = the 0.5 probability contour
plt.contour(xx, yy, Z, levels=[0.5], colors='k', linestyles='-', linewidths=2)

# Plot class samples
plt.scatter(X_test_pca[y_binary == 0, 0], X_test_pca[y_binary == 0, 1],
           c='blue', label=f'Other Classes', s=50, alpha=0.8, edgecolors='k')
plt.scatter(X_test_pca[y_binary == 1, 0], X_test_pca[y_binary == 1, 1],
           c='red', label=f'Class {most_common_class} ({labels[most_common_class-1]})', s=50, alpha=0.8, edgecolors='k')

plt.title('Logistic Regression Decision Boundary (PCA-Reduced Data)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='best')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('logistic_decision_boundary.png')
plt.close()
print("Logistic regression decision boundary visualization saved as 'logistic_decision_boundary.png'")

# Visualize multi-class decision boundary if possible
print("\nGenerating multi-class decision boundary visualization...")

# Train model on all classes with 2D data. `multi_class` is not passed: it is
# deprecated in recent scikit-learn, and lbfgs fits a multinomial model by
# default when there are more than two classes.
lr_multi = LogisticRegression(max_iter=200, solver='lbfgs')
lr_multi.fit(X_test_pca, y_test_sample)

# Predicted class at every point of the mesh built for the binary plot
Z_multi = lr_multi.predict(np.c_[xx.ravel(), yy.ravel()])
Z_multi = Z_multi.reshape(xx.shape)

# Plot the decision boundary
plt.figure(figsize=(12, 10))

# One colour per class (tab10 provides at most ten)
n_classes = len(unique_classes)
colors = list(plt.cm.tab10.colors)[:n_classes]
cmap = mcolors.ListedColormap(colors)

# Put one bin edge between consecutive class ids and share the resulting norm
# between regions and points, so a region, the samples of that class and the
# legend entry all use exactly the same colour.
class_edges = np.concatenate(([unique_classes[0] - 0.5], unique_classes + 0.5))
norm = mcolors.BoundaryNorm(class_edges, cmap.N)

# Plot the decision regions
plt.contourf(xx, yy, Z_multi, levels=class_edges, cmap=cmap, norm=norm, alpha=0.3)

# Plot class samples
scatter = plt.scatter(X_test_pca[:, 0], X_test_pca[:, 1],
                     c=y_test_sample, cmap=cmap, norm=norm, s=50, alpha=0.9, edgecolors='k')

plt.title('Multi-class Logistic Regression Decision Boundaries (PCA-Reduced Data)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')

# Add legend with class labels
legend_elements = []
for i, class_id in enumerate(unique_classes):
    legend_elements.append(plt.Line2D([0], [0], marker='o', color='w',
                          markerfacecolor=colors[i % len(colors)],
                          markersize=10, label=f'Class {class_id} ({labels[class_id-1]})'))

plt.legend(handles=legend_elements, loc='best')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('logistic_multiclass_boundary.png')
plt.close()
print("Multi-class logistic regression boundaries visualization saved as 'logistic_multiclass_boundary.png'")




1. **Data Loading & Preparation**  
   - Read training and test features (`X_train`, `X_test`) and labels (`y_train`, `y_test`) from text files.  
   - No additional preprocessing is shown, assuming data is ready for model fitting.

2. **Model Training**  
   - Instantiate a multinomial **Logistic Regression** classifier (`lbfgs` solver, `max_iter=200`).  
   - Fit the model on the training set (`X_train`, `y_train`).

3. **Performance Evaluation**  
   - Compute **accuracy**, **precision**, **recall**, and **F1-score** on both train and test sets.  
   - Display a **classification report** and **confusion matrix** to inspect per-class results.  
   - Plot a bar chart comparing train vs. test metrics for quick visual assessment.

4. **ROC & AUC (Multi-class)**  
   - Binarize labels and compute **one-vs-rest** ROC curves for each class.  
   - Calculate area under the curve (AUC) and plot micro-average ROC.

5. **Learning Curve**  
   - Use `learning_curve` to measure training and cross-validation accuracy as a function of training set size.  
   - Visualize how model performance scales with more data.

6. **Decision Boundary Visualization**  
   - Apply **PCA** to project high-dimensional test samples into 2D.  
   - Train a binary logistic model on the two principal components to show a single-class decision boundary.  
   - Train a multinomial logistic model on the same 2D data to display multi-class decision regions.  
   - Overlay sample points and contour plots to illustrate how the classifier separates classes.


**SVM**

In [None]:
# Read features and integer labels from the working directory
print("loading data...")
X_train = np.loadtxt('train/X_train.txt')
X_test = np.loadtxt('test/X_test.txt')

y_train = np.loadtxt('train/y_train.txt').astype(int)
y_test = np.loadtxt('test/y_test.txt').astype(int)

# Activity names: second space-separated field of each line
with open('activity_labels.txt', 'r') as f:
    labels = [line.strip().split(' ')[1] for line in f]

print(f"Data loading completed: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples")
print(f"Number of features: {X_train.shape[1]}")
print(f"Activity classes: {len(set(y_train))}")

# Baseline: RBF-kernel SVM with fixed hyperparameters
print("Training basic SVM model...")
svm_model = SVC(kernel='rbf', gamma='scale', C=1.0)
svm_model.fit(X_train, y_train)

# Predictions on both splits
y_pred = svm_model.predict(X_test)
y_pred_train = svm_model.predict(X_train)

# Support-weighted precision / recall / F1, evaluated in that order
_weighted_scorers = (precision_score, recall_score, f1_score)

train_accuracy = accuracy_score(y_train, y_pred_train)
train_precision, train_recall, train_f1 = [
    scorer(y_train, y_pred_train, average='weighted') for scorer in _weighted_scorers
]

test_accuracy = accuracy_score(y_test, y_pred)
test_precision, test_recall, test_f1 = [
    scorer(y_test, y_pred, average='weighted') for scorer in _weighted_scorers
]

# Report train and test side by side for every metric
print(f"SVM Train Accuracy: {train_accuracy:.4f}")
print(f"SVM Test Accuracy: {test_accuracy:.4f}")
print(f"SVM Train Precision: {train_precision:.4f}")
print(f"SVM Test Precision: {test_precision:.4f}")
print(f"SVM Train Recall: {train_recall:.4f}")
print(f"SVM Test Recall: {test_recall:.4f}")
print(f"SVM Train F1 Score: {train_f1:.4f}")
print(f"SVM Test F1 Score: {test_f1:.4f}")



# Using grid search for hyperparameter tuning
print("\nPerforming hyperparameter tuning...")
# gamma only affects the RBF kernel, so it is swept for 'rbf' alone. Sweeping
# it for 'linear' as well would refit identical models once per gamma value.
param_grid = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto', 0.1, 0.01]},
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
]

# Using a subset of training data for hyperparameter search to reduce
# computation time. The generator is seeded so the subset - and with it the
# selected hyperparameters - is the same on every run.
rng = np.random.default_rng(42)
sample_size = min(5000, X_train.shape[0])
indices = rng.choice(X_train.shape[0], size=sample_size, replace=False)
X_sample = X_train[indices]
y_sample = y_train[indices]

grid_search = GridSearchCV(SVC(), param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_sample, y_sample)

print(f"\nBest parameters: {grid_search.best_params_}")

# Train final model using best parameters, on the full training set
final_model = SVC(**grid_search.best_params_)
final_model.fit(X_train, y_train)

# Get predictions for both training and test sets
y_pred_final_train = final_model.predict(X_train)
y_pred_final = final_model.predict(X_test)

# Calculate performance metrics for training set
final_train_accuracy = accuracy_score(y_train, y_pred_final_train)
final_train_precision = precision_score(y_train, y_pred_final_train, average='weighted')
final_train_recall = recall_score(y_train, y_pred_final_train, average='weighted')
final_train_f1 = f1_score(y_train, y_pred_final_train, average='weighted')

# Calculate performance metrics for test set
final_test_accuracy = accuracy_score(y_test, y_pred_final)
final_test_precision = precision_score(y_test, y_pred_final, average='weighted')
final_test_recall = recall_score(y_test, y_pred_final, average='weighted')
final_test_f1 = f1_score(y_test, y_pred_final, average='weighted')

# Print results for optimized model
print("\nOptimized SVM Model Performance Metrics:")
print(f"Optimized SVM Train Accuracy: {final_train_accuracy:.4f}")
print(f"Optimized SVM Test Accuracy: {final_test_accuracy:.4f}")
print(f"Optimized SVM Train Precision: {final_train_precision:.4f}")
print(f"Optimized SVM Test Precision: {final_test_precision:.4f}")
print(f"Optimized SVM Train Recall: {final_train_recall:.4f}")
print(f"Optimized SVM Test Recall: {final_test_recall:.4f}")
print(f"Optimized SVM Train F1 Score: {final_train_f1:.4f}")
print(f"Optimized SVM Test F1 Score: {final_test_f1:.4f}")

print("\nClassification report for each activity after optimization:")
print(classification_report(y_test, y_pred_final, target_names=labels))


# Visualization section: figures are written to PNG files and then closed

print("\nGenerating visualizations...")

# Grouped bars comparing the tuned model on the train and test splits
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
train_scores = [final_train_accuracy, final_train_precision, final_train_recall, final_train_f1]
test_scores = [final_test_accuracy, final_test_precision, final_test_recall, final_test_f1]

x = np.arange(len(metrics))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x - width/2, train_scores, width, label='Train', color='skyblue')
ax.bar(x + width/2, test_scores, width, label='Test', color='orange')

ax.set_ylim(0, 1)
ax.set_ylabel('Score')
ax.set_title('SVM Performance Metrics')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend(loc='lower right')
ax.grid(axis='y', alpha=0.3)

# Numeric value just above each bar
for pos, (train_val, test_val) in enumerate(zip(train_scores, test_scores)):
    ax.text(pos - width/2, train_val + 0.01, f'{train_val:.4f}', ha='center', va='bottom', fontsize=9)
    ax.text(pos + width/2, test_val + 0.01, f'{test_val:.4f}', ha='center', va='bottom', fontsize=9)

fig.tight_layout()
fig.savefig('svm_performance_metrics.png')
plt.close(fig)
print("Performance metrics visualization saved as 'svm_performance_metrics.png'")

# 1. Confusion matrix heatmap with activity names on both axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    confusion_matrix(y_test, y_pred_final),
    annot=True, fmt='d', cmap='Blues',
    xticklabels=labels, yticklabels=labels,
    ax=ax,
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix - SVM Classification')
fig.tight_layout()
fig.savefig('svm_confusion_matrix.png')
plt.close(fig)
print("Confusion matrix visualization saved as 'svm_confusion_matrix.png'")

# 1.5 ROC and AUC Visualization
print("\nGenerating ROC and AUC visualization...")

# Binarize against final_model.classes_ so that column i of the label matrix
# and column i of the score matrix refer to the same class, even if the class
# ids are not exactly 1..n.
class_ids = final_model.classes_
n_classes = len(class_ids)

y_test_bin = label_binarize(y_test, classes=class_ids)
# Signed decision scores, used here as the ranking score for each class
y_score = final_model.decision_function(X_test)
fpr = dict()
tpr = dict()
roc_auc = dict()
plt.figure(figsize=(8, 6))
# One-vs-rest curve per class
for i, class_id in enumerate(class_ids):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    plt.plot(fpr[i], tpr[i], lw=2, label=f'Class {class_id} (AUC = {roc_auc[i]:.2f})')
# Micro-average: pool every (sample, class) decision into one binary problem
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
plt.plot(fpr["micro"], tpr["micro"],
         label=f'micro-average ROC curve (AUC = {roc_auc["micro"]:.2f})',
         color='deeppink', linestyle=':', linewidth=4)
# Diagonal reference line for comparison
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi-class ROC Curve (One-vs-Rest, SVM)')
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig('svm_multiclass_roc_auc.png')
plt.close()
print("Multi-class ROC and AUC visualization saved as 'svm_multiclass_roc_auc.png'")


# 2. Learning curve: accuracy of the tuned SVM for growing training subsets,
#    computed on the tuning sample with 3-fold CV
print("\nGenerating learning curves (this may take a few minutes)...")
train_sizes, train_scores, valid_scores = learning_curve(
    final_model, X_sample, y_sample,
    cv=3, train_sizes=np.linspace(0.1, 1.0, 5), n_jobs=-1,
)
# Average over the CV folds (one row per training size)
lc_train_mean = train_scores.mean(axis=1)
lc_valid_mean = valid_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_sizes, lc_train_mean, 'o-', color='blue', label='Training Accuracy')
ax.plot(train_sizes, lc_valid_mean, 'o-', color='red', label='Validation Accuracy')
ax.set_title('Learning Curves - SVM Model')
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Accuracy')
ax.legend(loc='best')
ax.grid(True)
fig.savefig('svm_learning_curve.png')
plt.close(fig)
print("Learning curve visualization saved as 'svm_learning_curve.png'")

# 3. Validation curve: accuracy as C varies over five log-spaced values, with
#    kernel and gamma held at the tuned model's settings
print("\nGenerating validation curve for C parameter...")
c_range = np.logspace(-3, 3, 5)
train_scores, valid_scores = validation_curve(
    SVC(kernel=final_model.kernel, gamma=final_model.gamma),
    X_sample, y_sample,
    param_name='C', param_range=c_range, cv=3, n_jobs=-1,
)
# Average over the CV folds (one row per C value)
vc_train_mean = train_scores.mean(axis=1)
vc_valid_mean = valid_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.semilogx(c_range, vc_train_mean, 'o-', color='blue', label='Training Accuracy')
ax.semilogx(c_range, vc_valid_mean, 'o-', color='red', label='Validation Accuracy')
ax.set_title('Validation Curve - C Parameter (SVM)')
ax.set_xlabel('C Parameter (log scale)')
ax.set_ylabel('Accuracy')
ax.legend(loc='best')
ax.grid(True)
fig.savefig('svm_validation_curve.png')
plt.close(fig)
print("Validation curve visualization saved as 'svm_validation_curve.png'")

# 4. Dimensionality Reduction Visualization
print("\nPerforming dimensionality reduction for visualization...")

# 4.1 Using PCA for dimensionality reduction (seeded in case a randomized
#     solver is selected)
pca = PCA(n_components=2, random_state=42)
X_test_pca = pca.fit_transform(X_test)

plt.figure(figsize=(12, 10))
scatter = plt.scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_pred_final, cmap='viridis', alpha=0.8, edgecolors='w', s=50)
plt.title('PCA Visualization of SVM Classification Results')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Activity Class')
# legend_elements() yields one handle per class that is actually present in
# the predictions. Look the names up by class id so the legend stays correct
# even if some class is never predicted.
handles, _ = scatter.legend_elements()
shown_names = [labels[class_id - 1] for class_id in np.unique(y_pred_final)]
plt.legend(handles=handles, labels=shown_names, title="Activities", loc="best")
plt.grid(alpha=0.3)
plt.savefig('svm_pca_visualization.png')
plt.close()
print("PCA visualization saved as 'svm_pca_visualization.png'")

# 4.2 Using t-SNE for better cluster visualization (takes longer but often gives better separation)
print("\nPerforming t-SNE dimensionality reduction (this may take a few minutes)...")
# Using a subset of test data for t-SNE to speed up computation; the seeded
# generator makes the subset reproducible.
rng = np.random.default_rng(42)
sample_size_tsne = min(1000, X_test.shape[0])
indices_tsne = rng.choice(X_test.shape[0], size=sample_size_tsne, replace=False)
X_test_sample = X_test[indices_tsne]
y_test_sample = y_test[indices_tsne]
y_pred_sample = y_pred_final[indices_tsne]

# Perform t-SNE. The iteration count is left at the library default: the
# keyword for it was renamed between scikit-learn releases (n_iter ->
# max_iter), so passing neither keeps this cell running on old and new
# versions alike.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_test_tsne = tsne.fit_transform(X_test_sample)

plt.figure(figsize=(12, 10))
scatter = plt.scatter(X_test_tsne[:, 0], X_test_tsne[:, 1], c=y_pred_sample, cmap='viridis', alpha=0.8, edgecolors='w', s=50)
plt.title('t-SNE Visualization of SVM Classification Results')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.colorbar(label='Activity Class')
handles, _ = scatter.legend_elements()
shown_names = [labels[class_id - 1] for class_id in np.unique(y_pred_sample)]
plt.legend(handles=handles, labels=shown_names, title="Activities", loc="best")
plt.grid(alpha=0.3)
plt.savefig('svm_tsne_visualization.png')
plt.close()
print("t-SNE visualization saved as 'svm_tsne_visualization.png'")

# 5. Decision Boundary Visualization (using PCA-reduced data)
print("\nGenerating SVM decision boundary visualization...")

# Use PCA to reduce to 2D for visualization (using a smaller subset for
# speed). The seeded generator makes the subset, and so the figure,
# reproducible.
rng = np.random.default_rng(42)
sample_size_db = min(1000, X_test.shape[0])
indices_db = rng.choice(X_test.shape[0], size=sample_size_db, replace=False)
X_test_subset = X_test[indices_db]
y_test_subset = y_test[indices_db]

# Apply PCA
pca_boundary = PCA(n_components=2)
X_test_pca_boundary = pca_boundary.fit_transform(X_test_subset)

# Binary demo problem for the plot: the most frequent class in the subset
# versus all other classes.
unique_classes, class_counts = np.unique(y_test_subset, return_counts=True)
most_common_class = unique_classes[np.argmax(class_counts)]
y_binary = (y_test_subset == most_common_class).astype(int)

# Train an SVM with the tuned hyperparameters on the 2D projection
# (illustration only; it is not the model evaluated above)
svm_2d = SVC(kernel=final_model.kernel, C=final_model.C, gamma=final_model.gamma)
svm_2d.fit(X_test_pca_boundary, y_binary)

# Create a mesh grid covering the projected points plus a margin of 1
h = 0.02  # Step size in the mesh
x_min, x_max = X_test_pca_boundary[:, 0].min() - 1, X_test_pca_boundary[:, 0].max() + 1
y_min, y_max = X_test_pca_boundary[:, 1].min() - 1, X_test_pca_boundary[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Predict class for each point in the mesh
Z = svm_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot the decision regions
plt.figure(figsize=(12, 10))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.coolwarm)

# Plot class samples
plt.scatter(X_test_pca_boundary[y_binary == 0, 0], X_test_pca_boundary[y_binary == 0, 1],
           c='blue', label=f'Other Classes', s=50, alpha=0.8, edgecolors='k')
plt.scatter(X_test_pca_boundary[y_binary == 1, 0], X_test_pca_boundary[y_binary == 1, 1],
           c='red', label=f'Class {most_common_class} ({labels[most_common_class-1]})', s=50, alpha=0.8, edgecolors='k')

# Ring the support vectors of the 2D model
plt.scatter(svm_2d.support_vectors_[:, 0], svm_2d.support_vectors_[:, 1],
           s=100, facecolors='none', edgecolors='green', label='Support Vectors')

plt.title('SVM Decision Boundary (PCA-Reduced Data)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='best')
plt.grid(alpha=0.3)
plt.savefig('svm_decision_boundary.png')
plt.close()
print("SVM decision boundary visualization saved as 'svm_decision_boundary.png'")

print("\nAll visualizations completed!")



1. **Data Loading**  
   - Read training and test feature matrices and labels from text files.  
   - Load activity names for later visualization.

2. **Baseline SVM Training**  
   - Fit an RBF‐kernel SVM (`C=1.0`, `gamma='scale'`) on the full training set.  
   - Evaluate accuracy, precision, recall, F1 on both train and test splits.

3. **Hyperparameter Tuning**  
   - Sample a subset of training data (≤5000 samples).  
   - Run `GridSearchCV` over `C`, `gamma`, and `kernel` to find the best SVM configuration.

4. **Final Model & Evaluation**  
   - Retrain SVM with optimal parameters on the entire training set.  
   - Compute detailed metrics and print a per‐class classification report.

5. **Visualization Suite**  
   - **Performance Metrics:** Bar charts comparing train vs. test scores.  
   - **Confusion Matrix:** Heatmap of true vs. predicted classes.  
   - **ROC/AUC:** One‐vs‐rest ROC curves and micro‐average AUC.  
   - **Learning & Validation Curves:** Show impact of sample size and hyperparameter `C`.  
   - **Dimensionality Reduction:**  
     - PCA and t-SNE scatter plots colored by predicted labels.  
     - 2D decision boundary plots (binary demo with PCA).  


**Random Forest**

In [None]:

# Random Forest on the HAR data: load, standardize, tune with GridSearchCV,
# evaluate on the test split, and save a confusion matrix and ROC curves
# under figures/.

# `os` is not part of the notebook's top-level import cell, so it is imported
# here to keep this cell runnable after Restart Kernel -> Run All.
import os

# Create the output folder for figures (no error if it already exists).
os.makedirs("figures", exist_ok=True)

# ---------------- Data loading and label mapping ---------------- #
# Column 1 of features.txt holds the feature names.
features = pd.read_csv("features.txt", sep=r'\s+', header=None)
feature_names = features[1].values

# activity_labels.txt: column 0 (used as index) is the activity id,
# column 1 the activity name -> dict {id: name}.
activity_labels = pd.read_csv("activity_labels.txt", sep=r'\s+', header=None, index_col=0)
activity_map = activity_labels[1].to_dict()

# Labels are mapped from integer ids to activity names, so every downstream
# report and plot uses the readable names.
X_train = pd.read_csv("train/X_train.txt", sep=r'\s+', header=None)
y_train = pd.read_csv("train/y_train.txt", header=None)[0].map(activity_map)

X_test = pd.read_csv("test/X_test.txt", sep=r'\s+', header=None)
y_test = pd.read_csv("test/y_test.txt", header=None)[0].map(activity_map)

X_train.columns = feature_names
X_test.columns = feature_names

# ---------------- Standardization ---------------- #
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=feature_names)

# ---------------- Grid search ---------------- #
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy'
)

grid_search.fit(X_train_scaled, y_train)

# ---------------- Best model evaluation ---------------- #
best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

y_pred = best_rf.predict(X_test_scaled)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ---------------- Confusion matrix ---------------- #
# Rows/columns follow best_rf.classes_ so the tick labels match the matrix.
cm = confusion_matrix(y_test, y_pred, labels=best_rf.classes_)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=best_rf.classes_,
            yticklabels=best_rf.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix (Tuned Random Forest)')
plt.tight_layout()
plt.savefig("figures/rf_confusion_matrix.png")
plt.close()

# ---------------- Multi-class ROC curves ---------------- #
# The columns of predict_proba are ordered like best_rf.classes_, so the class
# order for binarization is taken from the fitted model itself; column i of
# y_test_bin and column i of y_score are then guaranteed to describe the same class.
class_names = list(best_rf.classes_)
y_test_bin = label_binarize(y_test, classes=class_names)
y_score = best_rf.predict_proba(X_test_scaled)
n_classes = y_test_bin.shape[1]

# One-vs-rest ROC curve and AUC per class.
fpr, tpr, roc_auc = dict(), dict(), dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Macro-average: interpolate every per-class curve onto the union of all
# false-positive-rate points, then average the true-positive rates.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot
plt.figure(figsize=(8, 6))
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f"{class_names[i]} (AUC = {roc_auc[i]:.2f})")

plt.plot(fpr["macro"], tpr["macro"],
         label=f"Macro-average (AUC = {roc_auc['macro']:.2f})",
         color='navy', linestyle='--', linewidth=2)

# Diagonal reference line (chance level).
plt.plot([0, 1], [0, 1], 'k--', lw=1)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve (Random Forest)")
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig("figures/rf_roc_auc.png")
plt.close()



1. **Data Loading & Label Mapping**
   - Read feature names and activity labels.
   - Load training and test sets (`X_train`, `X_test`) and map integer labels to activity names (`y_train`, `y_test`).

2. **Feature Standardization**
   - Apply `StandardScaler` to normalize all features to zero mean and unit variance.

3. **Hyperparameter Tuning**
   - Define a grid over:
     - Number of trees (`n_estimators`: 100, 200)
     - Maximum tree depth (`max_depth`: 10, 20, None)
     - Minimum samples to split (`min_samples_split`: 2, 5)
     - Feature subset strategy (`max_features`: `'sqrt'`, `'log2'`)
   - Run 3-fold `GridSearchCV` (accuracy scoring) to find the best Random Forest settings.

4. **Model Evaluation**
   - Fit the best estimator on the scaled training data.
   - Report:
     - **Best CV Accuracy** and parameter combination.
     - **Test Accuracy** and detailed **classification report** (precision, recall, F1).

5. **Results Visualization**
   - **Confusion Matrix**: heatmap of true vs. predicted labels.
   - **Multi-class ROC Curves**:
     - Binarize labels.
     - Compute per-class and macro-average ROC AUC.
     - Plot all ROC curves with AUC annotations.

All figures are saved under the `figures/` directory.


## Open exploration


**Ridge**

In [None]:
# Ridge classifier on the HAR data: load, standardize, tune alpha with
# GridSearchCV, report train/test accuracy and show a confusion matrix.

# ====================== 1. Data loading ======================
# Column 1 of features.txt holds the feature names.
features = pd.read_csv("features.txt", sep=r'\s+', header=None)
feature_names = features[1].values

# {activity id: activity name}, built from activity_labels.txt.
activity_labels = pd.read_csv("activity_labels.txt", sep=r'\s+', header=None, index_col=0)
activity_map = activity_labels[1].to_dict()

# Feature matrices first, then the label vectors mapped to activity names.
X_train = pd.read_csv("train/X_train.txt", sep=r'\s+', header=None)
X_test = pd.read_csv("test/X_test.txt", sep=r'\s+', header=None)
X_train.columns = feature_names
X_test.columns = feature_names

y_train = pd.read_csv("train/y_train.txt", header=None)[0].map(activity_map)
y_test = pd.read_csv("test/y_test.txt", header=None)[0].map(activity_map)

# ====================== 2. Feature standardization ======================
# Fit on the training split only, then apply the same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ====================== 3. Hyperparameter grid search ======================
param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}
ridge_clf = RidgeClassifier()

grid = GridSearchCV(
    estimator=ridge_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train_scaled, y_train)

# ====================== 4. Best parameter and validation score ======================
print("Best alpha:", grid.best_params_['alpha'])
print("Best Cross-Validated Accuracy:", grid.best_score_)

# ====================== 5. Evaluate the best model ======================
best_ridge = grid.best_estimator_

y_train_pred = best_ridge.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", train_accuracy)

y_pred = best_ridge.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
print("\nTest Accuracy (RidgeClassifier):", test_accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ====================== 6. Confusion matrix ======================
# Rows/columns follow the fitted model's class order so tick labels line up.
class_order = best_ridge.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_order)

fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_order,
    yticklabels=class_order,
    ax=ax_cm,
)
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("True")
ax_cm.set_title("Ridge Classifier Confusion Matrix (alpha tuned)")
fig_cm.tight_layout()
plt.show()


1. **Data Loading & Label Mapping**  
   - Read feature names and activity labels.  
   - Load training and test feature matrices (`X_train`, `X_test`) and map integer labels to activity names (`y_train`, `y_test`).

2. **Feature Standardization**  
   - Apply `StandardScaler` to normalize features to zero mean and unit variance.

3. **Hyperparameter Grid Search**  
   - Define a grid over regularization strengths (`alpha` ∈ {0.01, 0.1, 1, 10, 100}).  
   - Use 3-fold `GridSearchCV` (accuracy scoring) to find the best `alpha` for `RidgeClassifier`.

4. **Model Evaluation**  
   - Report the best `alpha` and cross-validated accuracy.  
   - Retrain on the full training set and compute accuracy on both training and test sets.  
   - Print a detailed classification report (precision, recall, F1-score).

5. **Results Visualization**  
   - Plot a confusion matrix heatmap for the test predictions to illustrate per-class performance.









**KNN**

In [None]:
# KNN with univariate feature selection on the HAR data: load, standardize,
# tune (k features, n_neighbors) with GridSearchCV, evaluate on the test split
# and show a confusion matrix.

# `Pipeline` is not imported by the notebook's top-level import cell, so it is
# imported here to keep this cell runnable after Restart Kernel -> Run All.
from sklearn.pipeline import Pipeline

# ====================== 1. Data loading ======================
# Column 1 of features.txt holds the feature names.
features = pd.read_csv("features.txt", sep=r'\s+', header=None)
feature_names = features[1].values

# {activity id: activity name}, built from activity_labels.txt.
activity_labels = pd.read_csv("activity_labels.txt", sep=r'\s+', header=None, index_col=0)
activity_map = activity_labels[1].to_dict()

# Labels are mapped from integer ids to activity names.
X_train = pd.read_csv("train/X_train.txt", sep=r'\s+', header=None)
y_train = pd.read_csv("train/y_train.txt", header=None)[0].map(activity_map)

X_test = pd.read_csv("test/X_test.txt", sep=r'\s+', header=None)
y_test = pd.read_csv("test/y_test.txt", header=None)[0].map(activity_map)

X_train.columns = feature_names
X_test.columns = feature_names

# ====================== 2. Feature standardization ======================
# NOTE(review): the scaler is fitted on the whole training split before the
# cross-validation folds are formed, so it sits outside the tuned pipeline.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ====================== 3. Build the pipeline ======================
# Feature selection is part of the pipeline, so it is re-fitted inside each CV fold.
pipe = Pipeline([
    ('select', SelectKBest(score_func=f_classif)),
    ('knn', KNeighborsClassifier())
])

# ====================== 4. Parameter search space ======================
# Keys use the "<step name>__<parameter>" convention to address pipeline steps.
param_grid = {
    'select__k': [50, 100, 150, 200, 300],
    'knn__n_neighbors': [3, 5, 7, 9]
}

# ====================== 5. Grid search (GridSearchCV) ======================
grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid.fit(X_train_scaled, y_train)

# ====================== 6. Report the best parameters ======================
print("Best Parameters:", grid.best_params_)
print("Best Cross-Validated Accuracy:", grid.best_score_)

# ====================== 7. Test-set evaluation ======================
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test_scaled)

print("\nTest Accuracy with Best KNN Model:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ====================== 8. Confusion matrix ======================
# Class order is read once from the fitted KNN step and reused for the matrix
# and both axes, so tick labels always line up with the rows/columns.
knn_classes = best_model.named_steps['knn'].classes_
cm = confusion_matrix(y_test, y_pred, labels=knn_classes)

plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=knn_classes,
            yticklabels=knn_classes)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("KNN Confusion Matrix (GridSearchCV Optimized)")
plt.tight_layout()
plt.show()


1. **Data Loading & Label Mapping**  
   - Read feature names and activity labels.  
   - Load training/testing feature matrices and map integer labels to activity names.

2. **Feature Scaling**  
   - Apply `StandardScaler` to normalize all features to zero mean and unit variance.

3. **Pipeline Construction**  
   - Chain `SelectKBest(f_classif)` for univariate feature selection and `KNeighborsClassifier` for classification.

4. **Hyperparameter Grid**  
   - Tune number of features (`k` ∈ [50,100,150,200,300]) and KNN neighbors (`n_neighbors` ∈ [3,5,7,9]) via 3-fold `GridSearchCV`.

5. **Model Training & Selection**  
   - Fit the pipeline on scaled training data, select the best parameter combination by accuracy.

6. **Evaluation**  
   - Report cross-validated and test accuracy, print a detailed classification report.

7. **Results Visualization**  
   - Plot a confusion matrix heatmap of test predictions to inspect per-class performance.