Grid Search
Exhaustively searches all possible combinations of hyperparameters specified in a parameter grid.
Deterministic: Given the same data, parameter grid, and cross-validation splits (and a deterministic estimator), it always finds the same best parameters.
Can be computationally expensive, especially with a large parameter grid or dataset.
Best when the search space is relatively small.

Random Search
Randomly samples hyperparameter combinations from a distribution over possible values.
Non-deterministic: Can find different best parameters in different runs, unless the random seed (`random_state`) is fixed.
Usually more efficient than grid search, particularly when the search space is large.
More likely to find good hyperparameter combinations in less time, especially if some hyperparameters are less important than others.


In [2]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Data: Iris features/labels with a fixed 80/20 train/test split
# ---------------------------------------------------------------------------
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One shared KNN estimator; both searches clone it internally for each fit
knn = KNeighborsClassifier()

# ---------------------------------------------------------------------------
# Exhaustive search: every combination of the values below is cross-validated
# ---------------------------------------------------------------------------
param_grid = dict(
    n_neighbors=np.arange(1, 31),      # k = 1 .. 30
    weights=['uniform', 'distance'],   # neighbour weighting scheme
    p=[1, 2],                          # 1 = Manhattan, 2 = Euclidean
)

grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # run the fits on all available cores
)
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy of the winning combination
print("GridSearchCV best parameters:", grid_search.best_params_)
print("GridSearchCV best accuracy:", grid_search.best_score_)

# ---------------------------------------------------------------------------
# Randomized search: same candidate values, only n_iter combinations sampled
# ---------------------------------------------------------------------------
param_dist = dict(
    n_neighbors=np.arange(1, 31),
    weights=['uniform', 'distance'],
    p=[1, 2],
)

random_search = RandomizedSearchCV(
    estimator=knn,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,  # fixes which combinations get sampled
)
random_search.fit(X_train, y_train)

print("\nRandomizedSearchCV best parameters:", random_search.best_params_)
print("RandomizedSearchCV best accuracy:", random_search.best_score_)

# ---------------------------------------------------------------------------
# Held-out check: score() uses the estimator refit with the best parameters
# ---------------------------------------------------------------------------
print("\nGridSearchCV test accuracy:", grid_search.score(X_test, y_test))
print("RandomizedSearchCV test accuracy:", random_search.score(X_test, y_test))


GridSearchCV best parameters: {'n_neighbors': np.int64(14), 'p': 1, 'weights': 'uniform'}
GridSearchCV best accuracy: 0.9666666666666666

RandomizedSearchCV best parameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': np.int64(3)}
RandomizedSearchCV best accuracy: 0.9583333333333334

GridSearchCV test accuracy: 1.0
RandomizedSearchCV test accuracy: 1.0


In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
# Load the MNIST sample CSV (the file has no header row)
MNIST_CSV = './sample_data/mnist_train_small.csv'
try:
    data = pd.read_csv(MNIST_CSV, header=None)
except FileNotFoundError as err:
    # Stop here with a message that names the file actually being read.
    # Continuing with data = None would only fail on the next statement
    # with an unrelated AttributeError on data.iloc.
    raise FileNotFoundError(
        f"'{MNIST_CSV}' not found. Please upload your data file."
    ) from err

# Split into features and target
X = data.iloc[:, 1:]  # All columns except first
y = data.iloc[:, 0]   # First column is target

# Split data (80% train / 20% test, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a decision tree with default hyperparameters (baseline)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out test split
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


Accuracy: 0.8323


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import pandas as pd

# Read the MNIST sample CSV; the file has no header row
data = pd.read_csv('./sample_data/mnist_train_small.csv', header=None)

# Column 0 holds the digit label, columns 1-784 hold the pixel values
y = data.iloc[:, 0]
X = data.iloc[:, 1:]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate values for the exhaustive search (2 * 4 * 3 * 3 = 72 combinations)
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Cross-validate every combination with 5 folds, using all available cores
dt = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_dt,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search_dt.fit(X_train, y_train)

# Report the winning combination, its mean CV accuracy, and the held-out accuracy
best_cv_accuracy = grid_search_dt.best_score_
test_accuracy = grid_search_dt.score(X_test, y_test)

print("\nDecision Tree GridSearchCV results:")
print("Best parameters:", grid_search_dt.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(best_cv_accuracy))
print("Test set accuracy: {:.4f}".format(test_accuracy))

In [None]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Read the MNIST sample CSV; the file has no header row
data = pd.read_csv('./sample_data/mnist_train_small.csv', header=None)

# Column 0 holds the digit label, columns 1-784 hold the pixel values
y = data.iloc[:, 0]
X = data.iloc[:, 1:]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate values to sample from; far too many combinations to try exhaustively
param_dist_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, *np.arange(5, 101, 5)],        # unlimited, or 5..100 step 5
    min_samples_split=np.arange(2, 21),             # 2..20
    min_samples_leaf=np.arange(1, 11),              # 1..10
    max_features=[None, 'sqrt', 'log2', 0.5, 0.8],
    max_leaf_nodes=[None, 50, 100, 200, 500],
    min_impurity_decrease=[0.0, 0.01, 0.05, 0.1],
)

# Sample 100 combinations and score each with 5-fold cross-validation
dt = DecisionTreeClassifier(random_state=42)
search_options = dict(
    param_distributions=param_dist_dt,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,        # all available cores
    random_state=42,  # reproducible sampling
    verbose=1,        # progress output
)
random_search_dt = RandomizedSearchCV(dt, **search_options)
random_search_dt.fit(X_train, y_train)

# Report the winning combination, its mean CV accuracy, and the held-out accuracy
best_cv_accuracy = random_search_dt.best_score_
test_accuracy = random_search_dt.score(X_test, y_test)

print("\nDecision Tree RandomizedSearchCV results:")
print("Best parameters:", random_search_dt.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(best_cv_accuracy))
print("Test set accuracy: {:.4f}".format(test_accuracy))

# Per-pixel importances of the tree refit with the best parameters
best_dt = random_search_dt.best_estimator_
feature_importances = best_dt.feature_importances_

import matplotlib.pyplot as plt

# One bar per input column, in column order
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(range(len(feature_importances)), feature_importances)
ax.set_title("Feature Importances from Decision Tree")
ax.set_xlabel("Pixel Index")
ax.set_ylabel("Importance Score")
plt.show()

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import time

# Load the MNIST data (no headers)
try:
    data = pd.read_csv('./sample_data/mnist_train_small.csv', header=None)
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() shuts down the
    # kernel (discarding all state) rather than reporting a normal cell error.
    raise FileNotFoundError(
        "'mnist_train_small.csv' not found. Please check the file path."
    ) from err

# Split into features (pixels) and target (digit labels)
X = data.iloc[:, 1:].values  # All columns except first (pixel values)
y = data.iloc[:, 0].values   # First column is the digit label (0-9)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data (important for SVM). The scaler is fitted on the training
# split only and then applied to the test split, so no test statistics leak in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train SVM classifier with timing
print("Training SVM...")
# perf_counter() is a monotonic clock intended for measuring elapsed time
start_time = time.perf_counter()

# Linear-kernel SVC used as a simple baseline
clf = svm.SVC(kernel='linear', C=1.0, random_state=42)
clf.fit(X_train_scaled, y_train)

training_time = time.perf_counter() - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Make predictions and evaluate on the held-out test split
y_pred = clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print("\nSVM Classifier Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Number of support vectors per class: {clf.n_support_}")

In [None]:
import time

import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the MNIST data (no headers)
try:
    data = pd.read_csv('./sample_data/mnist_train_small.csv', header=None)
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() shuts down the
    # kernel (discarding all state) rather than reporting a normal cell error.
    raise FileNotFoundError(
        "'mnist_train_small.csv' not found. Please check the file path."
    ) from err

# Split into features (pixels) and target (digit labels)
X = data.iloc[:, 1:]  # All columns except first (pixel values)
y = data.iloc[:, 0]   # First column is the digit label (0-9)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with StandardScaler and SVC. Scaling inside the pipeline
# means the scaler is re-fitted on the training part of every CV fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', svm.SVC(random_state=42))
])

# Define the parameter grid for GridSearchCV.
# One sub-grid per kernel: gamma has no effect on the linear kernel, so
# crossing it with 'linear' would only repeat identical (and slow) fits.
# This yields 6 linear + 24 rbf = 30 candidates.
param_grid = [
    {
        'svm__kernel': ['linear'],
        'svm__C': [0.1, 1, 10],
        'svm__class_weight': [None, 'balanced'],
    },
    {
        'svm__kernel': ['rbf'],
        'svm__C': [0.1, 1, 10],
        'svm__gamma': ['scale', 'auto', 0.01, 0.1],
        'svm__class_weight': [None, 'balanced'],
    },
]

print("Starting GridSearchCV...")
# perf_counter() is a monotonic clock intended for measuring elapsed time
start_time = time.perf_counter()

# Perform GridSearchCV; n_jobs is capped (instead of -1) to limit memory use
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,  # 3 folds to keep the search time manageable
    scoring='accuracy',
    n_jobs=4,
    verbose=2  # Shows progress
)

grid_search.fit(X_train, y_train)

print(f"GridSearchCV completed in {time.perf_counter()-start_time:.2f} seconds")

# Print results
print("\nSVM GridSearchCV Results:")
print("Best parameters:", grid_search.best_params_)
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
print(f"Test set accuracy: {grid_search.score(X_test, y_test):.4f}")

# Get the best model (pipeline refit on the full training split)
best_svm = grid_search.best_estimator_

# Additional evaluation on the held-out test split
y_pred = best_svm.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

# Load the MNIST data (no headers)
try:
    data = pd.read_csv('./sample_data/mnist_train_small.csv', header=None)
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() shuts down the
    # kernel (discarding all state) rather than reporting a normal cell error.
    raise FileNotFoundError(
        "'mnist_train_small.csv' not found. Please check the file path."
    ) from err

# Split into features (pixels) and target (digit labels)
X = data.iloc[:, 1:]  # All columns except first (pixel values)
y = data.iloc[:, 0]   # First column is the digit label (0-9)

# Split into training and test sets; stratify keeps the label proportions
# the same in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y)

# Create a pipeline with StandardScaler and SVC. Scaling inside the pipeline
# means the scaler is re-fitted on the training part of every CV fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', svm.SVC(random_state=42))
])

# Define the parameter distribution for RandomizedSearchCV.
# Some settings only matter for certain kernels (see the per-line notes), so
# two sampled candidates can differ on paper yet train the same model.
param_dist_svm = {
    'svm__C': np.logspace(-3, 3, 7),  # 7 values from 0.001 to 1000
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__gamma': ['scale', 'auto'] + list(np.logspace(-3, 1, 5)),  # 7 options total
    'svm__degree': [2, 3, 4],  # Only used with poly kernel
    'svm__coef0': np.linspace(-1, 1, 5),  # Only used with poly/sigmoid
    'svm__class_weight': [None, 'balanced']
}

print("Starting RandomizedSearchCV...")
# perf_counter() is a monotonic clock intended for measuring elapsed time
start_time = time.perf_counter()

# Perform RandomizedSearchCV
random_search_svm = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist_svm,
    n_iter=20,  # Number of parameter settings sampled
    cv=3,       # 3 folds to keep the search time manageable
    scoring='accuracy',
    n_jobs=-1,  # Use all available cores
    random_state=42,  # Reproducible sampling
    verbose=2   # Show progress
)

random_search_svm.fit(X_train, y_train)

print(f"\nRandomizedSearchCV completed in {(time.perf_counter()-start_time)/60:.2f} minutes")

# Print results
print("\nSVM RandomizedSearchCV Results:")
print("Best parameters:", random_search_svm.best_params_)
print(f"Best cross-validation accuracy: {random_search_svm.best_score_:.4f}")
print(f"Test set accuracy: {random_search_svm.score(X_test, y_test):.4f}")

# Get the best model (pipeline refit on the full training split)
best_svm = random_search_svm.best_estimator_

# Additional evaluation on the held-out test split
y_pred = best_svm.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))