In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import key scikit-learn modules
import sklearn

# Notebook-wide configuration: pick the plotting theme, then seed NumPy's
# legacy global random number generator with a fixed value.
plt.style.use('seaborn-v0_8-whitegrid')
np.random.seed(42)


In [None]:
# Report which scikit-learn release this notebook is running against
print(f"scikit-learn version: {sklearn.__version__}")

# Sub-packages listed for the reader as the library's main building blocks
major_modules = [
    'sklearn.datasets',
    'sklearn.preprocessing',
    'sklearn.model_selection',
    'sklearn.feature_extraction',
    'sklearn.feature_selection',
    'sklearn.decomposition',
    'sklearn.linear_model',
    'sklearn.ensemble',
    'sklearn.tree',
    'sklearn.neighbors',
    'sklearn.svm',
    'sklearn.cluster',
    'sklearn.metrics',
    'sklearn.pipeline',
]

# One bullet per module, each on its own line
print("\nMajor scikit-learn modules:")
print(*(f"- {module}" for module in major_modules), sep="\n")


In [None]:
# End-to-end tour of the scikit-learn API on the Iris data:
# load -> split -> transform -> fit -> predict -> evaluate
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Step 1 - load the dataset as a feature matrix and a label vector
iris = load_iris()
X, y = iris.data, iris.target

print(f"Dataset shape: {X.shape}")
print(f"Features: {iris.feature_names}")
print(f"Target classes: {iris.target_names}")
print(f"Number of samples per class: {np.bincount(y)}")

# Step 2 - hold out 30% of the rows for testing, with a fixed split seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 3 - transformer: scaling statistics are learned from the training
# split only and then reused, unchanged, on the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4 - estimator: fit() returns the estimator, so build and train in one go
model = LogisticRegression(max_iter=200).fit(X_train_scaled, y_train)

# Step 5 - predictor: hard labels and per-class probabilities
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)

# Step 6 - score the predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))


In [None]:
# Project the scaled features onto two principal components so that a
# classifier's decision regions can be drawn in a plane
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)  # components learned on the training split
X_test_pca = pca.transform(X_test_scaled)        # test split reuses those components

# A separate logistic regression, fitted in the 2-D PCA space
model_pca = LogisticRegression(max_iter=200).fit(X_train_pca, y_train)
# Create a mesh grid to visualize decision boundaries
def plot_decision_boundaries(X, y, model, ax=None, h=0.02):
    """Draw a fitted classifier's decision regions with the samples on top.

    Parameters
    ----------
    X : array
        Feature matrix indexed as ``X[:, 0]`` and ``X[:, 1]``; only these two
        columns are used, as the horizontal and vertical coordinates.
    y : array-like
        Per-sample values used to colour the scatter points.
    model : object with a ``predict`` method
        Called once on a two-column array holding every mesh point; its
        output is reshaped to the mesh and drawn as filled contours.
    ax : axes object, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).
    h : float, default 0.02
        Step size of the mesh along both coordinates. Smaller values give
        smoother boundaries at the cost of more ``predict`` work.

    Returns
    -------
    The axes object that was drawn on.

    Raises
    ------
    ValueError
        If ``h`` is not strictly positive.
    """
    if h <= 0:
        raise ValueError(f"h must be positive, got {h!r}")
    if ax is None:
        ax = plt.gca()

    # Mesh covering the data range, padded by one unit on every side
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Classify every mesh point, then fold the flat predictions back into the mesh shape
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Translucent regions underneath, actual samples on top
    ax.contourf(xx, yy, Z, alpha=0.3)
    ax.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k', s=50, cmap='viridis')

    return ax

# Side-by-side panels: the same 2-D model drawn over the training split
# and over the test split
plt.figure(figsize=(12, 5))

panels = [
    (X_train_pca, y_train, 'Decision Boundaries (Training Data)'),
    (X_test_pca, y_test, 'Decision Boundaries (Test Data)'),
]
for position, (X_panel, y_panel, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)  # makes this panel the current axes
    plot_decision_boundaries(X_panel, y_panel, model_pca)
    plt.title(panel_title)
    plt.xlabel('PCA Feature 1')
    plt.ylabel('PCA Feature 2')

plt.tight_layout()
plt.show()


In [None]:
# Explore available datasets in scikit-learn
from sklearn import datasets

# 1. Toy datasets (small, for quick testing)
print("Toy datasets:")
print("- Iris dataset (classification)")
print("- Digits dataset (classification)")
print("- Wine dataset (classification)")
print("- Breast cancer dataset (classification)")
# Linnerud is listed instead of Boston housing: the Boston loader
# (load_boston) was removed from scikit-learn in release 1.2.
print("- Linnerud dataset (multi-output regression)")
print("- Diabetes dataset (regression)")

# 2. Sample generators (synthetic data)
print("\nSample generators:")
print("- make_classification: Generate random n-class classification problem")
print("- make_regression: Generate random regression problem")
print("- make_blobs: Generate isotropic Gaussian blobs for clustering")
print("- make_circles/make_moons: Generate 2D classification datasets")

# 3. Real-world datasets (fetch from remote)
print("\nReal-world datasets:")
print("- fetch_california_housing: California housing regression dataset")
print("- fetch_covtype: Forest cover type classification dataset")
print("- fetch_20newsgroups: 20 newsgroups text dataset")
print("- fetch_olivetti_faces: Olivetti faces dataset")

# Let's explore a few examples
# Example 1: Digits dataset (for classification)
digits = datasets.load_digits()
# Unpack the shape so the message reports the row count, not the whole tuple
n_digit_samples, n_digit_features = digits.data.shape
print(f"\nDigits dataset: {n_digit_samples} samples with {n_digit_features} features")
print(f"Target classes: {np.unique(digits.target)}")

# Example 2: Synthetic classification data
# 20 features in total: 10 informative, 5 redundant, the remainder left to
# the generator's defaults; fixed seed so the data is the same on every run
from sklearn.datasets import make_classification
X_synth, y_synth = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_classes=3,
    random_state=42
)
print(f"\nSynthetic classification data: {X_synth.shape}")
print(f"Class distribution: {np.bincount(y_synth)}")


In [None]:
# Visualize the digits dataset: the first ten images in a single row,
# each titled with its label
plt.figure(figsize=(14, 4))
for i in range(10):
    plt.subplot(1, 10, i+1)
    plt.imshow(digits.images[i], cmap='binary')
    plt.title(f"Digit: {digits.target[i]}")
    plt.axis('off')
plt.tight_layout()
plt.show()

# Visualize synthetic data using PCA
from sklearn.decomposition import PCA

# Reduce to 2 dimensions for visualization
pca = PCA(n_components=2)
X_synth_2d = pca.fit_transform(X_synth)

# Plot the synthetic data, one scatter series per class.
# The classes are read from the labels themselves rather than hard-coded,
# so the plot stays complete if n_classes is changed where the data is generated.
plt.figure(figsize=(10, 6))
for i in np.unique(y_synth):
    class_mask = y_synth == i
    plt.scatter(X_synth_2d[class_mask, 0], X_synth_2d[class_mask, 1], label=f'Class {i}', alpha=0.7)
plt.title('PCA Visualization of Synthetic Classification Data')
plt.xlabel('PCA Feature 1')
plt.ylabel('PCA Feature 2')
plt.legend()
plt.show()


In [None]:
# Let's compare different classification algorithms on the Iris dataset
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Set up the models.
# The tree, forest and neural network are randomised learners; each gets an
# explicit random_state so the scores do not depend on how much global
# random state earlier cells consumed, and re-running this cell repeats them.
models = {
    'Logistic Regression': LogisticRegression(max_iter=200),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Neural Network': MLPClassifier(max_iter=500, random_state=42)
}

# Use the original Iris dataset (same 70/30 split and seed as before)
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42
)

# Standardize the data: statistics come from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train and evaluate each model, collecting test accuracy by model name
results = {}
for name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

    print(f"{name} - Accuracy: {accuracy:.4f}")

# Visualize results
plt.figure(figsize=(12, 6))
plt.bar(results.keys(), results.values())
plt.title('Model Comparison on Iris Dataset')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.ylim(0.8, 1.05)  # Zoomed-in axis: a bar below 0.8 would not be visible; adjust as needed
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# 5.1 Cross-Validation
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold

# Random Forest is the estimator studied throughout this section
rf = RandomForestClassifier(random_state=42)

# (a) Integer cv: five folds, splitter chosen by scikit-learn
cv_scores = cross_val_score(rf, X_train_scaled, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print(f"Mean CV score: {cv_scores.mean():.4f}")
print(f"Standard deviation: {cv_scores.std():.4f}")

# (b) Explicit shuffled K-fold splitter
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf_scores = cross_val_score(rf, X_train_scaled, y_train, cv=kf)
print("\nKFold cross-validation scores:", kf_scores)
print(f"Mean KFold CV score: {kf_scores.mean():.4f}")

# (c) Explicit shuffled stratified splitter (keeps class proportions per fold)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skf_scores = cross_val_score(rf, X_train_scaled, y_train, cv=skf)
print("\nStratifiedKFold cross-validation scores:", skf_scores)
print(f"Mean StratifiedKFold CV score: {skf_scores.mean():.4f}")

# 5.2 Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Candidate values per Random Forest hyperparameter (3 x 3 x 3 combinations)
param_grid = dict(
    n_estimators=[10, 50, 100],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search: every combination is scored by 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
)
grid_search.fit(X_train_scaled, y_train)

print("\nGrid Search Results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Score the winning configuration on the held-out test split
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test_scaled, y_test)
print(f"Test accuracy with best model: {test_accuracy:.4f}")


In [None]:
# Visualize grid search results
# Load the fitted search's cv_results_ into a DataFrame so it can be
# grouped and plotted below
grid_results = pd.DataFrame(grid_search.cv_results_)

# Function to plot parameter comparison
def plot_grid_search_param(results, param_name, scoring='mean_test_score', ylim=(0.9, 1.0)):
    """Bar-chart the average score obtained for each value of one grid parameter.

    Parameters
    ----------
    results : pandas.DataFrame
        Table with a ``param_<param_name>`` column and a ``scoring`` column
        (for example a DataFrame built from a grid search's ``cv_results_``).
    param_name : str
        Parameter to summarise; its values are read from ``param_<param_name>``.
    scoring : str, default 'mean_test_score'
        Column averaged within each parameter value.
    ylim : (low, high) tuple or None, default (0.9, 1.0)
        Vertical axis limits. Pass ``None`` to let matplotlib choose them,
        e.g. when scores fall below the default window and would be clipped.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    def _value_label(value):
        # None (or the NaN it may turn into inside a DataFrame) is a missing
        # value to pandas; give it an explicit text label instead.
        if value is None or (isinstance(value, float) and value != value):
            return 'None'
        return str(value)

    # Group on text labels rather than the raw column: groupby silently drops
    # missing keys by default, which removed the bar for e.g. max_depth=None.
    # sort=False keeps the values in order of first appearance instead of
    # sorting the labels alphabetically ('10', '100', '50').
    value_labels = results[f'param_{param_name}'].map(_value_label)
    grouped = results.groupby(value_labels, sort=False)[scoring].mean()

    plt.figure(figsize=(13, 5))

    # Plot bar chart
    sns.barplot(x=list(grouped.index), y=grouped.values)
    plt.title(f'Grid Search Scores for Different {param_name} Values')
    plt.xlabel(param_name)
    plt.ylabel('Mean Test Score')
    if ylim is not None:
        plt.ylim(*ylim)
    plt.tight_layout()
    plt.show()

# One chart per tuned hyperparameter (iterating a dict yields its keys)
for param in param_grid:
    plot_grid_search_param(grid_results, param)

# 5.3 Model Evaluation Metrics
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

# Hard predictions and per-class probabilities from the tuned model
y_pred = best_model.predict(X_test_scaled)
y_prob = best_model.predict_proba(X_test_scaled)

# --- Confusion matrix, drawn as an annotated heatmap ---
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=iris.target_names,
    yticklabels=iris.target_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.show()

# --- Per-class precision / recall / F1 summary ---
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# --- ROC curves, one-vs-rest: one curve per class ---
plt.figure(figsize=(10, 8))
for i, class_name in enumerate(iris.target_names):
    # Class i is the positive label, every other class is negative;
    # column i of y_prob supplies the score for class i
    fpr, tpr, _ = roc_curve(y_test == i, y_prob[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'{class_name} (AUC = {roc_auc:.2f})')

# Dashed diagonal for reference
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves (One-vs-Rest)')
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()


In [None]:
# Creating a complete pipeline
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),               # Step 1: Scale the data
    ('feature_selection', SelectKBest(f_classif, k=3)),  # Step 2: Select top k features
    ('pca', PCA(n_components=2)),               # Step 3: Reduce dimensions with PCA
    ('classifier', RandomForestClassifier(random_state=42))  # Step 4: Train a classifier
])

# Train the pipeline on the raw (unscaled) training split; scaling happens inside
pipeline.fit(X_train, y_train)

# Evaluate on test data
pipeline_accuracy = pipeline.score(X_test, y_test)
print(f"Pipeline accuracy: {pipeline_accuracy:.4f}")

# Make predictions (the pipeline handles all preprocessing steps)
y_pipeline_pred = pipeline.predict(X_test)

# Inspect the pipeline steps
print("\nPipeline steps:")
for step_idx, (name, transformer) in enumerate(pipeline.steps):
    print(f"Step {step_idx+1}: {name} - {transformer.__class__.__name__}")

    # Print details for each step if available
    if name == 'feature_selection':
        # iris.feature_names is a plain Python list, which cannot be indexed
        # with the boolean mask from get_support(); pair each name with its
        # mask entry and keep the selected ones.
        selected_features = [
            feature
            for feature, is_selected in zip(iris.feature_names, transformer.get_support())
            if is_selected
        ]
        print(f"  Selected features: {', '.join(selected_features)}")
    elif name == 'classifier':
        print(f"  Number of trees: {transformer.n_estimators}")
        # The classifier sits after the PCA step, so these importances refer
        # to the PCA components it was trained on, not to the original columns.
        print(f"  Feature importances: {transformer.feature_importances_}")
        
# Using ColumnTransformer with Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Let's create an example dataset with mixed data types.
# A dedicated, locally seeded generator makes this cell produce the same
# data every time it runs, regardless of how much global random state
# earlier cells consumed or how often the cell is re-executed.
rng = np.random.default_rng(42)
X_mixed = pd.DataFrame({
    'feature1': rng.standard_normal(100),  # numerical
    'feature2': rng.standard_normal(100),  # numerical
    'category': rng.choice(['A', 'B', 'C'], size=100)  # categorical
})
# Binary target (upper bound is exclusive). The labels are drawn independently
# of the features, so the accuracy below demonstrates the mechanics only.
y_mixed = rng.integers(0, 2, size=100)

# Define preprocessing for numerical and categorical features
numerical_features = ['feature1', 'feature2']
categorical_features = ['category']

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer: each sub-pipeline is applied
# only to the columns listed next to it
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create the full pipeline: preprocessing followed by the classifier
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split the mixed data (80% train / 20% test, fixed split seed)
X_mixed_train, X_mixed_test, y_mixed_train, y_mixed_test = train_test_split(
    X_mixed, y_mixed, test_size=0.2, random_state=42
)

# Train and evaluate
full_pipeline.fit(X_mixed_train, y_mixed_train)
mixed_accuracy = full_pipeline.score(X_mixed_test, y_mixed_test)
print(f"\nMixed data pipeline accuracy: {mixed_accuracy:.4f}")
