In [1]:
import json
import time
from pathlib import Path

import pandas as pd

from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Models import
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, VotingClassifier

In [2]:
def create_pipeline(model, model_name, standardize=False, with_pca=False, n_pca_components=10):
    """
    Build a scikit-learn Pipeline that ends with `model`.

    Optional preprocessing steps are placed in front of the estimator, in this
    order: a StandardScaler named 'scaler' (when `standardize` is truthy), then
    a PCA named 'pca' keeping `n_pca_components` components (when `with_pca` is
    truthy). The estimator itself is registered under `model_name`, so its
    parameters are addressed as '<model_name>__<param>'.
    """
    # (enabled flag, step name, factory) for every optional preprocessing stage;
    # factories are only called for the stages that are switched on.
    optional_stages = (
        (standardize, 'scaler', lambda: StandardScaler()),
        (with_pca, 'pca', lambda: PCA(n_components=n_pca_components)),
    )
    stages = [(label, build()) for enabled, label, build in optional_stages if enabled]
    return Pipeline(stages + [(model_name, model)])

In [3]:
def _json_default(value):
    """Convert objects exposing `.tolist()` (e.g. NumPy scalars/arrays) to plain Python for JSON."""
    to_list = getattr(value, "tolist", None)
    if callable(to_list):
        return to_list()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def grid_search_tuning(model_name, config, X_train, y_train, scoring="f1", cv_folds=5, n_jobs=-1,
                       results_dir="Tuning_params"):
    """
    Perform hyperparameter tuning for a model pipeline using GridSearchCV.

    Builds the pipeline with `create_pipeline`, applies the fixed parameters,
    runs an exhaustive grid search and writes the outcome to `results_dir`.

    Parameters:
    - model_name (str): Name of the model. It is used as the pipeline step name
                        (so grid keys look like "<model_name>__<param>") and as
                        the prefix of the output files.
    - config (dict): Model configuration with the keys
        - "model": the estimator instance (required),
        - "grid_search_params": dict or list of dicts passed as `param_grid` (required),
        - "preprocess": optional dict with "standardize", "pca", "pca_components",
        - "fixed_params": optional dict applied with `set_params` before the search.
    - X_train, y_train: Training data and labels.
    - scoring (str): Metric to optimize (default: "f1").
    - cv_folds (int): Number of cross-validation folds (default: 5).
    - n_jobs (int): Number of parallel jobs (-1 uses all cores).
    - results_dir (str): Directory receiving the output files (default:
                         "Tuning_params"); created if it does not exist.

    Returns:
    - grid_search (GridSearchCV): Fitted GridSearchCV object.
    - best_params (dict): Copy of the best hyperparameters plus the metadata
                          keys "MODEL_NAME" and "TIME_ELAPSED_MIN".

    Saves:
    - Full tuning results to "<results_dir>/<model_name>_results.csv".
    - Best hyperparameters to "<results_dir>/<model_name>_best_params.json".
    """
    preprocess = config.get("preprocess") or {}
    pipeline = create_pipeline(
        model=config["model"],
        model_name=model_name,
        standardize=preprocess.get("standardize", False),
        with_pca=preprocess.get("pca", False),
        n_pca_components=preprocess.get("pca_components", 10),
    )
    # "fixed_params" is optional: a model may have nothing to pin.
    fixed_params = config.get("fixed_params") or {}
    if fixed_params:
        pipeline.set_params(**fixed_params)

    grid_search = GridSearchCV(
        pipeline,
        param_grid=config["grid_search_params"],
        scoring=scoring,
        cv=cv_folds,
        n_jobs=n_jobs,
        verbose=2,
    )
    # perf_counter is monotonic, so the duration is immune to wall-clock adjustments.
    start_time = time.perf_counter()
    grid_search.fit(X_train, y_train)
    elapsed_time = (time.perf_counter() - start_time) / 60

    # Work on a copy: adding metadata to grid_search.best_params_ itself would
    # leave non-parameter keys inside the fitted search object.
    best_params = dict(grid_search.best_params_)
    best_params["MODEL_NAME"] = model_name
    best_params["TIME_ELAPSED_MIN"] = elapsed_time

    # Save results
    out_dir = Path(results_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(grid_search.cv_results_).to_csv(out_dir / f"{model_name}_results.csv", index=False)
    with open(out_dir / f"{model_name}_best_params.json", "w") as f:
        # Grids built from NumPy ranges yield NumPy scalars, which json cannot
        # encode natively; convert them instead of failing after a long search.
        json.dump(best_params, f, indent=4, default=_json_default)

    print(f"Model {model_name} tuned in {elapsed_time:.2f} minutes.")
    return grid_search, best_params

In [4]:
def evaluate_optimized_models(results_dir, models_config, X_test, y_test, X_train=None, y_train=None):
    """
    Evaluates optimized models on the test set using the best parameters from tuning.

    For each model in `models_config` the function loads the best hyperparameters
    saved as JSON, rebuilds the pipeline, applies the model's fixed parameters and
    the tuned parameters, optionally fits it, and scores it on the test data.

    Parameters:
    - results_dir (str): Directory containing "<model_name>_best_params.json" files.
    - models_config (dict): Maps model name -> configuration dict with "model"
                            (estimator instance) and optionally "preprocess"
                            ("standardize", "pca", "pca_components") and
                            "fixed_params".
    - X_test (array-like): Features of the test dataset.
    - y_test (array-like): True labels of the test dataset.
    - X_train, y_train (array-like, optional): Training data. When both are
      given, every rebuilt pipeline is fitted on them before predicting. When
      omitted, prediction is attempted on the pipeline as built; steps that are
      not already fitted will typically make `predict` fail, and that failure is
      recorded under "error" for the model.

    Returns:
    - results (list): One dictionary per model, containing either
        - "model_name": Name of the evaluated model.
        - "classification_report": Output of `classification_report(..., output_dict=True)`.
        - "auc_score": ROC AUC if the pipeline exposes `predict_proba`; otherwise None.
          For two probability columns the second column is scored; for more
          columns a one-vs-rest AUC is computed.
        - "confusion_matrix": Confusion matrix as nested lists.
      or, when anything fails for that model,
        - "model_name" and "error" (the exception message).

    Example Output:
    [
        {
            "model_name": "LogisticRegression",
            "classification_report": {...},
            "auc_score": 0.92,
            "confusion_matrix": [[85, 15], [8, 112]],
        },
        {
            "model_name": "LDA",
            "error": "Pipeline fitting failed due to incompatible parameter grid."
        }
    ]

    Side Effects:
    - Prints progress for each model evaluation.
    - Prints any errors encountered during evaluation.
    - `set_params`/`fit` act on the estimator instance stored in `models_config`.
    """
    # Bookkeeping entries written next to the hyperparameters in the JSON file;
    # they are not pipeline parameters and would make set_params raise.
    metadata_keys = {"MODEL_NAME", "TIME_ELAPSED_MIN"}
    results = []

    for model_name, config in models_config.items():
        try:
            # Load best parameters
            with open(f"{results_dir}/{model_name}_best_params.json", "r") as f:
                saved_params = json.load(f)
            best_params = {key: value for key, value in saved_params.items() if key not in metadata_keys}

            # Create pipeline
            preprocess = config.get("preprocess") or {}
            pipeline = create_pipeline(
                model=config["model"],
                model_name=model_name,
                standardize=preprocess.get("standardize", False),
                with_pca=preprocess.get("pca", False),
                n_pca_components=preprocess.get("pca_components", 10),
            )
            # Re-apply the fixed parameters first so the tuned values are
            # layered on top of the same base settings used during the search.
            pipeline.set_params(**(config.get("fixed_params") or {}))
            pipeline.set_params(**best_params)

            if X_train is not None and y_train is not None:
                pipeline.fit(X_train, y_train)

            # Evaluate
            y_pred = pipeline.predict(X_test)
            metrics = classification_report(y_test, y_pred, output_dict=True)
            auc_score = None
            if hasattr(pipeline, "predict_proba"):
                probabilities = pipeline.predict_proba(X_test)
                if probabilities.shape[1] == 2:
                    # Binary case: score the probability of the second class.
                    auc_score = roc_auc_score(y_test, probabilities[:, 1])
                else:
                    auc_score = roc_auc_score(y_test, probabilities, multi_class="ovr")
            confusion = confusion_matrix(y_test, y_pred).tolist()

            results.append({
                "model_name": model_name,
                "classification_report": metrics,
                "auc_score": auc_score,
                "confusion_matrix": confusion,
            })
            print(f"Evaluated model: {model_name}")
        except Exception as e:
            # Deliberate best-effort: one failing model must not abort the rest.
            print(f"Error with model {model_name}: {e}")
            results.append({"model_name": model_name, "error": str(e)})

    return results

In [5]:
def mypredict(X_train=None, y_train=None, fixed_params=None):
    """
    Reads test.csv.gz, predicts class labels using the best model,
    and writes the predictions to predictions.txt.

    The pipeline is rebuilt from scratch (standardization + LogisticRegression),
    configured with the tuned hyperparameters stored in
    "Tuning_params/LogisticRegression_best_params.json", fitted on the supplied
    training data and then used to predict.

    Parameters:
    - X_train, y_train (array-like): Training data used to fit the rebuilt
      pipeline. Required: the estimator is created fresh inside this function,
      so there is nothing fitted to predict with otherwise.
    - fixed_params (dict, optional): Pipeline parameters applied before the
      tuned ones. NOTE(review): these should presumably match the
      "fixed_params" used while tuning this model -- confirm with the tuning
      configuration.

    Raises:
    - ValueError: if `X_train` or `y_train` is missing.

    Side Effects:
    - Writes one predicted label per line to "predictions.txt".
    - Prints a confirmation message.
    """
    if X_train is None or y_train is None:
        raise ValueError("mypredict requires X_train and y_train to fit the pipeline before predicting.")

    test_data = pd.read_csv("test.csv.gz")
    # Drops the first column -- presumably the label/id column; confirm against the file layout.
    X_test = test_data.iloc[:, 1:].values
    model_name = "LogisticRegression"  # Example; replace with your best model
    with open(f"Tuning_params/{model_name}_best_params.json", "r") as f:
        saved_params = json.load(f)
    # The JSON also carries bookkeeping entries that are not pipeline
    # parameters; passing them to set_params would raise.
    metadata_keys = {"MODEL_NAME", "TIME_ELAPSED_MIN"}
    best_params = {key: value for key, value in saved_params.items() if key not in metadata_keys}

    # Configure pipeline
    pipeline = create_pipeline(
        model=LogisticRegression(),  # Replace with our chosen model
        model_name=model_name,
        standardize=True,
        with_pca=False
    )
    pipeline.set_params(**(fixed_params or {}))
    pipeline.set_params(**best_params)
    pipeline.fit(X_train, y_train)

    # Predict and save
    predictions = pipeline.predict(X_test)
    with open("predictions.txt", "w") as f:
        f.writelines(f"{label}\n" for label in predictions)
    print("Predictions saved to predictions.txt")

## Loading and Preparing the data

In [None]:
# Load the raw training table.
data = pd.read_csv("train.csv")  # Replace with your training dataset path

# Prepare features and labels
label_col = "label"  # Replace with your actual label column name
X = data.drop(columns=[label_col]).values
# Encode the labels as integers 0..n_classes-1.
y = LabelEncoder().fit_transform(data[label_col].values)

# Split into training and testing sets
random_state = 42
test_size = 0.2
# Stratify on y so train and test keep the same class proportions; the models
# are tuned on F1, which is sensitive to a skewed class ratio in either split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, stratify=y
)

## Define model configurations

Create a dictionary for all the models you want to tune and evaluate, based on your requirements.

In [None]:
# Each key is the model name. It is passed to create_pipeline as the name of
# the estimator step, so every estimator parameter below must be prefixed with
# exactly that key: "<model name>__<parameter>".
models_config = {
    "LDA": {
        "model": LinearDiscriminantAnalysis(),
        "preprocess": {"standardize": True, "pca": False},
        "fixed_params": {"LDA__tol": 0.0001},
        # A list of grids: shrinkage is only searched together with the "lsqr" solver.
        "grid_search_params": [
            {"LDA__solver": ["svd"]},
            {"LDA__solver": ["lsqr"], "LDA__shrinkage": [0.0, 0.1, 0.5, "auto"]},
        ],
    },
    "LogisticRegression": {
        "model": LogisticRegression(),
        "preprocess": {"standardize": True, "pca": False},
        "fixed_params": {"LogisticRegression__max_iter": 100, "LogisticRegression__solver": "saga"},
        # l1_ratio only has an effect with the elasticnet penalty, so it gets its
        # own grid instead of being crossed with "l2" (which would only repeat
        # identical fits).
        "grid_search_params": [
            {
                "LogisticRegression__penalty": ["l2"],
                "LogisticRegression__C": [0.01, 0.1, 1, 10],
            },
            {
                "LogisticRegression__penalty": ["elasticnet"],
                "LogisticRegression__C": [0.01, 0.1, 1, 10],
                "LogisticRegression__l1_ratio": [0.1, 0.5, 0.9],
            },
        ],
    },
    # Add other models similarly...
}

## Perform Hyperparameter tuning

Use the grid_search_tuning function to tune hyperparameters for each model.

In [None]:
# Ensure the directory for saving results exists
Path("Tuning_params").mkdir(parents=True, exist_ok=True)

# Tune each model. Every fitted search is kept under its model name; the loop
# variables alone would only retain the last model's result.
tuning_results = {}
for model_name, config in models_config.items():
    grid_search, best_params = grid_search_tuning(
        model_name=model_name,
        config=config,
        X_train=X_train,
        y_train=y_train,
        scoring="f1",  # Optimize for F1 score
        cv_folds=5,    # 5-fold cross-validation
        n_jobs=-1      # Use all available cores
    )
    tuning_results[model_name] = {"grid_search": grid_search, "best_params": best_params}

## Evaluate optimized models

Once tuning is complete, evaluate the best models on the test set using `evaluate_optimized_models`.

In [None]:
# Score every configured model on the held-out split.
# NOTE(review): the pipelines are rebuilt from models_config inside
# evaluate_optimized_models; confirm they are fitted before predict is called,
# otherwise the entries below will carry an "error" instead of metrics.
results = evaluate_optimized_models(
    y_test=y_test,
    X_test=X_test,
    models_config=models_config,
    results_dir="Tuning_params",  # Directory containing best parameter JSON files
)

# Print evaluation results
for result in results:
    print(f"Model: {result['model_name']}")
    if "error" in result:
        # Failed models only carry the error message.
        print(f"Error: {result['error']}")
        continue
    print("Classification Report:")
    print(result["classification_report"])
    print("AUC Score:", result["auc_score"])
    print("Confusion Matrix:", result["confusion_matrix"])

## Generate predictions for a test dataset

Use the mypredict function to predict and save labels for a test dataset.

## Expected Output

1. Hyperparameter tuning:
   - JSON files containing the best parameters for each model (e.g., `Tuning_params/LDA_best_params.json`).
   - CSV files with detailed cross-validation results for each model.

2. Evaluation:
   - Printed classification reports, confusion matrices, and AUC scores.
   - Results as a list of dictionaries.

3. Predictions:
   - A text file (`predictions.txt`) containing one label per line.