In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import pickle
import mlflow
import pathlib
import dagshub
from hyperopt.pyll import scope


data_path = '../data/Landmines.csv' 
df = pd.read_csv(data_path)

numeric_df = df.select_dtypes(include='number')

X = numeric_df.drop('M', axis=1)
y = numeric_df['M']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

with dagshub.dagshub_logger(hparams_path='params.yml') as logger:

    models = {
        #'Logistic Regression': {
         #   'model': LogisticRegression(),
          #  'space': {
           #     'C': hp.loguniform('C', -4, 4),
            #    'penalty': hp.choice('penalty', ['l1', 'l2']),
             #   'solver': hp.choice('solver', ['liblinear', 'saga'])
           # }
       # },
        'Gradient Boosting': {
            'model': GradientBoostingClassifier(),
            'space': {
            'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1000, 1)),
            'learning_rate': hp.loguniform('learning_rate', -3, 0),
            'max_depth': scope.int(hp.quniform('max_depth', 3, 10, 1)),  # This line is corrected
            'subsample': hp.uniform('subsample', 0.5, 1),
            }
        },
        'KNN': {
            'model': KNeighborsClassifier(),
            'space': {
                'n_neighbors': hp.quniform('n_neighbors', 3, 15, 1),
                'weights': hp.choice('weights', ['uniform', 'distance']),
                'p': hp.choice('p', [1, 2])
            }
        }
    }

    for model_name, model_info in models.items():
        def objective(params):
            with mlflow.start_run(nested=True):
                mlflow.set_tag("model", model_name)
                mlflow.log_params(params)

                pipeline = Pipeline([
                    ('scaler', StandardScaler()),
                    ('model', model_info['model'].set_params(**params))
                ])

                pipeline.fit(X_train, y_train)
                y_pred = pipeline.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)
                mlflow.log_metric("accuracy", accuracy)

            return {'loss': -accuracy, 'status': STATUS_OK}

        with mlflow.start_run(run_name=f"{model_name} Hyperparameter Tuning", nested=True):
            best_params = fmin(
                fn=objective,
                space=model_info['space'],
                algo=tpe.suggest,
                max_evals=10,
                trials=Trials()
            )

            best_model = Pipeline([
                ('scaler', StandardScaler()),
                ('model', model_info['model'].set_params(**best_params))
            ])
            best_model.fit(X_train, y_train)

            y_pred = best_model.predict(X_test)

            metrics = {
                'Accuracy': accuracy_score(y_test, y_pred),
                'Missclassification rate': 1 - accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred, average='weighted'),
                'Recall': recall_score(y_test, y_pred, average='weighted'),
                'F1-score': f1_score(y_test, y_pred, average='weighted')
            }

            for metric_name, metric_value in metrics.items():
                logger.log_metrics({f"{model_name} - {metric_name}": metric_value})
                mlflow.log_metric(metric_name, metric_value)

            with open(f'models/{model_name}_model.pkl', 'wb') as f:
                pickle.dump(best_model, f)


100%|██████████| 10/10 [00:32<00:00,  3.26s/trial, best loss: -0.5882352941176471]


InvalidParameterError: The 'max_depth' parameter of GradientBoostingClassifier must be an int in the range [1, inf) or None. Got np.float64(10.0) instead.

In [16]:
import pathlib
import pickle
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import dagshub
import mlflow

with dagshub.dagshub_logger() as logger:

    models = {
        'Logistic Regression': {
            'model': LogisticRegression(),
            'space': {
                'C': hp.loguniform('C', -4, 4),
                'penalty': hp.choice('penalty', ['l1', 'l2']),
                'solver': hp.choice('solver', ['liblinear', 'saga'])
            },
            'param_mappings': {
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            }
        },
        'Gradient Boosting': {
            'model': GradientBoostingClassifier(),
            'space': {
                'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
                'learning_rate': hp.loguniform('learning_rate', -3, 0),
                'max_depth': hp.quniform('max_depth', 3, 10, 1),
                'subsample': hp.uniform('subsample', 0.5, 1),
            }
        },
        'KNN': {
            'model': KNeighborsClassifier(),
            'space': {
                'n_neighbors': hp.quniform('n_neighbors', 3, 15, 1),
                'weights': hp.choice('weights', ['uniform', 'distance']),
                'p': hp.choice('p', [1, 2])
            },
            'param_mappings': {
                'weights': ['uniform', 'distance'],
                'p': [1, 2]
            }
        }
    }

    for model_name, model_info in models.items():
        def objective(params):
            with mlflow.start_run(nested=True):
                # Convert float parameters to integer if needed
                params = {k: int(v) if isinstance(v, float) and k in ['n_estimators', 'max_depth', 'n_neighbors'] else v
                          for k, v in params.items()}
                
                mlflow.set_tag("model", model_name)
                mlflow.log_params(params)

                pipeline = Pipeline([
                    ('scaler', StandardScaler()),
                    ('model', model_info['model'].set_params(**params))
                ])

                pipeline.fit(X_train, y_train)
                y_pred = pipeline.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)
                mlflow.log_metric("accuracy", accuracy)

            return {'loss': -accuracy, 'status': STATUS_OK}

        with mlflow.start_run(run_name=f"{model_name} Hyperparameter Tuning", nested=True):
            best_params = fmin(
                fn=objective,
                space=model_info['space'],
                algo=tpe.suggest,
                max_evals=10,
                trials=Trials()
            )

            # Map categorical indexes back to actual values if mappings are provided
            if 'param_mappings' in model_info:
                for param, choices in model_info['param_mappings'].items():
                    if param in best_params:
                        best_params[param] = choices[best_params[param]]

            # Convert to integer if necessary
            best_params = {k: int(v) if isinstance(v, float) and k in ['n_estimators', 'max_depth', 'n_neighbors'] else v
                           for k, v in best_params.items()}

            best_model = Pipeline([
                ('scaler', StandardScaler()),
                ('model', model_info['model'].set_params(**best_params))
            ])
            best_model.fit(X_train, y_train)

            y_pred = best_model.predict(X_test)

            metrics = {
                'Accuracy': accuracy_score(y_test, y_pred),
                'Missclassification rate': 1 - accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred, average='weighted'),
                'Recall': recall_score(y_test, y_pred, average='weighted'),
                'F1-score': f1_score(y_test, y_pred, average='weighted')
            }

            # Log metrics to both DAGsHub and MLflow
            logger.log_metrics({f"{model_name} - {metric_name}": metric_value for metric_name, metric_value in metrics.items()})
            for metric_name, metric_value in metrics.items():
                mlflow.log_metric(metric_name, metric_value)



            # Save the model
            pathlib.Path("models").mkdir(parents=True, exist_ok=True)
            with open(f'models/{model_name}_model.pkl', 'wb') as f:
                pickle.dump(best_model, f)

100%|██████████| 10/10 [00:00<00:00, 10.86trial/s, best loss: -0.5147058823529411]
100%|██████████| 10/10 [00:29<00:00,  3.00s/trial, best loss: -0.5735294117647058]
100%|██████████| 10/10 [00:00<00:00, 13.09trial/s, best loss: -0.5]
