In [118]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score
import numpy as np
import mlflow
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from mlflow.models.signature import infer_signature
from mlflow import MlflowClient

In [119]:
train_path = '../data/processed/train.csv'
test_path = '../data/processed/test.csv'
metrics_path ='../reports/metrics.txt'

In [120]:
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return transform_categorical(X)


def transform_categorical(column):
    pm10 = column['pm10']
    pm10 = pm10.str.replace('<', '')
    nan_mask = pm10.isna()
    pm10nan = pm10[nan_mask]

    pm10 = pm10.str.strip().dropna().loc[lambda x: x.str.len() > 0]
    pm10 = pm10.astype('float')

    pm10nan[:] = pm10.mean()
    pm10 = pd.concat([pm10, pm10nan], axis=0)

    column['pm10'] = pm10.astype('float')
    return column

In [121]:
def transform_test(test_path, categorical_transform, numerical_transform):
    csv = pd.read_csv(test_path, encoding='utf_8')
    test = pd.DataFrame(csv)
    print('Data read')

    cat_features = test.select_dtypes(include=['object']).columns.tolist()
    num_features = test.select_dtypes(
        include=['float64', 'int64']).columns.tolist()

    test_preprocessor = ColumnTransformer([
        ('pm10_transform', categorical_transform, cat_features),
        ('normal_transform', numerical_transform, num_features)
    ])

    arr = test_preprocessor.fit_transform(test)

    test = pd.DataFrame(
        arr, columns=['temp', 'hum', 'percp', 'wspeed', 'pm10'])
    return test.astype('float')

In [122]:
MLFLOW_TRACKING_URI = "https://dagshub.com/JanaJankovic/air_pollution.mlflow"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("airpollution-mlruns")
mlflow.autolog()


2023/04/07 19:59:16 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [123]:
csv = pd.read_csv(train_path, encoding='utf_8')
train = pd.DataFrame(csv)

x_train = train.drop('pm10', axis=1)
y_train = pd.DataFrame(train['pm10'])

num_features = ['temp', 'hum', 'percp', 'wspeed']

categorical_transform = Pipeline([
    ('transformer', CategoricalTransformer())
])

arr = categorical_transform.fit_transform(y_train)
y_train = pd.DataFrame(arr, columns=['pm10'])

numerical_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='mean'))
])

preprocessor = ColumnTransformer([
    ('numerical_transform', numerical_transform,
        num_features),
])

pipe = Pipeline([
    ('preprocess', preprocessor),
    ('MLPR', MLPRegressor())
])

parameter_space = {
    "MLPR__hidden_layer_sizes": [(32), (16)],
    "MLPR__learning_rate_init": [0.001, 0.01]
}

search = GridSearchCV(pipe, parameter_space,
                        verbose=2, error_score='raise')
search.fit(x_train, y_train)

test = transform_test(test_path, categorical_transform, numerical_transform)
x_test = test.drop('pm10', axis=1)
y_test = pd.DataFrame(test['pm10'])


signature = infer_signature(x_train, search.predict(x_test))
mlflow.sklearn.log_model(search, signature=signature, artifact_path="MLPRegressor",
        registered_model_name="MLPRegressor")

prediction = search.predict(x_test)
print('Model trained')

# Calculate MSE and MAE for the test data
mse_test = mean_squared_error(y_test, prediction)
mae_test = mean_absolute_error(y_test, prediction)
evs_test = explained_variance_score(y_test, prediction)

mlflow.log_metric("MSE Test", mse_test)
mlflow.log_metric("MAE Test", mae_test)
mlflow.log_metric("EVS Test", evs_test)

with open(metrics_path, 'w') as file:
    file.write('MAE:' + str(mae_test) + '\n')
    file.write('MSE:' + str(mse_test) + '\n')
    file.write('EVS:' + str(evs_test) + '\n')

print('Reports updated')

#mlflow.sklearn.log_model(search, "MLPRegressor")

print('Model serialized')

autolog_run = mlflow.last_active_run()
print(autolog_run)

  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 4 candidates, totalling 20 fits


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=32, MLPR__learning_rate_init=0.001; total time=   3.5s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=32, MLPR__learning_rate_init=0.001; total time=   3.2s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=32, MLPR__learning_rate_init=0.001; total time=   3.1s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=32, MLPR__learning_rate_init=0.001; total time=   3.7s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=32, MLPR__learning_rate_init=0.001; total time=   3.0s
[CV] END MLPR__hidden_layer_sizes=32, MLPR__learning_rate_init=0.01; total time=   2.7s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=32, MLPR__learning_rate_init=0.01; total time=   2.1s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=32, MLPR__learning_rate_init=0.01; total time=   3.1s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=32, MLPR__learning_rate_init=0.01; total time=   2.2s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=32, MLPR__learning_rate_init=0.01; total time=   2.3s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=16, MLPR__learning_rate_init=0.001; total time=   2.6s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=16, MLPR__learning_rate_init=0.001; total time=   2.8s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=16, MLPR__learning_rate_init=0.001; total time=   2.9s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=16, MLPR__learning_rate_init=0.001; total time=   3.2s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=16, MLPR__learning_rate_init=0.001; total time=   2.7s
[CV] END MLPR__hidden_layer_sizes=16, MLPR__learning_rate_init=0.01; total time=   2.3s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=16, MLPR__learning_rate_init=0.01; total time=   1.0s


  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=16, MLPR__learning_rate_init=0.01; total time=   0.9s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END MLPR__hidden_layer_sizes=16, MLPR__learning_rate_init=0.01; total time=   2.5s
[CV] END MLPR__hidden_layer_sizes=16, MLPR__learning_rate_init=0.01; total time=   1.6s


  y = column_or_1d(y, warn=True)
2023/04/07 20:00:26 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Data read


  inputs = _infer_schema(model_input)
Registered model 'MLPRegressor' already exists. Creating a new version of this model...
2023/04/07 20:00:35 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: MLPRegressor, version 11
Created version '11' of model 'MLPRegressor'.


Model trained
Reports updated
Model serialized
<ActiveRun: >


In [124]:
client = MlflowClient()
m = client.get_latest_versions('MLPRegressor', stages=["Production"])[0]
history = client.get_metric_history(m.run_id, key='MAE Test')

min = history[0].value
for h in history:
    if h.value < min:
        min = h.value

if mae_test < min:
    print('jere')
    client.transition_model_version_stage(
        name="MLPRegressor", version=m.version, stage='Production'
    )
    


In [127]:
model_version_info = client.get_latest_versions("MLPRegressor", stages=["Production"])[0]
model_uri = model_version_info.source

# Load the model
loaded_model = mlflow.pyfunc.load_model(model_uri)

x_test['hum'] = x_test['hum'].astype('int')

# Use the model to make predictions
predictions = loaded_model.predict(x_test)
print(predictions)

15163_MLPRegressor 11
[25.420775   26.93741804 26.93741804 ... 26.89865167 26.89865167
 26.89865167]
