## Part 1: Data Preprocessing & Feature Engineering

In [0]:
%fs ls /databricks-datasets/wine-quality/

In [0]:
import pandas as pd 

# Load the red and white wine-quality files; both are semicolon-delimited.
red_wine = pd.read_csv(
    "/databricks-datasets/wine-quality/winequality-red.csv",
    sep=";",
)
white_wine = pd.read_csv(
    "/databricks-datasets/wine-quality/winequality-white.csv",
    sep=";",
)


In [0]:
# Flag each row with its wine colour: 1 for red, 0 for white.
red_wine = red_wine.assign(is_red=1)
white_wine = white_wine.assign(is_red=0)

In [0]:
# Stack the red and white rows into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it both source frames contribute their own 0-based
# labels and the combined index contains duplicates.
data = pd.concat([red_wine, white_wine], axis=0, ignore_index=True)

In [0]:
# Swap spaces for underscores in every column name.
data = data.rename(columns=lambda column_name: column_name.replace(' ', '_'))

In [0]:
# Show the column labels of the combined frame.
data.columns

In [0]:
# Show the dtype of each column.
data.dtypes

In [0]:
# Non-null value count per column (a quick check for missing data).
data.count()

In [0]:
# Pairwise correlation between the columns.
data.corr()

In [0]:
import seaborn as sns
# Plot the distribution of the quality scores.
sns.displot(data["quality"])

In [0]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the quality score.
X = data.drop(columns=['quality'])
y = data['quality']

# Use 60% of the rows for training (train_size, not test_size, so the larger share
# goes to the model and the smaller "remaining" share is held out).
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)

# Split the remaining rows evenly into validation and test sets. This split is seeded
# as well, so metrics computed on X_test are reproducible from run to run.
X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)



### Part 2: Building a baseline model

In [0]:
import mlflow
import mlflow.pyfunc
import mlflow.sklearn
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env
import cloudpickle
import time

In [0]:
class SklearnModelWrapper(mlflow.pyfunc.PythonModel):
  """Expose a fitted scikit-learn classifier through the MLflow pyfunc interface.

  Args:
    model: Fitted classifier providing ``predict_proba`` (and ``predict``).
  """
  def __init__(self, model):
    self.model = model

  def predict(self, context, model_input):
    """Score ``model_input`` with the wrapped classifier.

    For a two-class model this returns the probability of the second class
    (column 1 of ``predict_proba``). For a model fitted on more than two classes
    that column is only one arbitrary class's probability, so the predicted class
    label is returned instead.

    Args:
      context: pyfunc context supplied by MLflow; unused.
      model_input: Feature rows accepted by the wrapped model.

    Returns:
      One value per input row.
    """
    classes = getattr(self.model, "classes_", None)
    if classes is not None and len(classes) > 2:
      return self.model.predict(model_input)
    return self.model.predict_proba(model_input)[:, 1]
 
 
with mlflow.start_run(run_name='untuned_random_forest'):
  n_estimators = 10
  model = RandomForestClassifier(n_estimators=n_estimators, random_state=np.random.RandomState(123))
  model.fit(X_train, y_train)

  # roc_auc_score is used in multi-class ('ovo') mode, which needs the full
  # (n_samples, n_classes) probability matrix, so the output is not sliced.
  predictions_test = model.predict_proba(X_test)
  # labels=model.classes_ ties each probability column to the class the model was
  # trained on, so the score can still be computed when the test split happens to
  # be missing one of the training classes.
  auc_score = roc_auc_score(y_test, predictions_test, multi_class='ovo', labels=model.classes_)
  mlflow.log_param('n_estimators', n_estimators)

  # Use the area under the ROC curve as a metric
  mlflow.log_metric('auc', auc_score)
  wrappedModel = SklearnModelWrapper(model)

  # Log the model with a signature that defines the schema of the model's inputs and outputs.
  # When the model is deployed, this signature will be used to validate inputs.
  signature = infer_signature(X_train, wrappedModel.predict(None, X_train))

  # MLflow contains utilities to create a conda environment used to serve models.
  # The necessary dependencies are added to a conda.yaml file which is logged along with the model.
  conda_env = _mlflow_conda_env(
    additional_conda_deps=None,
    additional_pip_deps=[
      "cloudpickle=={}".format(cloudpickle.__version__),
      "scikit-learn=={}".format(sklearn.__version__),
    ],
    additional_conda_channels=None,
  )

  mlflow.pyfunc.log_model("random_forest_model", python_model=wrappedModel, conda_env=conda_env, signature=signature)

In [0]:
# One row per feature with the forest's importance score, shown highest first.
feature_importances = pd.DataFrame(
  {'importance': model.feature_importances_},
  index=X_train.columns.tolist(),
)
feature_importances.sort_values(by='importance', ascending=False)

In [0]:
# Fetch the most recent run named "untuned_random_forest". The ordering is stated
# explicitly and only one row is requested, since only the newest run is used.
matching_runs = mlflow.search_runs(
  filter_string='tags.mlflow.runName = "untuned_random_forest"',
  order_by=["attributes.start_time DESC"],
  max_results=1,
)
# Fail with a readable message instead of an out-of-bounds error from .iloc[0].
if matching_runs.empty:
  raise IndexError('No MLflow run named "untuned_random_forest" was found')
run_id = matching_runs.iloc[0].run_id
print(run_id)

In [0]:
model_name = "ml.naval.wine_quality"

# register_model waits (up to await_registration_for seconds) for the new version to
# reach READY before returning, so no fixed sleep is needed afterwards.
model_version = mlflow.register_model(
  f"runs:/{run_id}/random_forest_model",
  model_name,
  await_registration_for=300,
)

In [0]:
# Register the same run's model artifact once more, creating another version
# under the same registered-model name.
model_name = "ml.naval.wine_quality"
new_model_uri = f"runs:/{run_id}/random_forest_model"
new_model_version = mlflow.register_model(new_model_uri, model_name)

In [0]:
from mlflow.tracking import MlflowClient

# Point the "Production" alias at the first registered version.
client = MlflowClient()
client.set_registered_model_alias(name=model_name, alias="Production", version=model_version.version)

In [0]:
# Legacy stage-based promotion (older MLflow workflow). Model stages are deprecated in
# favour of aliases and are rejected for Unity Catalog models, so the transitions only
# run when the active registry is not Unity Catalog.
if mlflow.get_registry_uri().startswith("databricks-uc"):
  print("Unity Catalog registry detected: skipping stage transitions (use model aliases instead)")
else:
  # Retire the first model version
  client.transition_model_version_stage(
    name=model_name,
    version=model_version.version,
    stage="Archived"
  )

  # Promote the new model version to Production
  client.transition_model_version_stage(
    name=model_name,
    version=new_model_version.version,
    stage="Production"
  )

In [0]:
# Alias-based archive/promote: label the first version "Archived", then point
# "Production" at the newer version.
alias_targets = [
  ("Archived", model_version.version),
  ("Production", new_model_version.version),
]
for alias_name, target_version in alias_targets:
  client.set_registered_model_alias(name=model_name, version=target_version, alias=alias_name)

### Part 3: Batch inference

In [0]:
# Convert the pandas training features to a Spark DataFrame and save them as a table,
# replacing whatever the table held before.
spark_df = spark.createDataFrame(X_train)
(
  spark_df.write
  .mode("overwrite")
  .saveAsTable("ml.naval.wine_quality_train")
)

In [0]:
# Read the saved feature table back as a Spark DataFrame to score in batch.
new_data=spark.table("ml.naval.wine_quality_train")

In [0]:
import mlflow.pyfunc
# Build a Spark UDF from the version behind the "Production" alias. The alias is spelled
# exactly as it was registered, so the lookup does not depend on the registry treating
# alias names case-insensitively.
apply_model_udf = mlflow.pyfunc.spark_udf(spark, f"models:/{model_name}@Production")

In [0]:
from pyspark.sql.functions import struct
 
# Pack the feature columns into a single struct column and feed it to the model UDF;
# the result is stored in a "prediction" column.
feature_columns = X_train.columns.tolist()
udf_inputs = struct(*feature_columns)

new_data = new_data.withColumn("prediction", apply_model_udf(udf_inputs))

In [0]:
# Render the scored rows in the notebook output.
new_data.display()