# Train a machine learning model to predict diabetes

In [None]:
# Azure storage access info for the open diabetes dataset
blob_account_name = "azureopendatastorage"
blob_container_name = "mlsamples"
blob_relative_path = "diabetes"
blob_sas_token = r""  # Blank since the container allows anonymous access

# Build the remote path and register the (empty) SAS token for this container
# in the Spark config so that Spark can access blob storage.
wasbs_path = (
    f"wasbs://{blob_container_name}@{blob_account_name}"
    f".blob.core.windows.net/{blob_relative_path}"
)
spark.conf.set(
    f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net",
    blob_sas_token,
)
print("Remote blob path: " + wasbs_path)

# Read the parquet data; note that no data is actually loaded at this point
df = spark.read.parquet(wasbs_path)

## Convert the Spark dataframe to a Pandas dataframe:

In [None]:
import pandas as pd
# Convert the Spark dataframe into a pandas dataframe. The name `df` is
# rebound, so from here on it refers to the pandas object.
df = df.toPandas()
# Preview the first rows
df.head()

## Split the data:

In [None]:
from sklearn.model_selection import train_test_split

# Columns used as model input; 'Y' is the value to predict
feature_columns = ['AGE', 'SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']
X = df[feature_columns].values
y = df['Y'].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

## Create an experiment in the workspace:

In [None]:
import mlflow
# Name of the experiment that the training runs are tracked under
experiment_name = "experiment-diabetes"
# Make it the active experiment for the runs started below
mlflow.set_experiment(experiment_name)

## Train and track the model:

In [None]:
from sklearn.tree import DecisionTreeRegressor
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec

with mlflow.start_run():
    # Autolog the run, but not the model itself: the model is logged manually
    # below so that it carries an explicit signature.
    mlflow.autolog(log_models=False)

    model = DecisionTreeRegressor(max_depth=5)
    model.fit(X_train, y_train)

    # Create the signature manually: one named, typed column per feature
    input_schema = Schema([
        ColSpec("integer", "AGE"),
        ColSpec("integer", "SEX"),
        ColSpec("double", "BMI"),
        ColSpec("double", "BP"),
        ColSpec("integer", "S1"),
        ColSpec("double", "S2"),
        ColSpec("double", "S3"),
        ColSpec("double", "S4"),
        ColSpec("double", "S5"),
        ColSpec("integer", "S6"),
    ])

    # A regression tree predicts the mean target value of a leaf, which is a
    # floating-point number, so the output is declared as double rather than
    # integer.
    output_schema = Schema([ColSpec("double")])

    # Create the signature object
    signature = ModelSignature(inputs=input_schema, outputs=output_schema)

    # Manually log the model together with its signature
    mlflow.sklearn.log_model(model, "model", signature=signature)

### When the model is trained and tracked in an experiment, you can register the model from the latest experiment run output. Start by retrieving the latest run ID

In [None]:
# Look up the experiment, then ask for only its most recently started run
exp = mlflow.get_experiment_by_name(experiment_name)
last_run = mlflow.search_runs(
    exp.experiment_id,
    order_by=["start_time DESC"],
    max_results=1,
)

# Take the run ID from the first (newest) row of the result
last_run_id = last_run["run_id"].iloc[0]
print("Last Run ID:", last_run_id)

### Create the model URI by combining the experiment run ID with the model output folder in which all model artifacts are stored

In [None]:
# URI of the "model" artifact folder inside the latest run
model_uri = f"runs:/{last_run_id}/model"

## Save the model by registering it to the workspace

In [None]:
# Register the logged model under the name "diabetes-model"
mv = mlflow.register_model(model_uri, "diabetes-model")

# Report the name and version of the registered model
print(f"Name: {mv.name}")
print(f"Version: {mv.version}")


## Your model is now saved in your workspace under the name diabetes-model.

###  Optionally, you can use the browse feature in your workspace to find the model in the workspace and explore it using the UI.

## Create a test dataset and save in a lakehouse

## Before running the cell below, complete the following steps:

1. In the Add lakehouse pane, select Add to add a lakehouse.
2. Select New lakehouse and select Add.
3. Create a new Lakehouse with a name of your choice.
4. When asked to stop the current session, select Stop now to restart the notebook.
5. When the lakehouse is created and attached to this notebook, run the following cell to create a test dataset.

# Create the dataframe with test data

In [None]:
# Ten sample rows; the values in each tuple follow the order of `columns` below
data = [
    (62, 2, 33.7, 101.0, 157, 93.2, 38.0, 4.0, 4.8598, 87),
    (50, 1, 22.7, 87.0, 183, 103.2, 70.0, 3.0, 3.8918, 69),
    (76, 2, 32.0, 93.0, 156, 93.6, 41.0, 4.0, 4.6728, 85),
    (25, 1, 26.6, 84.0, 198, 131.4, 40.0, 5.0, 4.8903, 89),
    (53, 1, 23.0, 101.0, 192, 125.4, 52.0, 4.0, 4.2905, 80),
    (24, 1, 23.7, 89.0, 139, 64.8, 61.0, 2.0, 4.1897, 68),
    (38, 2, 22.0, 90.0, 160, 99.6, 50.0, 3.0, 3.9512, 82),
    (69, 2, 27.5, 114.0, 255, 185.0, 56.0, 5.0, 4.2485, 92),
    (63, 2, 33.7, 83.0, 179, 119.4, 42.0, 4.0, 4.4773, 94),
    (30, 1, 30.0, 85.0, 180, 93.4, 43.0, 4.0, 5.3845, 88)
]

# Column names: the same features the model was trained on (no target column)
columns = ['AGE','SEX','BMI','BP','S1','S2','S3','S4','S5','S6']

# Build a Spark dataframe from the rows (this rebinds `df`) and print it
df = spark.createDataFrame(data, schema=columns)
df.show()

## Visualize the data types of the columns:

In [None]:
# Show each column's name together with its data type
df.dtypes

## Change the data types for the columns to align with the model's expected input

In [None]:
from pyspark.sql.types import IntegerType, DoubleType

# Target Spark type for every feature column, in column order
column_types = {
    "AGE": IntegerType(),
    "SEX": IntegerType(),
    "BMI": DoubleType(),
    "BP": DoubleType(),
    "S1": IntegerType(),
    "S2": DoubleType(),
    "S3": DoubleType(),
    "S4": DoubleType(),
    "S5": DoubleType(),
    "S6": IntegerType(),
}

# Replace each column with a copy cast to its target type
for column_name, spark_type in column_types.items():
    df = df.withColumn(column_name, df[column_name].cast(spark_type))

df.dtypes

## Save the test dataset in the lakehouse as a Delta table named diabetes_test

In [None]:
table_name = "diabetes_test"

# Write the dataframe in Delta format, replacing any existing table contents
delta_writer = df.write.format("delta").mode("overwrite")
delta_writer.save(f"Tables/{table_name}")
print(f"Spark dataframe saved to delta table: {table_name}")

#### To view the Delta table, select the ... next to Tables in the Lakehouse explorer pane, and select Refresh. The diabetes_test table should appear.

#### Expand the diabetes_test table in the left pane to view all fields it includes. Note that there's no field named predictions yet.

## Apply the model to generate predictions

- Finally, you can apply the model you trained.

In [None]:
import mlflow
from synapse.ml.predict import MLFlowTransformer

# Load the test data back from the Delta table in the lakehouse
df_test = spark.read.format("delta").load(f"Tables/{table_name}")

# Wrap the registered model as a Spark transformer that writes its output to
# a new "predictions" column.
model = MLFlowTransformer(
    inputCols=["AGE","SEX","BMI","BP","S1","S2","S3","S4","S5","S6"],
    outputCol="predictions",
    # NOTE(review): assumes the model was registered only once; use the
    # version printed at registration if it has been registered again.
    modelVersion=1,
    modelName="diabetes-model"
)

# Score the rows that were just loaded from the table, rather than relying on
# whatever the in-memory `df` from an earlier cell happens to hold.
df_test = model.transform(df_test)

# Overwrite the table; mergeSchema lets the new predictions column be added
df_test.write.format('delta').mode("overwrite").option("mergeSchema", "true").save(f"Tables/{table_name}")

### Select the ... next to the diabetes_test table and select Refresh. A new field predictions has been added.

#### Drag and drop the diabetes_test table to the field below. The necessary code to view the table's contents will appear. Run the cell to visualize the data