# Get started with data science in Microsoft Fabric

## Get Data

In [1]:
# Azure storage access info for the open "diabetes" dataset
blob_account_name = "azureopendatastorage"
blob_container_name = "mlsamples"
blob_relative_path = "diabetes"
blob_sas_token = r""  # Blank since the container allows anonymous access

# Build the remote path and register the (empty) SAS token with Spark so it can reach blob storage.
# Plain f-strings are used here; the previous form mixed an f-prefix with %-formatting,
# which made the f-prefix a no-op.
wasbs_path = f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/{blob_relative_path}"
spark.conf.set(
    f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net",
    blob_sas_token,
)
print("Remote blob path: " + wasbs_path)

# Spark reads are lazy: this only defines the DataFrame, no rows are loaded yet
df = spark.read.parquet(wasbs_path)

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 3, Finished, Available, Finished)

Remote blob path: wasbs://mlsamples@azureopendatastorage.blob.core.windows.net/diabetes


In [2]:
# Render the Spark DataFrame defined in the previous cell using the notebook's rich display
display(df)

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 168d1dcc-98b8-4d07-aa97-4c3dda0b22e2)

## Prepare Data

In [3]:
# Convert the Spark DataFrame to pandas for the scikit-learn steps that follow.
# The name `df` is reused, so guard the conversion: on a re-run `df` is already a
# pandas DataFrame (which has no `toPandas`) and the cell would otherwise fail.
if hasattr(df, "toPandas"):
    df = df.toPandas()
df.head()

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 5, Finished, Available, Finished)

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135


In [4]:
# Code generated by Data Wrangler for pandas DataFrame

def clean_data(df, threshold=211.5):
    """Add a binary 'Risk' column derived from the 'Y' column.

    Args:
        df: pandas DataFrame containing a numeric 'Y' column. It is modified
            in place, so pass a copy if the original must stay untouched.
        threshold: 'Y' values strictly greater than this are flagged as 1.
            The default 211.5 matches the 75% row for 'Y' in the
            `describe()` output shown later in this notebook.

    Returns:
        The same DataFrame object, now carrying an integer 'Risk' column
        (1 where Y > threshold, otherwise 0).
    """
    # Created column 'Risk' from formula
    df['Risk'] = (df['Y'] > threshold).astype(int)
    return df

# clean_data assigns the new column on the frame it receives, so hand it a copy
# to keep the original `df` unchanged
df_clean = clean_data(df.copy())
# Preview the first rows, now including the 'Risk' column
df_clean.head()

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 19, Finished, Available, Finished)

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y,Risk
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151,0
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75,0
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141,0
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206,0
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135,0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the cleaned data
df_clean.describe()

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 20, Finished, Available, Finished)

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y,Risk
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,48.5181,1.468326,26.375792,94.647014,189.140271,115.43914,49.788462,4.070249,4.641411,91.260181,152.133484,0.251131
std,13.109028,0.499561,4.418122,13.831283,34.608052,30.413081,12.934202,1.29045,0.522391,11.496335,77.093005,0.434155
min,19.0,1.0,18.0,62.0,97.0,41.6,22.0,2.0,3.2581,58.0,25.0,0.0
25%,38.25,1.0,23.2,84.0,164.25,96.05,40.25,3.0,4.2767,83.25,87.0,0.0
50%,50.0,1.0,25.7,93.0,186.0,113.0,48.0,4.0,4.62005,91.0,140.5,0.0
75%,59.0,2.0,29.275,105.0,209.75,134.5,57.75,5.0,4.9972,98.0,211.5,0.75
max,79.0,2.0,42.2,133.0,301.0,242.4,99.0,9.09,6.107,124.0,346.0,1.0


## Train machine learning models

#### Train a regression model

In [6]:
from sklearn.model_selection import train_test_split
    
# Features: the ten predictor columns; label: the numeric 'Y' column (regression target)
X = df_clean[['AGE','SEX','BMI','BP','S1','S2','S3','S4','S5','S6']].values
y = df_clean['Y'].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 21, Finished, Available, Finished)

In [7]:
import mlflow
# Name of the MLflow experiment used for the regression run below
experiment_name = "diabetes-regression"
# Select that experiment; the log output of this cell shows it is created when it does not exist yet
mlflow.set_experiment(experiment_name)

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 22, Finished, Available, Finished)

2025/03/06 05:21:18 INFO mlflow.tracking.fluent: Experiment with name 'diabetes-regression' does not exist. Creating a new experiment.


<Experiment: artifact_location='', creation_time=1741238483964, experiment_id='13cb8edd-993a-43c5-b094-71fc215573bc', last_update_time=None, lifecycle_stage='active', name='diabetes-regression', tags={}>

In [8]:
from sklearn.linear_model import LinearRegression
    
# Fit a linear regression inside an MLflow run
with mlflow.start_run():
    # Enable autologging before fitting so the training below is captured in this run
    mlflow.autolog()

    model = LinearRegression().fit(X_train, y_train)

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 23, Finished, Available, Finished)

2025/03/06 05:21:41 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


#### Train a classification model

In [9]:
from sklearn.model_selection import train_test_split
    
# Same ten predictor columns as before; the label is now the binary 'Risk' column
X = df_clean[['AGE','SEX','BMI','BP','S1','S2','S3','S4','S5','S6']].values
y = df_clean['Risk'].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 24, Finished, Available, Finished)

In [10]:
import mlflow
# Name of the MLflow experiment used for the classification run below
experiment_name = "diabetes-classification"
# Select that experiment; the log output of this cell shows it is created when it does not exist yet
mlflow.set_experiment(experiment_name)

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 25, Finished, Available, Finished)

2025/03/06 05:23:42 INFO mlflow.tracking.fluent: Experiment with name 'diabetes-classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='', creation_time=1741238623163, experiment_id='5c9b3c85-1e7e-4b84-a9ae-f7e14b1ba212', last_update_time=None, lifecycle_stage='active', name='diabetes-classification', tags={}>

In [11]:
from sklearn.linear_model import LogisticRegression
    
# Fit a logistic regression classifier inside an MLflow run
with mlflow.start_run():
    # Enable scikit-learn autologging before fitting
    mlflow.sklearn.autolog()

    # C is the inverse of the regularization strength, written here as 1 / 0.1
    model = LogisticRegression(C=1/0.1, solver="liblinear")
    model.fit(X_train, y_train)

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 26, Finished, Available, Finished)

# Generate batch predictions using a deployed model in Microsoft Fabric

## Train a machine learning model

In [12]:
import pandas as pd
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec

# Get the data
blob_account_name = "azureopendatastorage"
blob_container_name = "mlsamples"
blob_relative_path = "diabetes"
blob_sas_token = r""  # Blank token, as in the first section of this notebook
# Plain f-strings; the previous form mixed an f-prefix with %-formatting (the f-prefix was a no-op)
wasbs_path = f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/{blob_relative_path}"
spark.conf.set(
    f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net",
    blob_sas_token,
)
df = spark.read.parquet(wasbs_path).toPandas()

# Split the features and label for training
X, y = df[['AGE','SEX','BMI','BP','S1','S2','S3','S4','S5','S6']].values, df['Y'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train the model in an MLflow experiment
experiment_name = "experiment-diabetes"
mlflow.set_experiment(experiment_name)
with mlflow.start_run():
    # Autolog the run but not the model itself: the model is logged explicitly
    # below so that it carries the hand-written signature
    mlflow.autolog(log_models=False)
    model = DecisionTreeRegressor(max_depth=5)
    model.fit(X_train, y_train)

    # Define the model signature (column names and types expected at scoring time)
    input_schema = Schema([
        ColSpec("integer", "AGE"),
        ColSpec("integer", "SEX"),
        ColSpec("double", "BMI"),
        ColSpec("double", "BP"),
        ColSpec("integer", "S1"),
        ColSpec("double", "S2"),
        ColSpec("double", "S3"),
        ColSpec("double", "S4"),
        ColSpec("double", "S5"),
        ColSpec("integer", "S6"),
    ])
    # NOTE(review): a tree regressor predicts floating-point values, yet the output is
    # declared "integer". Left unchanged because switching it would change the type of the
    # stored predictions column - confirm whether "double" is intended.
    output_schema = Schema([ColSpec("integer")])
    signature = ModelSignature(inputs=input_schema, outputs=output_schema)

    # Log the model
    mlflow.sklearn.log_model(model, "model", signature=signature)

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 27, Finished, Available, Finished)

2025/03/06 05:32:37 INFO mlflow.tracking.fluent: Experiment with name 'experiment-diabetes' does not exist. Creating a new experiment.


In [13]:
# Get the most recent experiment run: search the experiment's runs newest-first and keep one
exp = mlflow.get_experiment_by_name(experiment_name)
last_run = mlflow.search_runs(
    exp.experiment_id,
    order_by=["start_time DESC"],
    max_results=1,
)
last_run_id = last_run["run_id"].iloc[0]

# Register the model that was logged under the "model" artifact path of that run
print("Registering the model from run :", last_run_id)
model_uri = "runs:/{}/model".format(last_run_id)
mv = mlflow.register_model(model_uri, "diabetes-model")

# Report the registered name and the version that was just created
print("Name: {}".format(mv.name))
print("Version: {}".format(mv.version))

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 28, Finished, Available, Finished)

Registering the model from run : e534f20e-4bac-49f8-80df-99706e234a40


Successfully registered model 'diabetes-model'.
Created version '1' of model 'diabetes-model'.


## Create a test dataset in a lakehouse

In [14]:
from pyspark.sql.types import IntegerType, DoubleType

# Build a small Spark DataFrame of patient records; each tuple follows the order of `columns`
data = [
    (62, 2, 33.7, 101.0, 157, 93.2, 38.0, 4.0, 4.8598, 87),
    (50, 1, 22.7, 87.0, 183, 103.2, 70.0, 3.0, 3.8918, 69),
    (76, 2, 32.0, 93.0, 156, 93.6, 41.0, 4.0, 4.6728, 85),
    (25, 1, 26.6, 84.0, 198, 131.4, 40.0, 5.0, 4.8903, 89),
    (53, 1, 23.0, 101.0, 192, 125.4, 52.0, 4.0, 4.2905, 80),
    (24, 1, 23.7, 89.0, 139, 64.8, 61.0, 2.0, 4.1897, 68),
    (38, 2, 22.0, 90.0, 160, 99.6, 50.0, 3.0, 3.9512, 82),
    (69, 2, 27.5, 114.0, 255, 185.0, 56.0, 5.0, 4.2485, 92),
    (63, 2, 33.7, 83.0, 179, 119.4, 42.0, 4.0, 4.4773, 94),
    (30, 1, 30.0, 85.0, 180, 93.4, 43.0, 4.0, 5.3845, 88)
]
columns = ['AGE','SEX','BMI','BP','S1','S2','S3','S4','S5','S6']
df = spark.createDataFrame(data, schema=columns)

# Convert data types to match the model input schema:
# AGE, SEX, S1 and S6 become integers, every other column becomes a double.
# A single select casts all columns at once and keeps the original column order.
integer_columns = {"AGE", "SEX", "S1", "S6"}
df = df.select([
    df[name].cast(IntegerType() if name in integer_columns else DoubleType()).alias(name)
    for name in columns
])

# Persist the typed rows as a delta table, replacing any previous contents
table_name = "diabetes_test"
df.write.format("delta").mode("overwrite").saveAsTable(table_name)
print(f"Spark dataframe saved to delta table: {table_name}")

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 29, Finished, Available, Finished)

Spark dataframe saved to delta table: diabetes_test


## Apply the model to generate predictions

In [15]:
import mlflow
from synapse.ml.predict import MLFlowTransformer

# Read the patient features data back from the lakehouse delta table
df_test = spark.read.format("delta").load(f"Tables/{table_name}")

# Use the model to generate diabetes predictions for each row
# NOTE(review): the version is pinned to 1; if the model is registered again, a newer
# version exists (see `mv.version` from the registration cell) - confirm which one is wanted.
model = MLFlowTransformer(
    inputCols=["AGE","SEX","BMI","BP","S1","S2","S3","S4","S5","S6"],
    outputCol="predictions",
    modelName="diabetes-model",
    modelVersion=1)

# Score the rows that were just read from the table. Previously the in-memory `df` was
# transformed instead, so the `df_test` loaded above was silently discarded.
# Dropping "predictions" first is a no-op when the column is absent and avoids a clash
# with the transformer's output column if the table already holds earlier predictions.
df_test = model.transform(df_test.drop("predictions"))

# Save the results (the original features PLUS the prediction)
df_test.write.format('delta').mode("overwrite").option("mergeSchema", "true").saveAsTable(table_name)

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 30, Finished, Available, Finished)

2025/03/06 05:35:13 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.


Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]



StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 31, Finished, Available, Finished)

In [16]:
# Query the saved predictions. The table is referenced through `table_name` (resolved against
# the default lakehouse, exactly as saveAsTable(table_name) was) instead of a hard-coded,
# user-specific lakehouse name, so the notebook runs in any workspace.
df = spark.sql(f"SELECT * FROM {table_name} LIMIT 1000")
display(df)

StatementMeta(, b98b1438-2dd4-4f66-914e-72f90444c84f, 32, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 0312733a-6834-4431-8e05-2a1bc06c0613)