## Understanding the data

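The data comes from the Kaggle Playground Series (Season 3, Episode 14) competition, where the goal is to predict wild blueberry `yield`: https://www.kaggle.com/c/playground-series-s3e14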

## Trying out FLAML

In [1]:
from sklearn.model_selection import train_test_split

import pandas as pd 

In [2]:
# Read the competition training set into a DataFrame and preview its first rows.
train_csv_path = "data/train.csv"
data = pd.read_csv(train_csv_path)

data.head(n=5)

Unnamed: 0,id,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,0,25.0,0.5,0.25,0.75,0.5,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.425011,0.417545,32.460887,4476.81146
1,1,25.0,0.5,0.25,0.5,0.5,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.444908,0.422051,33.858317,5548.12201
2,2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.552927,0.470853,38.341781,6869.7776
3,3,12.5,0.25,0.25,0.63,0.5,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.565976,0.478137,39.467561,6880.7759
4,4,25.0,0.5,0.25,0.63,0.63,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.579677,0.494165,40.484512,7479.93417


In [3]:
# Separate the model inputs from the regression target.
# "id" is only a row identifier (it simply counts the rows in the preview of
# `data`), so it is dropped together with the target instead of being fed to
# the model as a feature.
X = data.drop(columns=["id", "yield"])
y = data["yield"]

In [4]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [5]:
from flaml import AutoML

# Run FLAML's automated model search on the training split.
# An explicit time budget (in seconds) bounds the search so the cell finishes
# on its own rather than having to be interrupted by hand, and a fixed seed
# makes repeated runs comparable.
automl = AutoML()
automl.fit(
    X_train,
    y_train,
    task="regression",
    time_budget=60,
    seed=42,
)

[flaml.automl.logger: 12-02 22:35:36] {1679} INFO - task = regression
[flaml.automl.logger: 12-02 22:35:36] {1690} INFO - Evaluation method: cv
[flaml.automl.logger: 12-02 22:35:36] {1788} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 12-02 22:35:36] {1900} INFO - List of ML learners in AutoML Run: ['xgboost', 'lgbm', 'rf', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 12-02 22:35:36] {2218} INFO - iteration 0, current learner xgboost


KeyboardInterrupt: 

## Using the Azure SDK 

In [2]:
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

# Authenticate with the default Azure credential and build the workspace
# client from the local workspace config file.
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)

Found the config file in: .\config.json


In [None]:
import mltable

# Describe the training CSV as an MLTable and write the table definition
# to ./data/mltable.
paths = [{"file": "./data/train.csv"}]

train_table = mltable.from_delimited_files(paths=paths)
train_table.save("./data/mltable")

: 

In [4]:
# Disabled CLI command: when uncommented, it runs `az ml compute create` with
# the definition in compute.yml.
# NOTE(review): presumably an alternative to creating the compute through the
# SDK — confirm against compute.yml before enabling.
#!az ml compute create -f compute.yml

In [None]:
# Create (or update) the compute instance that will run the AutoML job.
# Compute instance names must be unique across the region. A fixed name is
# used here so that re-running this cell updates the same instance instead of
# creating a new one; change `compute_name` if the name is already taken.
from azure.ai.ml.entities import ComputeInstance, AmlCompute
import datetime

compute_name = "automl-webinar"
ci_basic = ComputeInstance(
    name=compute_name,
    size="STANDARD_D2AS_V4",  # 2 cores, 8GB RAM, 16GB Storage
    # Idle shutdown is expressed in minutes and expects an integer.
    idle_time_before_shutdown_minutes=30,
)
# Block until provisioning completes; the resulting compute is the cell output.
ml_client.begin_create_or_update(ci_basic).result()

In [None]:
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml import automl, Input

# note that this is a code snippet -- you might have to modify the variable values to run it successfully

# Make an Input object for the training data. The path points at the MLTable
# folder written earlier in this notebook by `train_table.save('./data/mltable')`.
training_data_input = Input(
    type=AssetTypes.MLTABLE, path="./data/mltable"
)

# Configure the AutoML regression job that predicts the "yield" column.
regression_job = automl.regression(
    compute=compute_name,
    experiment_name="automl-webinar-blueberry-prediction",
    training_data=training_data_input,
    target_column_name="yield",
    # "mae" is not an accepted primary metric for regression; the MAE-based
    # option is the normalized mean absolute error.
    primary_metric="normalized_mean_absolute_error",
    n_cross_validations=5,
    enable_model_explainability=True,
    tags={"my_custom_tag": "My custom value"},
)

# Limits are all optional
regression_job.set_limits(
    timeout_minutes=600,
    trial_timeout_minutes=20,
    max_trials=5,
    enable_early_termination=True,
)

# Training properties are optional.
# No algorithms are blocked: the previously listed "logistic_regression" is a
# classification model and is not a valid choice for a regression job.
regression_job.set_training(
    enable_onnx_compatible_models=True,
)

# Backward-compatible alias for code that still refers to the old
# (misleading) variable name.
classification_job = regression_job

## Alternatives to Azure AutoML