# Multilabel classification with text data using AutoML NLP

## 1. Connect to Azure Machine Learning Workspace



### 1.1. Import the required libraries

In [3]:
# pip install azure-identity
from azure.identity import DefaultAzureCredential 
# pip install azure-ai-ml
from azure.ai.ml.constants import AssetTypes 
from azure.ai.ml import automl, Input, MLClient

### 1.2. Configure workspace details and get a handle to the workspace

To connect to a workspace, we need identifier parameters:
- A subscription.
- Resource group.
- Workspace name. 
 
We will use these details in the `MLClient` from `azure.ai.ml` to get a handle on the required Azure Machine Learning workspace. 

In [4]:
# Credential object used to authenticate against Azure.
credentials = DefaultAzureCredential()

# Identifiers of the target Azure Machine Learning workspace.
subscription_id = 'a7ef3688-af58-4835-953c-e51f219fbd0f'
resource_group_name = "BigData_resource_group"
workspace = "BigData_workspace"

try:
    # Handle to the workspace; every later cell in this notebook uses `ml_client`.
    ml_client = MLClient(credentials, subscription_id, resource_group_name, workspace)
    print("MLClient created")
except Exception as ex:
    # Report the problem, then re-raise. Swallowing the error here would leave
    # `ml_client` undefined and make the following cells fail with an unrelated
    # NameError instead of the real cause.
    print(ex)
    raise

MLClient created


## 2. Data Preparation

Scenario: Paper submission systems (such as CMT, OpenReview, etc.) require the users to upload paper titles and paper abstracts and then specify the subject areas their papers best belong to. 

Our model will be trained on a dataset of paper titles and abstracts and their corresponding subject areas to classify/suggest what category corresponding papers could be best associated with.

Our dataset is called 'arxiv_data.csv' and can be downloaded from [this link](https://www.kaggle.com/spsayakpaul/arxiv-paper-abstracts).


### 2.1 Filter out less common labels, and save the preprocessed dataset to a new file.

In [5]:
# NOTE: apart from the pandas/numpy imports, everything in this cell is commented out.
# The disabled code keeps only papers whose labels are all among the N most common ones
# and writes the result to ./data/arxiv_abstract.csv, which is the file section 2.2 reads.
# NOTE(review): presumably it was run once and then disabled - confirm before re-enabling.

# import ast
# import os

import pandas as pd
import numpy as np
# from collections import Counter
# from sklearn.preprocessing import MultiLabelBinarizer

# # We want to filter out less common labels from a dataset of arxiv papers, and then save the preprocessed dataset to a new CSV file.
# N = 5  # the number of most popular labels to keep

# # Read the dataset
# datasetPath = './data/arxiv_data.csv'
# data = pd.read_csv(datasetPath)
# # Convert the labels from strings to lists
# data["terms"] = data["terms"].apply(ast.literal_eval) # ast.literal_eval converts the string representation of a list of labels into an actual list.

# # Convert the list of labels into a binary matrix
# transformer = MultiLabelBinarizer(sparse_output=True) # here it is only fitted, so that the number of unique labels can be reported below.
# transformer.fit(data["terms"])
# K = len(transformer.classes_)
# print("The original dataset has {} unique labels".format(K))

# counter = Counter() # counts the number of times each label appears in the dataset.
# for labels in data["terms"]:
#     counter.update(labels)
# min_count = counter.most_common(N)[-1] # (label, count) pair of the N-th most common label; its count is the keep threshold.
# print("The {} most common labels appear at least {} times".format(N, min_count[1]))

# # Count the occurrences of each term
# term_counts = data["terms"].apply(pd.Series).stack().value_counts()

# # Find the terms that occur less than min_count times
# rare_terms = term_counts[term_counts < min_count[1]].index

# # Remove rows that contain rare terms
# data = data[~data["terms"].apply(set(rare_terms).intersection).astype(bool)]

# # Create the folder if it does not already exist, then save the dataset
# if not os.path.exists("data"):
#     os.mkdir("data")
# data.to_csv("./data/arxiv_abstract.csv", index=False)



### 2.2 Clean and sample the data

In [6]:
# Location of the preprocessed dataset (the CSV written by the step in section 2.1).
datasetPath = './data/arxiv_abstract.csv'

# Load the whole CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=datasetPath)

# Optional cleaning steps, currently disabled:
#   drop rows with missing values, drop duplicates, then shuffle.
# data = data.dropna()
# data = data.drop_duplicates()
# data = data.sample(frac=1).reset_index(drop=True)

# Optional down-sampling to 10% of the rows (the dataset is large), currently disabled.
# data = data.sample(frac=0.1).reset_index(drop=True)

### 2.3 Divide the data into train, validate and test sets

In [7]:
# # 80% training, 10% validation, 10% testing
# train, validate, test = np.split(data, [int(.8*len(data)), int(.9*len(data))])
# train.to_csv('./trainData/train.csv', index=False)
# validate.to_csv('./validationData/validation.csv', index=False)
# test.to_csv('./testData/test.csv', index=False)

# # print the number of rows in each set together with its share of the whole dataset
# print("Train set: ", len(train), ", which is: ", len(train)/len(data))
# print("Validation set: ", len(validate), ", which is: ",len(validate)/len(data))
# print("Test set: ", len(test), ", which is: ",len(test)/len(data))



### 2.4 Read the data

In [8]:
# Both inputs are MLTable folders defined locally, with local data to be uploaded.

# Training split: folder path and the job input built from it.
training_mltable_path = "./trainData/"
trainData = Input(path=training_mltable_path, type=AssetTypes.MLTABLE)

# Validation split: folder path and the job input built from it.
validation_mltable_path = "./validationData/"
validationData = Input(path=validation_mltable_path, type=AssetTypes.MLTABLE)

## 3. Configure the AutoML NLP Text Classification Multilabel training job


### 3.1. Create or get an existing Azure Machine Learning compute target

Now, we want to create or get an existing Azure Machine Learning compute target. The compute target is used for training machine learning models and can be thought of as a set of virtual machines that run in parallel to speed up the training process.

In [9]:
from azure.ai.ml.entities import AmlCompute
from azure.core.exceptions import ResourceNotFoundError

# Name of the compute target the AutoML job will run on.
compute_name = "mwk"

try:
    # Look the target up; the result is discarded, only its existence matters.
    _ = ml_client.compute.get(compute_name)
except ResourceNotFoundError:
    # No compute with that name yet: describe one and provision it.
    print("Creating a new compute target...")
    compute_config = AmlCompute(
        name=compute_name,
        type="amlcompute",
        size="Standard_NC6",
        min_instances=0,
        max_instances=4,
        idle_time_before_scale_down=120,
    )
    # begin_create_or_update() starts the provisioning operation; calling
    # .result() on what it returns waits for that operation to complete
    # before the notebook moves on.
    ml_client.begin_create_or_update(compute_config).result()
else:
    print("Found existing compute target.")


Found existing compute target.


### 3.2. Create the AutoML job(experiment) with the related factory-function

Now, we want to create a new text classification multilabel experiment using Azure Machine Learning's automated machine learning (AutoML) functionality, with the specified configuration settings.

After the AutoML experiment configuration is set up, `text_classification_multilabel_job.set_limits(timeout_minutes=exp_timeout)` is used to set the maximum amount of time that the experiment can run for.
Once the configuration is complete and the timeout is set, the job is ready to be submitted with `ml_client.jobs.create_or_update(text_classification_multilabel_job)` (see section 4.1); AutoML then trains candidate models and selects the best-performing one based on the specified evaluation metric.


In [10]:
# Experiment-level settings.
exp_name = "dpv2-nlp-multilabel"
exp_timeout = 120  # passed to set_limits() below as timeout_minutes

# Build the multilabel text-classification job through its factory function.
text_classification_multilabel_job = automl.text_classification_multilabel(
    # Where the job runs and which experiment it is recorded under.
    experiment_name=exp_name,
    compute=compute_name,
    # Data inputs and the column holding the labels.
    training_data=trainData,
    validation_data=validationData,
    target_column_name="terms",
    # Metric used to compare the models trained during the experiment.
    primary_metric="accuracy",
    tags={"Name": "BigData-Text-Classification-Multilabel"},
)

# Cap the total run time of the experiment.
text_classification_multilabel_job.set_limits(timeout_minutes=exp_timeout)

## 4. Run the AutoML NLP Text Classification Multilabel training job


### 4.1 Submit the AutoML job

The `ml_client.jobs.create_or_update()` method is called with the `text_classification_multilabel_job` object as an argument. 

This method creates a new job or updates an existing job with the specified experiment configuration.

The `create_or_update()` method returns a job object that represents the job in the Azure Machine Learning service backend.

The `returned_job` variable is assigned to the job object returned by the `create_or_update()` method. 

The job object contains information about the job, such as its ID, status, and run history.

Calling `create_or_update()` is what submits the job to the backend for execution. Its status can then be tracked and monitored through the Azure Machine Learning service.

In [11]:
# Send the configured AutoML job to the backend; the returned object
# describes the job that was created.
returned_job = ml_client.jobs.create_or_update(text_classification_multilabel_job)

print("Created job: {}".format(returned_job))

Readonly attribute primary_metric will be ignored in class <class 'azure.ai.ml._restclient.v2023_02_01_preview.models._models_py3.TextClassificationMultilabel'>


Created job: compute: azureml:mwk
creation_context:
  created_at: '2023-05-01T23:30:55.453769+00:00'
  created_by: "\u0645\u0635\u0637\u0641\u0649 \u0648\u0627\u0626\u0644 \u0643\u0645\
    \u0627\u0644 \u0645\u062D\u0645\u062F \u0645\u062D\u0645\u062F \u0639\u0644\u0649"
  created_by_type: User
display_name: goofy_reggae_79wspbhc97
experiment_name: dpv2-nlp-multilabel
id: azureml:/subscriptions/a7ef3688-af58-4835-953c-e51f219fbd0f/resourceGroups/BigData_resource_group/providers/Microsoft.MachineLearningServices/workspaces/BigData_workspace/jobs/goofy_reggae_79wspbhc97
limits:
  max_concurrent_trials: 1
  max_nodes: 1
  max_trials: 1
  timeout_minutes: 120
log_verbosity: info
name: goofy_reggae_79wspbhc97
outputs: {}
primary_metric: accuracy
properties: {}
resources:
  instance_count: 1
  shm_size: 2g
services:
  Studio:
    endpoint: https://ml.azure.com/runs/goofy_reggae_79wspbhc97?wsid=/subscriptions/a7ef3688-af58-4835-953c-e51f219fbd0f/resourcegroups/BigData_resource_group/workspac

### 4.2 Monitor the AutoML job

In [None]:
# Stream the logs of the submitted job into this cell's output.
# The job was already submitted by ml_client.jobs.create_or_update() in section 4.1;
# this call does not start it.
# NOTE(review): stream() presumably blocks until the job finishes - confirm against the SDK docs.
ml_client.jobs.stream(returned_job.name)


## 5. Retrieve Model Information from the Best Trial of the Model
Once all the trials complete training, we can retrieve the best model.



### 5.1 Obtain best child run id

In [13]:
# Fetch the job again by name and read the best child run id from its tags.
returned_nlp_job = ml_client.jobs.get(returned_job.name)
best_child_run_id = returned_nlp_job.tags["automl_best_child_run_id"]

### 5.2 Obtain the tracking URI for MLFlow

In [15]:
# Point MLflow at the workspace's tracking server.

# pip install azureml-mlflow
import mlflow

# Look up the current workspace through MLClient and read its tracking URI.
current_workspace = ml_client.workspaces.get(name=ml_client.workspace_name)
MLFLOW_TRACKING_URI = current_workspace.mlflow_tracking_uri

# Register the URI with MLflow and echo back what MLflow now reports.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print(f"\nCurrent tracking uri: {mlflow.get_tracking_uri()}")

### 5.3 Get the AutoML parent Job

In [17]:
from mlflow.tracking.client import MlflowClient

# MLflow client used for all run look-ups below.
mlflow_client = MlflowClient()

# The AutoML parent job is looked up in MLflow under the job's name.
job_name = returned_job.name
mlflow_parent_run = mlflow_client.get_run(job_name)

# Header line, then the run itself on the next line.
print("Parent Run: ", mlflow_parent_run, sep="\n")

Parent Run: 
<Run: data=<RunData: metrics={'AUC_macro': 0.4704729801043168,
 'AUC_micro': 0.8810082427131436,
 'AUC_weighted': 0.5188226592089041,
 'accuracy': 0.6140350877192983,
 'average_precision_score_macro': 0.1085165195178471,
 'average_precision_score_micro': 0.7414656351057535,
 'average_precision_score_weighted': 0.6432029684560076,
 'balanced_accuracy': 0.0625,
 'f1_score_macro': 0.060810810810810814,
 'f1_score_micro': 0.7346938775510204,
 'f1_score_weighted': 0.5837837837837838,
 'log_loss': 3.625166160127888,
 'norm_macro_recall': 0.14666666666666667,
 'precision_score_macro': 0.05921052631578947,
 'precision_score_micro': 0.9473684210526315,
 'precision_score_weighted': 0.5684210526315789,
 'recall_score_macro': 0.0625,
 'recall_score_micro': 0.6,
 'recall_score_weighted': 0.6}, params={}, tags={'Name': 'BigData-Text-Classification-Multilabel',
 'automl_best_child_run_id': 'goofy_reggae_79wspbhc97_HD_0',
 'fit_time_000': 'NaN',
 'is_gpu': 'True',
 'iteration_000': '0',
 

### 5.4 Get the AutoML best child run

In [18]:
# Look up the best child run by the id obtained in section 5.1.
# The same id is also available from the parent run:
# best_child_run_id = mlflow_parent_run.data.tags["automl_best_child_run_id"]
best_run = mlflow_client.get_run(best_child_run_id)

# Each header is followed by its value on the next line.
print("Best child run: ", best_run, sep="\n")
print("Best child run metrics: ", best_run.data.metrics, sep="\n")

Best child run: 
<Run: data=<RunData: metrics={'AUC_macro': 0.4704729801043168,
 'AUC_micro': 0.8810082427131436,
 'AUC_weighted': 0.5188226592089041,
 'accuracy': 0.6140350877192983,
 'average_precision_score_macro': 0.1085165195178471,
 'average_precision_score_micro': 0.7414656351057535,
 'average_precision_score_weighted': 0.6432029684560076,
 'balanced_accuracy': 0.0625,
 'f1_score_macro': 0.060810810810810814,
 'f1_score_micro': 0.7346938775510204,
 'f1_score_weighted': 0.5837837837837838,
 'log_loss': 3.625166160127888,
 'norm_macro_recall': 0.14666666666666667,
 'precision_score_macro': 0.05921052631578947,
 'precision_score_micro': 0.9473684210526315,
 'precision_score_weighted': 0.5684210526315789,
 'recall_score_macro': 0.0625,
 'recall_score_micro': 0.6,
 'recall_score_weighted': 0.6}, params={}, tags={'hyperparameters': '{"ignored_argument": 0}',
 'mlflow.parentRunId': 'goofy_reggae_79wspbhc97_HD',
 'mlflow.rootRunId': 'goofy_reggae_79wspbhc97',
 'mlflow.runName': 'modest_

## 6. Download the best model locally

Access the results (such as Models, Artifacts, Metrics) of a previously completed AutoML Run.

In [19]:
import os
from mlflow.artifacts import download_artifacts

# Local folder that receives the downloaded artifacts.
local_dir = "./artifact_downloads"
# makedirs(..., exist_ok=True) creates the folder (and any missing parents) and is a
# no-op when it already exists, so there is no separate existence check to race against.
os.makedirs(local_dir, exist_ok=True)

# Download the "outputs" artifacts of the best run into local_dir.
local_path = download_artifacts(
    run_id=best_run.info.run_id, artifact_path="outputs", dst_path=local_dir
)
print("Artifacts downloaded in: {}".format(local_path))
print("Artifacts: {}".format(os.listdir(local_path)))

Artifacts downloaded in: /media/mostafa/CUFE/CMP4/2nd term/Big Data/Labs/azure lab/artifact_downloads/outputs
Artifacts: ['all_results.json', 'conda_env_v_1_0_0.yml', 'config.json', 'generated_code', 'metrics.csv', 'mlflow-model', 'model.pkl', 'pytorch_model.bin', 'run_id.txt', 'score_script.py', 'scoring_file_v_1_0_0.py', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'trainer_state.json', 'training_args.bin', 'train_results.json', 'vocab.txt']


In [20]:
# Show the contents of the MLFlow model folder.
# The path is derived from `local_path` (returned by download_artifacts in the previous
# cell) instead of repeating the download location as a hard-coded string, so this cell
# keeps working if the download folder is changed.
os.listdir(os.path.join(local_path, "mlflow-model"))

['conda.yaml',
 'data',
 'input_example.json',
 'MLmodel',
 'python_env.yaml',
 'requirements.txt']