In [160]:
import azureml.core
# Report which version of the Azure ML core SDK this kernel has loaded.
print(f"SDK version: {azureml.core.VERSION}")

SDK version: 1.43.0


In [161]:
import pandas as pd
# Read the three loan CSV files, using each file's id column as the index.
df_loan = pd.read_csv(
    'data/loan/Data_Loan.csv',
    index_col='Id',
)
df_test = pd.read_csv(
    'data/loan/Test Data.csv',
    index_col='ID',
)
df_test_RF = pd.read_csv(
    'data/loan/Sample Prediction Dataset.csv',
    index_col='ID',
)

In [162]:
# Place the two test files side by side (column-wise), then append the
# combined test rows below the main loan frame.
# NOTE(review): both names are rebound from themselves, so executing this
# cell twice without re-running the load cell concatenates a second time.
df_test = pd.concat([df_test, df_test_RF], axis='columns')
df_loan = pd.concat([df_loan, df_test], axis='index')

In [165]:
# Print the (rows, columns) tuple, then render the first five rows.
print(df_loan.shape)
df_loan.head(n=5)

(280000, 12)


Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [113]:
# (rows, columns) of the loan frame at the time this cell was executed.
# NOTE(review): the recorded output of this cell differs from the shape shown
# by the preview cell above, and the execution counts are out of order —
# confirm the intended row count with Restart & Run All.
df_loan.shape

(252000, 12)

# Set up Compute

In [114]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Workspace

# Connect to the workspace described by the local configuration.
ws = Workspace.from_config()
print("\n".join(["Workspace: " + ws.name, "Region: " + ws.location]))

# Name of the CPU cluster to reuse or create
amlcompute_cluster_name = "cpu-cluster"

# Attach to the cluster when it already exists; otherwise provision a new one.
try:
    aml_compute = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        max_nodes=4,
    )
    aml_compute = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait for the compute target and stream its status output.
aml_compute.wait_for_completion(show_output=True)

Workspace: mlopscourse
Region: eastus
Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


# Upload data to Azure Blob

In [115]:
import os
# Local staging folders for the file that is uploaded to the datastore.
dataDir = "data"
loanDir = dataDir + "/loan"

# makedirs also creates the missing parent folder and does nothing when the
# folders already exist, so the cell can be re-run without separate
# exists()/mkdir() checks for each level.
os.makedirs(loanDir, exist_ok=True)

# NOTE(review): the file is written with to_csv, so its content is CSV even
# though the name ends in ".parquet". The name is kept because the upload and
# dataset cells refer to this exact path.
loanData = loanDir + "/unprepared.parquet"

df_loan.to_csv(loanData, index=False)

print("Data written to local folder.")

Data written to local folder.


In [116]:

# Workspace default datastore, used as the upload destination
default_store = ws.get_default_datastore()

# Copy the local file into the datastore's 'loan' folder, replacing any
# file that is already there.
default_store.upload_files(
    files=[loanData],
    target_path='loan',
    overwrite=True,
    show_progress=True,
)

print("Upload calls completed.")

Uploading an estimated of 1 files
Uploading data/loan/unprepared.parquet
Uploaded data/loan/unprepared.parquet, 1 files out of an estimated total of 1
Uploaded 1 files
Upload calls completed.


# Create and register datasets

Bằng cách khởi tạo dataset, ta có thể tạo một liên kết đến địa chỉ data source. Nhờ đó, khi ta áp dụng các kỹ thuật feature engineering lên bộ data, kết quả cũng sẽ được lưu giữ tại datastore của workspace.

In [117]:
from azureml.core import Dataset
# Build a tabular dataset over the uploaded file. It is parsed as a delimited
# text file, matching the to_csv call that produced it.
loan_data_path = default_store.path('loan/unprepared.parquet')
loan_data = Dataset.Tabular.from_delimited_files(path=loan_data_path)

In [118]:
# Register the dataset in the workspace under the name 'loan_data'.
loan_data = loan_data.register(workspace=ws, name='loan_data')

# Create run config

In [119]:
from azureml.core.runconfig import RunConfiguration, DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Run configuration shared by the pipeline steps defined below
aml_run_config = RunConfiguration()

# Execute on the compute target obtained in the compute set-up cell
aml_run_config.target = aml_compute

# Run inside a Docker container
docker = DockerConfiguration(use_docker=True)
aml_run_config.docker = docker

# Have the environment built from the conda specification below instead of
# relying on a user-managed Python installation
aml_run_config.environment.python.user_managed_dependencies = False

# Packages installed into the step environment
conda_deps = CondaDependencies.create(
    conda_packages=['pandas', 'scikit-learn'],
    pip_packages=['azureml-sdk[automl]', 'pyarrow'],
)
aml_run_config.environment.python.conda_dependencies = conda_deps

# Feature Engineering

In [120]:
# Count missing values per column and keep only the columns that have any.
null_counts = df_loan.isna().sum()
miss = null_counts[null_counts > 0]
print(f"Các dữ liệu thiếu:\n{miss}")
print(f"Số lượng cột có dữ liệu thiếu {len(miss)}")

Các dữ liệu thiếu:
Series([], dtype: int64)
Số lượng cột có dữ liệu thiếu 0


In [121]:
# Names of the eleven feature columns followed by the Risk_Flag label column.
columns = [
    'Income',
    'Age',
    'Experience',
    'Married/Single',
    'House_Ownership',
    'Car_Ownership',
    'Profession',
    'CITY',
    'STATE',
    'CURRENT_JOB_YRS',
    'CURRENT_HOUSE_YRS',
    'Risk_Flag',
]

## Remove duplicates:
Loại bỏ dữ liệu nhiễu bằng cách loại bỏ các dữ liệu trùng lặp ở các cột, ngoại trừ cột Risk_Flag (khả năng vỡ nợ)

In [122]:
# Feature columns: every column except the Risk_Flag label. Rows that agree on
# all of these are treated as duplicates, whatever their label is.
attributes = ['Income', 'Age', 'Experience', 'Married/Single', 'House_Ownership',
              'Car_Ownership', 'Profession', 'CITY', 'STATE', 'CURRENT_JOB_YRS',
              'CURRENT_HOUSE_YRS']

# Remember the size before de-duplication so it can still be reported after
# df_loan has been rebound.
shape_before = df_loan.shape

# De-duplicate once; `loan` is an independent copy of the de-duplicated frame
# that the following cells transform further.
df_loan = df_loan.drop_duplicates(subset=attributes)
loan = df_loan.copy()

print('Trước khi loại bỏ nhiễu:', shape_before)
print('Sau khi loại bỏ nhiễu:', loan.shape)


Trước khi loại bỏ nhiễu: (252000, 12)
Sau khi loại bỏ nhiễu: (42007, 12)


In [123]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Folder passed as source_directory to the data-preparation steps below
# (drop_noise.py, encode.py and scaling.py are referenced by those steps).
prepare_data_folder = './prep_data'

In [124]:
# Intermediate dataset that receives the output of the de-duplication step
clean_data = PipelineData("clean_data", datastore=default_store).as_dataset()

prep_scripts_path = os.path.realpath(prepare_data_folder)
print('Remove noise script is in {}.'.format(prep_scripts_path))

# Step that runs drop_noise.py with the registered dataset as its 'raw_data'
# input and clean_data as its output; the output location is handed to the
# script through --output_dropNoise.
cleanStep = PythonScriptStep(
    script_name="drop_noise.py",
    name="drop druplicate Data",
    source_directory=prepare_data_folder,
    inputs=[loan_data.as_named_input('raw_data')],
    outputs=[clean_data],
    arguments=["--output_dropNoise", clean_data],
    runconfig=aml_run_config,
    compute_target=aml_compute,
    allow_reuse=True,
)

print("Remove noise data created.")

Remove noise script is in /mnt/batch/tasks/shared/LS_root/mounts/clusters/cpulong/code/Users/long.nguyen.1839/feat_engineering/prep_data.
Remove noise data created.


## Encode data:
 Chuyển đổi các label dạng string thành dạng numeric

In [125]:
from sklearn import preprocessing
# A single encoder instance is refitted for each categorical column, so after
# the loop it only holds the classes of the last column processed.
le = preprocessing.LabelEncoder()
cols = [
    'Married/Single',
    'House_Ownership',
    'Car_Ownership',
    'Profession',
    'CITY',
    'STATE',
]

# Replace every string category with its integer code.
for col_name in cols:
    loan[col_name] = le.fit_transform(loan[col_name])
loan.head()

Unnamed: 0_level_0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1303834,23,3,1,2,0,33,251,13,3,13,0
2,7574516,40,10,1,2,0,43,227,14,9,13,0
3,3991815,66,4,0,2,0,47,8,12,4,10,0
4,6256451,41,2,1,2,1,43,54,17,2,12,1
5,5768871,47,11,1,2,0,11,296,22,3,14,1


In [126]:
# Intermediate dataset that receives the output of the encoding step
encoded_data = PipelineData("encoded_data", datastore=default_store).as_dataset()

encode_scripts_path = os.path.realpath(prepare_data_folder)
print('Encode script is in {}.'.format(encode_scripts_path))

# Step that runs encode.py on the de-duplicated data (read as parquet) and
# writes to encoded_data; the output location is handed to the script
# through --output_encode.
encodeStep = PythonScriptStep(
    script_name="encode.py",
    name="Encode Loan Data",
    source_directory=prepare_data_folder,
    inputs=[clean_data.parse_parquet_files()],
    outputs=[encoded_data],
    arguments=["--output_encode", encoded_data],
    runconfig=aml_run_config,
    compute_target=aml_compute,
    allow_reuse=True,
)

print("Encode data created.")

Encode script is in /mnt/batch/tasks/shared/LS_root/mounts/clusters/cpulong/code/Users/long.nguyen.1839/feat_engineering/prep_data.
Encode data created.


## Robust scaling

In [127]:
# Discard the existing index in favour of a fresh 0..n-1 index, then keep the
# encoded frame reachable as df_loan for the cells that follow.
loan = loan.reset_index(drop=True)
df_loan = loan

In [128]:
# Robust scaling of the feature columns.
# The input is read from df_loan (the encoded frame bound in the previous
# cell) rather than from `loan`, because this cell rebinds `loan`: reading
# from `loan` would feed already scaled values back into the scaler if the
# cell were executed a second time.
scaler = preprocessing.RobustScaler()
scaled_values = scaler.fit_transform(df_loan[attributes])
loan = pd.DataFrame(scaled_values, columns=attributes)

In [129]:
# Re-attach the label column to the scaled features.
# Selecting loan[attributes] first keeps the cell re-runnable: without it, a
# second execution would append another Risk_Flag column to `loan`.
loan = pd.concat([loan[attributes], df_loan['Risk_Flag']], axis=1)
loan.head()

Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,-0.739674,-0.9,-0.7,0.0,0.0,0.0,0.307692,0.588608,-0.058824,-0.5,0.5,0
1,0.519909,-0.333333,0.0,0.0,0.0,0.0,0.692308,0.436709,0.0,0.5,0.5,0
2,-0.199743,0.533333,-0.6,-1.0,0.0,0.0,0.846154,-0.949367,-0.117647,-0.333333,-1.0,0
3,0.255152,-0.3,-0.8,0.0,0.0,1.0,0.692308,-0.658228,0.176471,-0.666667,0.0,1
4,0.157212,-0.1,0.1,0.0,0.0,0.0,-0.538462,0.873418,0.470588,-0.5,1.0,1


In [130]:
# (rows, columns) of the scaled frame with the label column re-attached.
loan.shape

(42007, 12)

In [131]:
# Intermediate dataset that receives the output of the scaling step
scaled_data = PipelineData("scaled_data", datastore=default_store).as_dataset()

scaling_scripts_path = os.path.realpath(prepare_data_folder)
print('Scalling script is in {}.'.format(scaling_scripts_path))

# Step that runs scaling.py on the encoded data (read as parquet) and writes
# to scaled_data; the output location is handed to the script through
# --output_scale.
scalingStep = PythonScriptStep(
    script_name="scaling.py",
    name="Robust scalling Data",
    source_directory=prepare_data_folder,
    inputs=[encoded_data.parse_parquet_files()],
    outputs=[scaled_data],
    arguments=["--output_scale", scaled_data],
    runconfig=aml_run_config,
    compute_target=aml_compute,
    allow_reuse=True,
)

print("scaled data created.")


Scalling script is in /mnt/batch/tasks/shared/LS_root/mounts/clusters/cpulong/code/Users/long.nguyen.1839/feat_engineering/prep_data.
scaled data created.


## Train-test split

In [132]:
# Folder passed as source_directory to the split step
train_model_folder = './trainmodel'

# Intermediate datasets for the two halves of the split
output_split_train = PipelineData("output_split_train", datastore=default_store).as_dataset()
output_split_test = PipelineData("output_split_test", datastore=default_store).as_dataset()

split_scripts_path = os.path.realpath(train_model_folder)
print('Data spilt script is in {}.'.format(split_scripts_path))

# Step that runs train_test_split.py on the scaled data (read as parquet);
# both output locations are handed to the script as command-line arguments.
testTrainSplitStep = PythonScriptStep(
    script_name="train_test_split.py",
    name="Train Test Data Split",
    source_directory=train_model_folder,
    inputs=[scaled_data.parse_parquet_files()],
    outputs=[output_split_train, output_split_test],
    arguments=[
        "--output_split_train", output_split_train,
        "--output_split_test", output_split_test,
    ],
    runconfig=aml_run_config,
    compute_target=aml_compute,
    allow_reuse=True,
)

print("testTrainSplitStep created.")

Data spilt script is in /mnt/batch/tasks/shared/LS_root/mounts/clusters/cpulong/code/Users/long.nguyen.1839/feat_engineering/trainmodel.
testTrainSplitStep created.


# Auto ML

In [133]:
from azureml.core import Experiment

# Experiment under which the pipeline run is submitted
experiment = Experiment(workspace=ws, name='Fraud_detection_pipeline')

print("Experiment created")

Experiment created


In [134]:
from azureml.train.automl import AutoMLConfig
import json
import logging

# AutoML settings: time limits, concurrency, the metric to optimise, and
# logging / code-generation options.
automl_settings = dict(
    experiment_timeout_hours=0.3,
    enable_early_stopping=True,
    iteration_timeout_minutes=5,
    max_concurrent_iterations=4,
    max_cores_per_iteration=-1,
    # n_cross_validations=2,  (left disabled)
    primary_metric="AUC_weighted",
    featurization="auto",
    verbosity=logging.INFO,
    enable_code_generation=True,
)

# Training input: the train half of the split, read as parquet and narrowed
# to the feature columns plus the Risk_Flag label.
training_columns = [
    'Income', 'Age', 'Experience', 'Married/Single', 'House_Ownership',
    'Car_Ownership', 'Profession', 'CITY', 'STATE', 'CURRENT_JOB_YRS',
    'CURRENT_HOUSE_YRS', 'Risk_Flag',
]
training_dataset = output_split_train.parse_parquet_files().keep_columns(training_columns)

# Classification task on Risk_Flag; KNN and LinearSVM are excluded from the
# candidate models. The options experiment_exit_score,
# enable_onnx_compatible_models and validation_data are left unset.
automl_config = AutoMLConfig(
    task="classification",
    label_column_name='Risk_Flag',
    training_data=training_dataset,
    blocked_models=["KNN", "LinearSVM"],
    compute_target=aml_compute,
    debug_log="automl_errors.log",
    **automl_settings,
)

print("AutoML config created.")

AutoML config created.


In [135]:
from azureml.pipeline.steps import AutoMLStep

# Wrap the AutoML configuration as a pipeline step.
trainWithAutomlStep = AutoMLStep(
    name='AutoML_Classification',
    automl_config=automl_config,
    allow_reuse=True,
)
print("trainWithAutomlStep created.")

trainWithAutomlStep created.


In [136]:
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Only the AutoML step is listed explicitly. The submission log below also
# lists the data-preparation and split steps — presumably resolved through
# the step's input data; confirm against the Pipeline documentation.
pipeline_steps = [trainWithAutomlStep]

pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit without forcing regeneration of outputs.
pipeline_run = experiment.submit(pipeline, regenerate_outputs=False)
print("Pipeline submitted for execution.")

Pipeline is built.
Created step AutoML_Classification [d92a3a67][f080a17b-8cff-43fe-800f-3ee0747dd9cb], (This step will run and generate new outputs)Created step Train Test Data Split [240ce18a][5992be1d-816b-4bbb-88a4-e412f5226119], (This step will run and generate new outputs)

Created step Robust scalling Data [ce20a7f2][d81e1a68-5622-47b3-9b48-1262736bec13], (This step is eligible to reuse a previous run's output)
Created step Encode Loan Data [489c1994][b883c7c7-4515-46b8-91f7-a496cd03da6c], (This step is eligible to reuse a previous run's output)
Created step drop druplicate Data [b945d1e9][30636ead-3f13-462e-85b3-bc33a603cdf7], (This step is eligible to reuse a previous run's output)
Submitted PipelineRun 9b6e2fbc-45b7-4b1c-a138-5715c71db2b9
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/9b6e2fbc-45b7-4b1c-a138-5715c71db2b9?wsid=/subscriptions/bd28012e-c908-4162-ab7f-04c61a03a62a/resourcegroups/datascienceworld/workspaces/mlopscourse&tid=7bbbced8-b31a-4a36-95bb

In [137]:
# Render the run-monitoring widget for the submitted pipeline run.
RunDetails(pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

# Explore results

In [138]:
# The cells below read step outputs, so block here until the pipeline run has
# finished; show_output=False suppresses the streamed log output.
pipeline_run.wait_for_completion(show_output=False)

# functions to download output to local and fetch as dataframe
def get_download_path(download_path, output_name):
    """Return the local folder that holds a downloaded pipeline output.

    The result has the form ``<download_path>/azureml/<folder>/<output_name>``,
    where ``<folder>`` is an entry of ``<download_path>/azureml`` (presumably
    named after the run that produced the output — TODO confirm).

    When that directory holds several entries, for example after downloading
    into the same ``download_path`` more than once, the most recently modified
    entry is used. ``os.listdir`` returns entries in arbitrary order, so taking
    its first element could silently select a stale folder.

    Args:
        download_path: Local directory the output was downloaded into.
        output_name: Name of the pipeline output.

    Returns:
        The path as a string, built with '/' separators.

    Raises:
        FileNotFoundError: If ``<download_path>/azureml`` does not exist.
        IndexError: If ``<download_path>/azureml`` exists but is empty.
    """
    azureml_dir = download_path + '/azureml'
    entries = os.listdir(azureml_dir)
    if not entries:
        raise IndexError('no downloaded output folder found in ' + azureml_dir)

    # Newest entry wins; the name is a tie-breaker so the choice stays
    # deterministic when two entries share a modification time.
    output_folder = max(
        entries,
        key=lambda entry: (os.path.getmtime(azureml_dir + '/' + entry), entry),
    )
    return azureml_dir + '/' + output_folder + '/' + output_name

def fetch_df(current_step, output_name):
    """Download one output of a step run and load it as a DataFrame.

    The output is downloaded to ``./outputs/<output_name>`` (overwriting
    existing files), and ``processed.parquet`` inside the folder returned by
    ``get_download_path`` is read with pandas.

    Args:
        current_step: Step run object that exposes ``get_output_data``.
        output_name: Name of the output to fetch.

    Returns:
        The DataFrame read from the downloaded parquet file.
    """
    local_root = './outputs/' + output_name
    current_step.get_output_data(output_name).download(local_root, overwrite=True)
    parquet_file = get_download_path(local_root, output_name) + '/processed.parquet'
    return pd.read_parquet(parquet_file)

PipelineRunId: 9b6e2fbc-45b7-4b1c-a138-5715c71db2b9
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/9b6e2fbc-45b7-4b1c-a138-5715c71db2b9?wsid=/subscriptions/bd28012e-c908-4162-ab7f-04c61a03a62a/resourcegroups/datascienceworld/workspaces/mlopscourse&tid=7bbbced8-b31a-4a36-95bb-9f06bc9d72a6


In [142]:
# Look up the run of the de-duplication step and load its output locally.
cleanse_step = pipeline_run.find_step_run(cleanStep.name)[0]
cleansed_df = fetch_df(cleanse_step, clean_data.name)

# Dimensions first, then a preview of the first five rows
print(cleansed_df.shape)
display(cleansed_df.head())

(42007, 12)


Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [143]:
# Look up the run of the encoding step and load its output locally.
encode_step = pipeline_run.find_step_run(encodeStep.name)[0]
encoded_df = fetch_df(encode_step, encoded_data.name)

# Dimensions first, then a preview of the first five rows
print(encoded_df.shape)
display(encoded_df.head())

(42007, 13)


Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,__index_level_0__
0,1303834,23,3,1,2,0,33,251,13,3,13,0,0
1,7574516,40,10,1,2,0,43,227,14,9,13,0,1
2,3991815,66,4,0,2,0,47,8,12,4,10,0,2
3,6256451,41,2,1,2,1,43,54,17,2,12,1,3
4,5768871,47,11,1,2,0,11,296,22,3,14,1,4


In [144]:
# Look up the run of the scaling step and load its output locally.
scale_step = pipeline_run.find_step_run(scalingStep.name)[0]
scaled_df = fetch_df(scale_step, scaled_data.name)

# Dimensions first, then a preview of the first five rows
print(scaled_df.shape)
display(scaled_df.head())

(42007, 12)


Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,-0.739674,-0.9,-0.7,0.0,0.0,0.0,0.307692,0.588608,-0.058824,-0.5,0.5,0
1,0.519909,-0.333333,0.0,0.0,0.0,0.0,0.692308,0.436709,0.0,0.5,0.5,0
2,-0.199743,0.533333,-0.6,-1.0,0.0,0.0,0.846154,-0.949367,-0.117647,-0.333333,-1.0,0
3,0.255152,-0.3,-0.8,0.0,0.0,1.0,0.692308,-0.658228,0.176471,-0.666667,0.0,1
4,0.157212,-0.1,0.1,0.0,0.0,0.0,-0.538462,0.873418,0.470588,-0.5,1.0,1


In [146]:

# Look up the run of the split step and load its training output locally.
split_step = pipeline_run.find_step_run(testTrainSplitStep.name)[0]
train_df = fetch_df(split_step, output_split_train.name)

# Dimensions first, then a preview of the first five rows
print(train_df.shape)
display(train_df.head())

(33605, 12)


Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,-0.257675,-0.033333,-0.3,0.0,0.0,1.0,0.538462,0.101266,0.294118,-0.166667,-0.5,0
1,-0.231296,0.933333,0.2,-1.0,0.0,1.0,0.653846,0.468354,0.0,-0.333333,0.0,1
2,0.715899,-0.266667,0.3,0.0,0.0,0.0,-0.230769,-0.06962,0.470588,-0.5,0.5,1
3,0.700409,-0.366667,-0.9,0.0,0.0,0.0,-0.153846,0.35443,-0.529412,-0.833333,0.5,0
4,0.657288,-0.733333,0.3,0.0,0.0,0.0,-0.923077,-0.436709,0.823529,0.166667,-0.5,0


In [148]:
# Look up the run of the split step again (so this cell does not depend on the
# previous one) and load its test output locally.
split_step = pipeline_run.find_step_run(testTrainSplitStep.name)[0]
test_df = fetch_df(split_step, output_split_test.name)

# Dimensions first, then a preview of the first five rows
print(test_df.shape)
display(test_df.head())

(8402, 12)


Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,0.03459,-0.966667,-0.8,0.0,0.0,0.0,-0.692308,0.683544,0.647059,-0.666667,0.0,0
1,0.783744,0.833333,0.4,0.0,0.0,0.0,-0.538462,0.006329,-0.705882,-0.333333,-0.5,0
2,-0.918826,0.0,0.9,0.0,0.0,0.0,0.192308,0.044304,0.352941,-0.333333,1.0,0
3,1.000799,0.4,0.1,0.0,0.0,1.0,0.269231,-0.943038,0.647059,0.0,1.0,0
4,-0.147652,-0.733333,1.0,0.0,0.0,0.0,-0.730769,-0.886076,0.647059,-0.166667,1.0,0
