In [None]:
from azureml.core import Run
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from argparse import ArgumentParser as AP
import os

In [None]:
# Get the run context
# The Run object obtained here is what the later cells use to read the
# input dataset, log the null counts, and mark the run as complete.
new_run = Run.get_context()

In [None]:
# Get the workspace from the run
# Reached through the run's parent experiment.
# NOTE(review): `ws` is not referenced by any other cell visible here -
# confirm it is still needed.
ws = new_run.experiment.workspace

In [None]:
# Read the input dataset
# The dataset is looked up under the name 'raw_data' in the run's
# input_datasets and converted to a pandas DataFrame.
df = new_run.input_datasets['raw_data'].to_pandas_dataframe()

In [None]:
# Select relevant columns from the dataset
# "ID" is removed; every other column is kept. The column index is captured
# here, before any imputation or encoding, so the null counts can later be
# logged against these column names.
dataPrep = df.drop(columns=["ID"])
all_cols = dataPrep.columns

In [None]:
# Check the missing values
# Per-column count of missing entries, taken before any imputation is applied.
dataNull = dataPrep.isna().sum()

In [None]:
# Replace the missing values of string variable with mode
# Only object-dtype (string) columns are imputed in this step.
cols = dataPrep.select_dtypes(include='object').columns
# The first row of DataFrame.mode() holds one most-frequent value per column.
mode = dataPrep.mode().iloc[0]
# Filling with a Series matches its labels against the column names, so each
# column receives its own mode.
dataPrep[cols] = dataPrep[cols].fillna(value=mode)

In [None]:
# Replace numerical columns with mean
# numeric_only=True restricts the mean to numeric columns. dataPrep still
# contains the string columns at this point, and without the flag
# pandas >= 2.0 raises a TypeError when it tries to average them (older
# versions silently skipped them with a deprecation warning).
mean = dataPrep.mean(numeric_only=True)
# Filling with a Series only touches columns it has a label for, so the
# string columns are left as they are.
dataPrep = dataPrep.fillna(mean)

In [None]:
# Create Dummy variables - Not required in designer/Classic Studio
# pd.get_dummies replaces the string columns with indicator columns;
# drop_first=True drops the first level of each encoded column, so a column
# with k distinct values yields k-1 indicators.
dataPrep = pd.get_dummies(dataPrep, drop_first=True)

In [None]:
# Normalise the data
# Min-max scale the originally numeric columns. The numeric columns are
# identified on the raw frame `df` (so the dummy columns created earlier are
# not picked up), but only those still present in dataPrep are kept: "ID" was
# dropped from dataPrep, and if it is numeric, selecting it here would raise
# a KeyError.
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include='number').columns
columns = numeric_cols[numeric_cols.isin(dataPrep.columns)]
# MinMaxScaler rejects input with zero feature columns, so skip scaling
# entirely when there is nothing numeric to scale.
if not columns.empty:
    dataPrep[columns] = scaler.fit_transform(dataPrep[columns])

In [None]:
# Get the arguments from pipeline job
# --datafolder is the directory the prepared CSV is written into. It is
# marked required because the following cells pass it straight to
# os.makedirs and os.path.join, which fail with an opaque TypeError when the
# value is None; argparse now reports the missing argument by name instead.
parser = AP()
parser.add_argument(
    '--datafolder',
    type=str,
    required=True,
    help='Directory the prepared CSV file is written to.',
)
args = parser.parse_args()

In [None]:
# Create the folder if it does not exist
# exist_ok=True means an already existing folder is not treated as an error.
os.makedirs(args.datafolder, exist_ok=True)

In [None]:
# Create the path
# Full path of the output file: <datafolder>/defaults_prep.csv
path = os.path.join(args.datafolder, 'defaults_prep.csv')

In [None]:
# Write the data preparation output as csv file
# index=False leaves the DataFrame's row index out of the written file.
dataPrep.to_csv(path, index=False)

In [None]:
# Log null values
# One metric per original column: the metric name is the column name and the
# value is the missing-value count recorded before imputation.
for column in all_cols:
    null_count = dataNull[column]
    new_run.log(column, null_count)

In [None]:
# Complete the run
# Called last, after the CSV has been written and the null counts logged.
new_run.complete()