# Getting Workspace and Datastore

In [None]:
from azureml.core import Workspace, Datastore, Dataset,Experiment
from azureml.core.runconfig import DataReferenceConfiguration
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration
from azureml.core.script_run_config import ScriptRunConfig
from azureml.core.conda_dependencies import CondaDependencies
from azureml.data.data_reference import DataReference

#Important for Pipelines
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep
from azureml.widgets import RunDetails

In [None]:
# Connect to the workspace described by the local configuration file and
# echo its identifying details, one per line.
ws = Workspace.from_config()
for workspace_detail in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_detail)

# Datastore holding the challenge data (looked up by name in the workspace).
dataStoreName = 'group1datastore'
ds = ws.datastores.get(dataStoreName)

# Folder that contains the scripts submitted with the pipeline.
project_folder = '.'

# Provisioning compute targets for data prep and model training

In [None]:
def _get_or_create_cluster(workspace, cluster_name, vm_size='STANDARD_D2_V2',
                           min_nodes=1, max_nodes=2):
    """Return the compute target named *cluster_name*, creating it if needed.

    Args:
        workspace: Workspace in which the compute target is looked up/created.
        cluster_name: Name of the AmlCompute cluster.
        vm_size: VM size used only when the cluster has to be provisioned.
        min_nodes: Minimum node count used only when provisioning.
        max_nodes: Maximum node count used only when provisioning.

    Returns:
        The compute target, after ``wait_for_completion`` has returned.
    """
    try:
        cluster = ComputeTarget(workspace=workspace, name=cluster_name)
        print('Found existing cluster, use it.')
    except ComputeTargetException:
        # The lookup failed, so provision a new cluster with the requested size.
        compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                               min_nodes=min_nodes,
                                                               max_nodes=max_nodes)
        cluster = ComputeTarget.create(workspace, cluster_name, compute_config)
    cluster.wait_for_completion(show_output=True)
    return cluster


# Cluster for Data Preparation
clusterNameForDataPreparation = "clusterDataPrep"
dataPrepCluster = _get_or_create_cluster(ws, clusterNameForDataPreparation)

# Cluster for Model Training
clusterNameForModelTraining = "clusterTraining"
trainingCluster = _get_or_create_cluster(ws, clusterNameForModelTraining)

# Kept for the cells below: as before, `cpu_cluster` ends up bound to the
# model-training cluster (the last one provisioned).
cpu_cluster = trainingCluster

# Getting a DataReference and a run_config file

In [None]:
# Name of the cluster the run configuration targets.
clusterNameForDataPreparation = 'clusterDataPrep'

# Download the "challenge5" folder of the datastore to /data on the compute,
# overwriting anything already there.
dataReference = DataReferenceConfiguration(
    datastore_name=dataStoreName,
    path_on_datastore="challenge5",
    path_on_compute="/data",
    mode="download",
    overwrite=True,
)

# Conda packages for the run environment, plus azureml-dataprep from pip.
conda_dependencies = CondaDependencies.create(
    conda_packages=['scikit-learn', 'pandas', 'numpy', 'scipy'])
conda_dependencies.add_pip_package('azureml-dataprep')

# Assemble the Python run configuration from the pieces above.
run_config = RunConfiguration(framework="python")
run_config.target = clusterNameForDataPreparation
run_config.data_references = {'myDataStore': dataReference}
run_config.environment.python.conda_dependencies = conda_dependencies

# Data preparation script

In [None]:
%%writefile train-on-amlcompute/dataprep.py
import numpy as np
import pandas as pd
import random
from azureml.dataprep import ColumnSelector
from sklearn.feature_selection import SelectKBest
import scipy.stats as stats
from scipy.stats import chi2_contingency
from sklearn.feature_selection import SelectKBest
from azureml.core import Dataset,Workspace

ws = Workspace.from_config()

# Read the raw training file and register it so the original data stays
# available under its own name (an existing registration is accepted).
dataset = Dataset.auto_read_files('/data/train.csv').register(
    workspace=ws,
    name='datasetRaw-Challenge5',
    description='This is a raw data set for Challenge 5',
    exist_ok=True)

# Take a 20% simple random sample; a fresh 32-bit seed is drawn on every run.
seed = random.randint(0, 2 ** 32 - 1)
sampling_arguments = {'probability': 0.2, 'seed': seed}
datasetAt20 = dataset.sample('simple_random', sampling_arguments)

# Register the sampled dataset.
# NOTE(review): exist_ok is False here, unlike the raw dataset above --
# presumably this fails when the name is already registered; confirm that is
# intended for repeated runs.
datasetAt20 = datasetAt20.register(
    workspace=ws,
    name='dataset20-Challenge5',
    description='This is a data set at 20% for Challenge 5',
    exist_ok=False)

# Work on the first 100K rows of the sampled dataset's definition.
datasetFirst100KRows = datasetAt20.get_definition().take(100000)


# Columns removed up front; this list keeps growing further down the script.
cols_to_drop = [
    "DefaultBrowsersIdentifier",
    "OrganizationIdentifier",
    "PuaMode",
    "SmartScreen",
    "Census_ProcessorClass",
    "Census_InternalBatteryType",
    "Census_IsFlightingInternal",
    "Census_ThresholdOptIn",
    "Census_IsWIMBootEnabled",
    "Census_SystemVolumeTotalCapacity",
]

# Selector matching every column (regex ".*").
all_columns = ColumnSelector(term=".*", use_regex=True)

# Drop the columns above, then clean what is left: replace_na and fill nulls
# with 0 on every column, fill errors in the primary disk capacity with 0,
# and clip total physical RAM to the range [0, 16384].
datasetFirst100KRows = (
    datasetFirst100KRows
    .drop_columns(cols_to_drop)
    .replace_na(all_columns)
    .fill_nulls(all_columns, 0)
    .fill_errors('Census_PrimaryDiskTotalCapacity', 0)
    .clip('Census_TotalPhysicalRAM', 0, 16384)
)

# Converting our datasetFirst100KRows to a Pandas Dataframe
df = datasetFirst100KRows.to_pandas_dataframe()

# Dropping the machine identifier from the exploration frame and scheduling it
# for removal from the dataset as well.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally (`drop(label, 1)`) is rejected by pandas >= 2.0.
explore_df = df.drop(columns='MachineIdentifier')
cols_to_drop.append('MachineIdentifier')

# Getting categorical vs non categorical values
# Split the data into two dataframes - one for each label value
detections_df = explore_df[(explore_df.HasDetections==1)]
nondetections_df = explore_df[(explore_df.HasDetections==0)]

# Get the numeric features
num_cols = ["AVProductsInstalled",
            "AVProductsEnabled",
            "OsBuild",
            "Census_ProcessorCoreCount",
            "Census_InternalBatteryNumberOfCharges",
            "Census_OSBuildNumber",
            "Census_OSBuildRevision",
            "Census_PrimaryDiskTotalCapacity",
            "Census_TotalPhysicalRAM",
            "Census_InternalPrimaryDiagonalDisplaySizeInInches",
            "Census_InternalPrimaryDisplayResolutionHorizontal",
            "Census_InternalPrimaryDisplayResolutionVertical"]

# Get the categorical features: every remaining column that is neither numeric
# nor the label. `remove` raises ValueError if an expected column is missing,
# which surfaces schema drift early.
cat_cols = list(detections_df.columns)
non_cat_cols = num_cols.copy()
non_cat_cols.append("HasDetections")
for col in non_cat_cols:
    cat_cols.remove(col)

# Using Chi-Squared to drop more columns
alpha = 0.005  # significance level below which a categorical column is kept
Y = explore_df["HasDetections"].astype(str)

# Categorical feature selection: test each categorical column against the
# label with a chi-squared test on their contingency table.
for var in cat_cols:
    X = explore_df[var].astype(str)
    df_crosstab = pd.crosstab(Y, X)
    chi2, p, dof, expected = chi2_contingency(df_crosstab)
    if p < alpha:
        print("{0} is IMPORTANT".format(var))
    else:
        print("{0} is not important".format(var))
        cols_to_drop.append(var)

# Use ANOVA to get the most important numeric columns.
# The builtin `float` replaces `np.float`, an alias that was removed in
# NumPy 1.24 (it always meant the builtin float).
X = explore_df[num_cols].astype(float).fillna(0)
y = explore_df["HasDetections"]

# Find the 4 most important numeric columns (SelectKBest with its default
# scoring function).
X_new = SelectKBest(k=4).fit(X, y)

# get_support() yields one boolean per input column, in num_cols order.
for num_col, is_selected in zip(num_cols, X_new.get_support()):
    if is_selected:
        print("{0} is IMPORTANT".format(num_col))
    else:
        print("{0} is not important".format(num_col))
        cols_to_drop.append(num_col)
        
# Further columns that are removed unconditionally.
more_columns = [
    'AVProductStatesIdentifier',
    'OsPlatformSubRelease',
    'OsSuite',
    'OsBuildLab',
    'SkuEdition',
    'SMode',
    'Census_OSVersion',
    'Census_OSBranch',
    'Census_OSEdition',
    'Census_OSSkuName',
    'Census_OSInstallTypeName',
    'Census_OSWUAutoUpdateOptionsName',
    'Census_ActivationChannel',
    'CountryIdentifier',
    'AvSigVersionEncoded',
    'Platform',
    'Processor',
    'Census_MDC2FormFactor',
    'Census_DeviceFamily',
    'Census_PrimaryDiskTypeName',
    'Census_OSArchitecture',
    'Census_GenuineStateName',
    'Census_PowerPlatformRoleName',
    'AvSigVersion',
    'Census_ChassisTypeName',
]
cols_to_drop.extend(more_columns)

# Apply the complete drop list to the dataset definition.
datasetCleaner = datasetFirst100KRows.drop_columns(cols_to_drop)

# Label encoding (not one-hot): each call adds an "<name>Encoded" column
# derived from the source column.
datasetOneHotEncoded = datasetCleaner.label_encode('EngineVersion','EngineVersionEncoded')
datasetOneHotEncoded = datasetOneHotEncoded.label_encode('AppVersion','AppVersionEncoded')
datasetOneHotEncoded = datasetOneHotEncoded.label_encode('Census_FlightRing','Census_FlightRingEncoded')

# Drop the raw columns now that their encoded counterparts exist.
# The third name must match the column encoded above ('Census_FlightRing');
# the previous spelling "CensusFlightRing" left the raw column in place.
cols_to_drop = ["EngineVersion",
                "AppVersion",
                "Census_FlightRing"]
datasetCleaned = datasetOneHotEncoded.drop_columns(cols_to_drop)

# Normalization: min-max scale using the given lower/upper bounds.
datasetNormalized = datasetCleaned.min_max_scale('AVProductsInstalled',0,5)
datasetNormalized = datasetNormalized.min_max_scale('Census_TotalPhysicalRAM',512,16384)

# Updating definition so the registered 20% dataset carries the transformations
datasetAt20.update_definition(datasetNormalized,'Applied transformations for data prep')

# Materialize the prepared dataset and write it out as CSV.
# NOTE(review): the file name says "Challenge1" while everything else refers
# to Challenge 5 -- confirm which name consumers expect before renaming.
df = datasetAt20.to_pandas_dataframe()
df.to_csv('preppedChallenge1.csv')

# Train step

In [None]:
# Reference to the "challenge5" folder of the datastore, downloaded to /data
# on the compute and handed to the step as an input.
newDataReference = DataReference(
    datastore=ds,
    path_on_datastore="challenge5",
    path_on_compute="/data",
    mode="download",
    overwrite=True,
)

# Single pipeline step that runs the data preparation script.
# NOTE(review): the step is given `cpu_cluster` as compute target while
# run_config.target names the data-preparation cluster -- confirm which
# cluster this step is meant to run on.
trainStep = PythonScriptStep(
    name="trainStep",
    script_name="train-on-amlcompute/dataprep.py",
    source_directory=project_folder,
    compute_target=cpu_cluster,
    runconfig=run_config,
    inputs=[newDataReference],
    allow_reuse=True,
)
print("Train step created")

# Build the pipeline from its steps.
steps = [trainStep]
myPipeline = Pipeline(workspace=ws, steps=steps)
print("Pipeline is built")

# Validate before submitting.
myPipeline.validate()
print("Pipeline validation complete")

# Submit the pipeline under the experiment; outputs are not regenerated.
experiment = Experiment(ws, 'Challenge5JorgeExperiment')
myPipelineRun = experiment.submit(myPipeline, regenerate_outputs=False)
print("Pipeline is submitted for execution")

In [None]:
# Show the run-details widget for the submitted pipeline run.
run_details_widget = RunDetails(myPipelineRun)
run_details_widget.show()