# Automated ML (Titanic Survival Prediction Project)

Importing the dependencies needed to complete the project:

In [1]:
#Import libraries needed
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from azureml.data.dataset_factory import TabularDatasetFactory
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Report which version of the Azure ML core SDK this kernel is running
print(f"SDK version: {azureml.core.VERSION}")

SDK version: 1.20.0


In [2]:
# Smoke-test authentication: calling Workspace.from_config() triggers the
# interactive login shown in the output below. The returned workspace is not
# stored here; the next cell calls from_config() again and keeps the result.
Workspace.from_config()

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code AGQ5R88M5 to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.


Workspace.create(name='quick-starts-ws-139092', subscription_id='2c48c51c-bd47-40d4-abbe-fb8eabd19c8c', resource_group='aml-quickstarts-139092')

In [3]:
# Obtain the workspace handle via Workspace.from_config() and keep it as `ws`;
# the later cells pass it to the Experiment, Dataset, Model and compute calls.
ws = Workspace.from_config()

In [4]:
# Experiment under which the AutoML run is submitted, plus the local folder
# that is later passed to AutoMLConfig as `path`
experiment_name = 'automl-experiment-1'
project_folder = './pipeline-project'

experiment = Experiment(workspace=ws, name=experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
automl-experiment-1,quick-starts-ws-139092,Link to Azure Machine Learning studio,Link to Documentation


## Dataset Overview:

As you have probably guessed from the project title, we will be working with the "Titanic Dataset", a classic dataset for learning Machine Learning.

The main task of this project is to build a predictive model that answers the question: “what sorts of people were more likely to survive?” To answer this question we give the model different input variables such as age, the type of cabin the passenger had, etc.

In [7]:
# Column names used when loading the dataset in the next cell.
# NOTE(review): 'demand' and 'timeStamp' are columns of the NYC energy-demand
# data, whereas the AutoML configuration further down uses
# label_column_name="survived" -- confirm which dataset this notebook is meant
# to use. `target_column_name` is not referenced by any other cell shown here.
target_column_name = 'demand'
time_column_name = 'timeStamp'

In [10]:
# Load the source CSV as a TabularDataset, register it in the workspace and
# convert it to a pandas DataFrame for local exploration.
# NOTE(review): this is the NYC energy-demand sample, while the notebook title
# and the AutoML label column ("survived") refer to Titanic data -- confirm
# the intended data source.
key = "Energy Dataset 2"
description_text = "Forecasting Energy Dataset"
data_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/nyc_energy.csv"

# `timestamp` is the current name of the former `fine_grain_timestamp` argument
tabular_dataset = TabularDatasetFactory.from_delimited_files(data_url).with_timestamp_columns(
    timestamp=time_column_name)

# create_new_version=True keeps this cell re-runnable: registering again under
# an existing name adds a version instead of colliding with the earlier one
tabular_dataset = tabular_dataset.register(workspace=ws,
                                           name=key,
                                           description=description_text,
                                           create_new_version=True)

# Later cells expect `dataset` to be a pandas DataFrame
dataset = tabular_dataset.to_pandas_dataframe()
dataset.describe()

Unnamed: 0,demand,precip,temp
count,49124.0,48975.0,49019.0
mean,6067.447361,0.003522,55.520428
std,1285.607657,0.022841,17.704848
min,2859.6,0.0,0.33
25%,5133.86225,0.0,41.415
50%,6020.071,0.0,56.26
75%,6684.3,0.0,70.54
max,11456.0,0.9051,97.26


In [12]:
# Preview the first rows of the loaded DataFrame
dataset.head()

Unnamed: 0,timeStamp,demand,precip,temp
2012-01-01 00:00:00,2012-01-01 00:00:00,4937.5,0.0,46.13
2012-01-01 01:00:00,2012-01-01 01:00:00,4752.1,0.0,45.89
2012-01-01 02:00:00,2012-01-01 02:00:00,4542.6,0.0,45.04
2012-01-01 03:00:00,2012-01-01 03:00:00,4357.7,0.0,45.03
2012-01-01 04:00:00,2012-01-01 04:00:00,4275.5,0.0,42.61


In [11]:
# Make sure a compute cluster is available for the AutoML run
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster to reuse or create
amlcompute_cluster_name = "notebook139092"

try:
    # Looking the cluster up raises ComputeTargetException when it is missing
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    # No such cluster yet: provision a new one.
    # For GPU use vm_size "STANDARD_NC6"; vm_priority='lowpriority' is optional.
    cluster_settings = {
        'vm_size': 'STANDARD_DS12_V2',
        'min_nodes': 1,
        'max_nodes': 6,
    }
    compute_config = AmlCompute.provisioning_configuration(**cluster_settings)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the cluster reports that provisioning has finished
compute_target.wait_for_completion(show_output=True)


Found existing cluster, use it.

Running


In [17]:
from datetime import datetime

# Cut-off timestamp (2017-10-10 05:00): the end of the dataset contains a large
# number of NaN values, so the next cell keeps only the rows before this moment
zeit = datetime(year=2017, month=10, day=10, hour=5)


In [20]:
# Keep only the rows whose timestamp lies strictly before the cut-off `zeit`
dataset = dataset.loc[dataset["timeStamp"] < zeit].copy()

### Clean Data:

In [None]:
#Import your clean data function from the train.py file
from train import clean_data

In [None]:
# Run the clean_data helper imported from train.py on the DataFrame and unpack
# its two results. `x` is used as a DataFrame below (it has `.columns`);
# presumably x holds the features and y the labels -- confirm in train.py.
x, y = clean_data(dataset)

In [None]:
#Scale the features
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature column with StandardScaler.
# The result is kept as a DataFrame with the original column names and row
# index (instead of a bare NumPy array) so that it stays aligned with `y` when
# both are concatenated later, and so that this cell can be re-run
# (`x.columns` still exists afterwards).
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the held-out rows influence the scaling statistics.
variables = x.columns.tolist()

scaler = StandardScaler()
x = pd.DataFrame(scaler.fit_transform(x[variables]),
                 columns=variables,
                 index=x.index)

In [None]:
# Wrap the scaled feature values in a DataFrame labelled with the column names
# collected in `variables`
x = pd.DataFrame(x,columns=variables)


In [None]:
# Split features and labels into an 80% training part and a 20% test part;
# random_state=0 fixes the shuffle so the split is reproducible.
# Only x_train / y_train are used by the cells shown in this notebook.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,
                                                    test_size = 0.2,
                                                    random_state=0)

In [None]:
# Re-attach the labels to the training features.
# train_test_split returns x_train and y_train in the same row order, so both
# get a fresh positional index before concatenating. Without this, pd.concat
# aligns on index labels and yields NaN / misaligned rows whenever the two
# indexes differ (x was rebuilt from a NumPy array and may no longer share
# y's index). The index is not written to the CSV later, so nothing is lost.
dataset = pd.concat([x_train.reset_index(drop=True),
                     y_train.reset_index(drop=True)],
                    axis=1)

In [None]:
# Preview the combined training frame (features plus label column)
dataset.head()

In [None]:
# To train the model we need a TabularDataset rather than a DataFrame, so the
# current DataFrame is converted into a TabularDataset in the following cells.

# File name for the CSV export of the DataFrame
local_path = 'prepared.csv'

# Save the DataFrame locally as CSV, without the index column
dataset.to_csv(local_path,index=None)

# Get a handle to the workspace's default datastore
datastore = ws.get_default_datastore()

In [None]:
# Upload only the prepared CSV into the 'data' folder of the datastore.
# Uploading src_dir='.' would push every file in the working directory
# (notebooks, workspace config, model pickles, ...) to the datastore.
# overwrite=True ensures a stale prepared.csv from an earlier run is replaced,
# so the dataset created in the next cell reflects the current data.
datastore.upload_files(files=[local_path],
                       target_path='data',
                       overwrite=True,
                       show_progress=True)

In [None]:
# Sanity check: display what datastore.path() returns for the default datastore
datastore.path()

In [None]:
# Turn the uploaded CSV into a TabularDataset; it is stored in
# `training_dataset` and handed to the AutoML configuration below
csv_location = (datastore, 'data/prepared.csv')
training_dataset = Dataset.Tabular.from_delimited_files(path=[csv_location])

In [None]:
# Preview the first rows of the TabularDataset by converting it to a DataFrame
training_dataset.to_pandas_dataframe().head()

## AutoML Configuration

Below we will choose the AutoML settings and configuration.

In [None]:
# Settings that are unpacked into AutoMLConfig as keyword arguments
automl_settings = dict(
    experiment_timeout_minutes=20,
    max_concurrent_iterations=5,
    primary_metric='accuracy',
)

# Classification run on the compute cluster, trained on `training_dataset`
# with "survived" as the label column
automl_config = AutoMLConfig(
    task="classification",
    compute_target=compute_target,
    training_data=training_dataset,
    label_column_name="survived",
    path=project_folder,
    featurization='auto',
    enable_early_stopping=True,
    debug_log="automl_errors.log",
    **automl_settings,
)

In [None]:
# Submit the AutoML configuration to the experiment; show_output=True streams
# the run's progress into the cell output
automl_run = experiment.submit(automl_config,show_output=True)

In [None]:
# Show the run-details widget for the submitted AutoML run
from azureml.widgets import RunDetails

RunDetails(automl_run).show()

# Block until the AutoML run has finished
automl_run.wait_for_completion()

In [None]:
# Display what automl_run.get_output() returns.
# NOTE(review): the next cell calls get_output() again and unpacks the result,
# so this call is only here for its displayed output.
automl_run.get_output()


In [None]:
# Unpack the best child run and its fitted model from the AutoML run
best_automl_run, best_model = automl_run.get_output()

# Report which run and which model were selected
for label, value in (('Best AutoML run: ', best_automl_run),
                     ('Best AutoML model :', best_model)):
    print(label, value)

# The model name is stored in the run's properties
model_name = best_automl_run.properties['model_name']
print('Best_model name: ', model_name)

# Display every property recorded for the best run
best_automl_run.get_properties()

In [None]:
# Print the id of the best AutoML child run
print(best_automl_run.id)

In [None]:
# Serialise the best AutoML model to a local file; the registration cell below
# uploads this file as the model artefact

import joblib

joblib.dump(best_model, 'best_automl_model.pkl')

In [None]:
# Register the saved AutoML model in the workspace so it can be deployed later
from azureml.core.model import Model

description = 'AutoML Model trained on the titanic dataset'
tags = dict(area='data science beginners', type='classification')

automl_model = Model.register(
    workspace=ws,
    model_name='best-titanicMLmodel',
    model_path='best_automl_model.pkl',
    description=description,
    tags=tags,
)

# Print the id of the parent AutoML run, tab-separated from its label
print('AutoML RunID: ', automl_run.id, sep='\t')

In [None]:
#Prepare deploying of the model as a web service
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core import Environment
from azureml.core.model import Model
from azureml.core.conda_dependencies import CondaDependencies


In [None]:
# Fetch the environment named 'AzureML-AutoML' from the workspace; it is used
# for the inference configuration of the web service below
env = Environment.get(workspace=ws, name='AzureML-AutoML')

In [None]:
# Check the environment's conda dependencies by printing them as text
print("packages", env.python.conda_dependencies.serialize_to_string())

In [None]:
# Inference configuration: the scoring entry script runs inside `env`
inference_config = InferenceConfig(environment=env, entry_script='score.py')

# ACI deployment configuration: 1 CPU core, 4 GB of memory, key-based
# authentication and Application Insights enabled
aci_settings = dict(
    cpu_cores=1,
    memory_gb=4,
    enable_app_insights=True,
    auth_enabled=True,
)
aci_config = AciWebservice.deploy_configuration(**aci_settings)


In [None]:
# Deploy the registered model as a web service
service_name = 'my-ml-service'

# Look the registered model up by name in the workspace
model = Model(ws, name='best-titanicMLmodel')

deployment_args = dict(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=aci_config,
    overwrite=True,
)
service = Model.deploy(**deployment_args)

# Block until the deployment has finished, streaming its progress
service.wait_for_deployment(show_output=True)

In [None]:
# Display the logs of the deployed web service
service.get_logs()

In [None]:
# print service state
print(service.state)
# print scoring URI
print('scoring URI: ' + service.scoring_uri)
# print Swagger URI
print('Swagger URI: ' + service.swagger_uri)
# retrieve authentication keys
primary, secondary = service.get_keys()
# Never echo the full key: cell outputs are saved with the notebook and would
# leak the credential. Show only the last four characters as a fingerprint.
masked_primary = '*' * max(len(primary) - 4, 0) + primary[-4:]
print('Primary Authentication Key: ' + masked_primary)

In [None]:
# Store the scoring endpoint and the primary authentication key in variables.
# Both are read from the deployed service instead of being pasted in as
# literals: a hard-coded key is a committed secret, and a hard-coded URI goes
# stale as soon as the service is redeployed.
# NOTE(review): a key that was previously committed in this notebook should be
# regenerated.
scoring_uri = service.scoring_uri

key = service.get_keys()[0]

In [None]:
# Send two sample passengers to the deployed scoring endpoint.
# `scoring_uri` and `key` come from the previous cell.
import json
import requests

# JSON body plus Bearer-token authentication with the service key
headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {key}',
}

# Feature values are already standardised, matching the scaled training data
test_data = json.dumps({'data': [{
    'pclass': 0.8419164182590155,
    'age': -0.34907541344456255,
    'sibsp': -0.47908676070718687,
    'parch': -0.444999501816175,
    'fare': -0.4902404567566683,
    'age_NA': -0.5014319838391105,
    'fare_NA': -0.027650063180466557,
    'sex_male': 0.743496915331831,
    'cabin_Missing': 0.5393765119990418,
    'cabin_Rare': -0.42592011250734235,
    'embarked_Q': -0.32204029159373954,
    'embarked_Rare': -0.03911805059269843,
    'embarked_S': 0.6573935670276714,
    'title_Mr': 0.8525918887485938,
    'title_Mrs': -0.42592011250734235,
    'title_Rare': -0.27494677157229536
}]})

# Second sample. Seven values had lost their decimal point (for example
# -15460978645168200 instead of -1.54609786451682), which put them far outside
# the standardised range the model was trained on; they are restored here.
test_data2 = json.dumps({'data': [{
    'pclass': -1.54609786451682,
    'age': 0.8912042887450313,
    'sibsp': -0.47908676070718687,
    'parch': -0.444999501816175,
    'fare': 1.95699003063551,
    'age_NA': -0.5014319838391105,
    'fare_NA': -0.027650063180466557,
    'sex_male': -1.34499549275693,
    'cabin_Missing': -1.85399248531196,
    'cabin_Rare': 2.34785813262753,
    'embarked_Q': -0.32204029159373954,
    'embarked_Rare': -0.03911805059269843,
    'embarked_S': -1.52115878547668,
    'title_Mr': -1.17289410466684,
    'title_Mrs': -0.42592011250734235,
    'title_Rare': -0.27494677157229536
}]})

# A timeout keeps the cell from hanging forever if the endpoint is unreachable
response1 = requests.post(scoring_uri, data=test_data, headers=headers, timeout=60)
response2 = requests.post(scoring_uri, data=test_data2, headers=headers, timeout=60)

print("Classification Prediction:",response1.text)
print("Classification Prediction:",response2.text)

In [None]:
# Store the environment's conda specification in env.yml and print it.
# The specification is serialised once and reused; the `with` block guarantees
# the file is closed even if the write fails.
conda_spec = env.python.conda_dependencies.serialize_to_string()

with open("env.yml", "w") as env_file:
    env_file.write(conda_spec)

print("packages", conda_spec)

In [None]:
# Clean up: delete the deployed web service once testing is finished
service.delete()