In [1]:
!pip install azureml azureml.core azureml.widgets azureml-dataset-runtime transformers azureml.train

Collecting azureml
  Downloading https://files.pythonhosted.org/packages/ab/e8/76cd2cb6784b9039affd2c659eed1b3f46baf2e6b87a10b072a20b5b0113/azureml-0.2.7-py2.py3-none-any.whl
Collecting azureml.core
[?25l  Downloading https://files.pythonhosted.org/packages/bf/56/0fe0576c3efe0cf19aeed7dd64ed412db49522c11134f7ccf50e33ac6281/azureml_core-1.25.0-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.2MB 12.2MB/s 
[?25hCollecting azureml.widgets
[?25l  Downloading https://files.pythonhosted.org/packages/ff/3e/3187ecbefd9de606003a6a44b160b839a8913c40fb10f8ca44d0faacf86c/azureml_widgets-1.25.0-py3-none-any.whl (14.1MB)
[K     |████████████████████████████████| 14.1MB 281kB/s 
[?25hCollecting azureml-dataset-runtime
  Downloading https://files.pythonhosted.org/packages/7a/7a/deb88d9216c374364362e922a7b3aaa66bff96465f3f9e609137bcbd279b/azureml_dataset_runtime-1.25.0-py3-none-any.whl
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4

In [1]:
# Azure Connection Details
setup = False
azure = "private"
modelName = "bert-base-uncased" # "openai-gpt" # 'bert-base-uncased'

subscription_id = "93986b83-8c58-4738-abfd-f7d1cbcce9f8"
resource_group = "master-privat"
workspace_name = "master-privat"
computeResource = "cluster-nd6"

repoDir = "/Users/michael/workspaces/MasterThesis"

scriptsPath = os.path.join(repoDir,"scripts")
trainingDataPath = os.path.join(repoDir,"data","embeddings")
modelPath = os.path.join(repoDir,"data","models")
condaFile = os.path.join(repoDir,"environment.yaml")

Mounted at /content/drive


In [2]:
from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication(tenant_id="b232d827-1e67-4b06-b634-a6a6785fc4bf")

In [3]:
# Establish Connection to Workspace
from azureml.core import Workspace, Dataset
ws = Workspace(subscription_id, resource_group, workspace_name, auth=interactive_auth)
ws

Workspace.create(name='master-privat', subscription_id='93986b83-8c58-4738-abfd-f7d1cbcce9f8', resource_group='master-privat')

In [4]:
ds = ws.get_default_datastore()
ds

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-62ccc7ca-963e-4c24-867d-ab748217e05b",
  "account_name": "masterprivat1227231575",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [5]:
if setup:
  ds.upload(
      src_dir=trainingDataPath,
      target_path="embeddings",
      overwrite=False,
  )

In [31]:
if setup:
  from transformers import BertForSequenceClassification
  model = BertForSequenceClassification.from_pretrained(
                                      modelName,
                                      num_labels = 9,
                                      output_attentions = False,
                                      output_hidden_states = False)
  localPretrainedModelPath = modelPath
  model.save_pretrained(localPretrainedModelPath)
  ds.upload(
      src_dir=localPretrainedModelPath,
      target_path=modelName,
      overwrite=False,
  )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [5]:
from azureml.core import (
    Experiment,
    Environment,
    ScriptRunConfig,
    Dataset,
    Run
)
from azureml.widgets import RunDetails

In [6]:
ds = Dataset.File.from_files((ws.get_default_datastore(),
                              "embeddings/"))
bertPretrained = Dataset.File.from_files((ws.get_default_datastore(),
                              modelName +"/"))

In [7]:
env = Environment.from_conda_specification(
    name="master-thesis-env",
    file_path=condaFile,
)

In [10]:
learningRates = {
    "climate" : {
        "partyGroupIdeology" : 4.991037466501143E-05
    },
    "health" : {
        "partyGroupIdeology" : 4.98261670745635E-05
    }
}

epochs = {
    "climate" : {
        "partyGroupIdeology" : 4
    },
    "health" : {
        "partyGroupIdeology" : 4
    }
}

distributions = [
    {
        "train" : 0.80,
        "test" : 0.10
    }
]

for distribution in distributions:    
    for category in ["health", "climate"]:
        for labels in ["partyGroupIdeology"]:
            script_run_config = ScriptRunConfig(
                source_directory=scriptsPath,
                script="08_cnn_train.py",
                compute_target=computeResource,
                environment=env,
                arguments=[
                    "--data-path",
                    ds.as_mount(),
                    "--pretrained-model",
                    bertPretrained.as_mount(),
                    "--learning-rate",
                    learningRates[category][labels],
                    "--epochs",
                    epochs[category][labels],
                    "--batch-size",
                    24,
                    "--seed",
                    4806,
                    "--category",
                    category,
                    "--labels",
                    labels,
                    "--train_share",
                    distribution["train"],
                    "--test_share",
                    distribution["test"]
                ]

            )
            runSingle = Experiment(
                workspace=ws, 
                name="Train_"+category+"_"+labels
            ).submit(script_run_config)

In [22]:
from azureml.train.hyperdrive import HyperDriveConfig
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, uniform, PrimaryMetricGoal, randint, choice

param_sampling = RandomParameterSampling( {
        'learning-rate': uniform(0.00002, 0.00005)
    }
)

#        'seed' : randint(5000)

early_termination_policy = BanditPolicy(slack_factor=0.05, evaluation_interval=1)

hd_config = HyperDriveConfig(run_config=script_run_config,
                             hyperparameter_sampling=param_sampling,
                             policy=early_termination_policy,
                             primary_metric_name="epoch_train_avg_loss",
                             primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                             max_total_runs=50,
                             max_concurrent_runs=1)

run = Experiment(workspace=ws, name="HyperParameter_"+category+"_"+labels).submit(hd_config)
RunDetails(run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [10]:
## Random Forest Model

categories = ["climate", "health"]
labelsSelection = ["partyGroupIdeology"] 

for category in categories:
    for labels in labelsSelection:
        script_run_config = ScriptRunConfig(
            source_directory=scriptsPath,
            script="09_baseline_random_forest.py",
            compute_target=computeResource,
            environment=env,
            arguments=[
                "--data-path",
                ds.as_mount(),
                "--category",
                category,
                "--labels",
                labels,
                "--train_share",
                0.80,
                "--test_share",
                0.10
            ]

        )
        runSingle = Experiment(
            workspace=ws, name="Forest_"+category+"_"+labels
        ).submit(script_run_config)

In [None]:
# Export graphs
runName = "HyperParameter_"+category+"_"+labels
print("Load "+runName)
trainExperiment = Experiment(ws, name=runName)
trainRuns = trainExperiment.get_runs(tags={"thesis":1})
trainRun: Run = next(trainRuns)
#print()
trainRunChildren = trainRun.get_children()
trainRunChildren: Run = next(trainRunChildren)
print(trainRunChildren)
#for trainRunChild in trainRunChildren:
#    print(trainRunChild)
#    exit()
#print(trainRunChildren)


In [13]:
# Get model to validate
from azureml.core.model import Model

tmpDir = "/Users/michael/Downloads/model"


distributions = [
    {
        "train" : 0.80,
        "test" : 0.10
    }
]

for modelType in ["forest","bert"]:
    for distribution in distributions:
        for category in ["health", "climate"]:
            for labels in ["leftRightPosition","partyGroupIdeology"]:
                model = Model(ws, modelType+'_'+category+'_'+labels+'_train'+str(round(distribution["train"]*100))+'_test'+str(round(distribution["test"]*100)))
                print(model)
                print("Model {}".format(model["id"]))
                model.download(target_dir=tmpDir)

Model(workspace=Workspace.create(name='master-privat', subscription_id='93986b83-8c58-4738-abfd-f7d1cbcce9f8', resource_group='master-privat'), name=forest_health_leftRightPosition_train80_test15, id=forest_health_leftRightPosition_train80_test15:1, version=1, tags={}, properties={})
Model(workspace=Workspace.create(name='master-privat', subscription_id='93986b83-8c58-4738-abfd-f7d1cbcce9f8', resource_group='master-privat'), name=forest_health_partyGroupIdeology_train80_test15, id=forest_health_partyGroupIdeology_train80_test15:1, version=1, tags={}, properties={})
Model(workspace=Workspace.create(name='master-privat', subscription_id='93986b83-8c58-4738-abfd-f7d1cbcce9f8', resource_group='master-privat'), name=forest_climate_leftRightPosition_train80_test15, id=forest_climate_leftRightPosition_train80_test15:1, version=1, tags={}, properties={})
Model(workspace=Workspace.create(name='master-privat', subscription_id='93986b83-8c58-4738-abfd-f7d1cbcce9f8', resource_group='master-privat'

WebserviceException: WebserviceException:
	Message: File already exists. To overwrite, set exist_ok to True. /Users/michael/Downloads/model/bert_climate_partyGroupIdeology_train90_test5.pkl
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "File already exists. To overwrite, set exist_ok to True. /Users/michael/Downloads/model/bert_climate_partyGroupIdeology_train90_test5.pkl"
    }
}