# Create experiment

In [1]:
import azureml.core
from azureml.core import Workspace
# Load the workspace through Workspace.from_config(), then print the SDK
# version and the workspace's name, location and resource group.
ws = Workspace.from_config()
sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", sdk_version)
workspace_summary = (ws.name, ws.location, ws.resource_group)
print(*workspace_summary, sep='\t')

Azure ML SDK Version:  1.0.74
azure-ml-test	eastus2	azureml


In [2]:
from azureml.core import Experiment
# Experiment under which runs submitted from this notebook are grouped.
# NOTE(review): the saved output of the submit cell lists the experiment as
# "FakeNewsDetectSmall" - confirm which name is intended before re-running.
experiment_name = "FakeNewsDetect"
experiment = Experiment(workspace=ws, name=experiment_name)

# Provision the compute target

In [3]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Cluster settings; each one can be overridden through an environment variable.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "gpu-cluster")
# Environment variables are always strings, so the node counts are converted
# to int before they reach the provisioning configuration.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 1))

# The default SKU, STANDARD_NC6, is a GPU VM. Set AML_COMPUTE_CLUSTER_SKU to
# choose a different VM size.
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_NC6")


if compute_name in ws.compute_targets:
    # Reuse the compute target already registered under this name.
    compute_target = ws.compute_targets[compute_name]
    if compute_target and isinstance(compute_target, AmlCompute):
        print("found compute target: " + compute_name)
    else:
        # Make the mismatch visible instead of silently continuing.
        print("WARNING: compute target '" + compute_name
              + "' exists but is not an AmlCompute cluster: "
              + type(compute_target).__name__)
else:
    print("creating new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=compute_min_nodes,
        max_nodes=compute_max_nodes)

    # Create the cluster.
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # Can poll for a minimum number of nodes and for a specific timeout.
    # If no min node count is provided it will use the scale settings for the cluster.
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status().
    print(compute_target.get_status().serialize())

found compute target: gpu-cluster


# Write the training script

In [4]:
import os
# Folder under the current working directory that will hold the training
# script; it is created if it does not exist yet.
working_dir = os.getcwd()
script_folder = os.path.join(working_dir, "ml_scripts")
os.makedirs(script_folder, exist_ok=True)

In [5]:
%%writefile $script_folder/train.py

import azureml.core
from azureml.core import Workspace, Experiment, Datastore
import argparse
import os

# Install the packages this script needs.
# pip is invoked through the running interpreter (sys.executable -m pip) so
# the packages are installed into the environment that executes this script,
# rather than whichever `pip` happens to be first on PATH. A failed install is
# reported instead of being silently ignored; the script still continues so
# that an already-installed package does not turn into a hard failure.
import subprocess
import sys

for requirement in ("transformers", "pandas", "azureml-dataprep[fuse,pandas]"):
    exit_code = subprocess.call([sys.executable, "-m", "pip", "install", requirement])
    if exit_code != 0:
        print("WARNING: pip install failed for " + requirement
              + " (exit code " + str(exit_code) + ")")

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd

# Connect to the workspace, the experiment used for logging, and the
# 'workspaceblobstore' datastore.
# NOTE(review): the subscription id, resource group and workspace name are
# hard-coded here - consider supplying them through configuration instead.
workspace_settings = {
    "subscription_id": "6e12bb22-e64d-4e66-b7ad-a5ef94af1799",
    "resource_group": "AzureML",
    "workspace_name": "azure-ml-test",
}
ws = Workspace(**workspace_settings)
experiment = Experiment(workspace=ws, name="FakeNewsDetectSmall")
datastore = Datastore.get(ws, datastore_name='workspaceblobstore')

# Load the registered training dataset into a pandas DataFrame.
train_set_name = "FakeNewsTrain"
train_set = azureml.core.Dataset.get_by_name(workspace=ws, name=train_set_name)
df_train = train_set.to_pandas_dataframe()

# Drop rows where either title is missing, or where the second title is an
# empty string or the literal '0'.
title1 = df_train['title1_zh']
title2 = df_train['title2_zh']
empty_title = title2.isnull() | title1.isnull() | title2.eq('') | title2.eq('0')
df_train = df_train[~empty_title]
print(df_train.head())

# Keep only samples whose titles are each at most MAX_LENGTH characters long.
MAX_LENGTH = 32
df_train = df_train[df_train.title1_zh.map(len) <= MAX_LENGTH]
df_train = df_train[df_train.title2_zh.map(len) <= MAX_LENGTH]

# Train on a fixed-seed random fraction of the remaining data.
SAMPLE_FRAC = 0.1
df_train = df_train.sample(frac=SAMPLE_FRAC, random_state=9527)

# Keep only the two title columns and the label, renumber the rows from 0,
# and rename the title columns to text_a / text_b.
df_total = (
    df_train.reset_index(drop=True)
    .loc[:, ['title1_zh', 'title2_zh', 'label']]
    .rename(columns={'title1_zh': 'text_a', 'title2_zh': 'text_b'})
)

# Split again: a fixed-seed 80% sample for training, the rest for validation.
df_train = df_total.sample(frac=0.8, random_state=9527)
df_valid = df_total.drop(index=df_train.index)

print("训练样本数： ", len(df_train))
print(df_train.head())

print("验证样本数： ", len(df_valid))
print(df_valid.head())

# Load the registered test dataset and keep the two title columns plus the
# row id, renamed to text_a / text_b / Id.
test_set_name = "FakeNewsTest"
test_set = azureml.core.Dataset.get_by_name(workspace=ws, name=test_set_name)
df_test = (
    test_set.to_pandas_dataframe()
    .loc[:, ["title1_zh", "title2_zh", "id"]]
    .rename(columns={"title1_zh": "text_a", "title2_zh": "text_b", "id": "Id"})
)

print("预测样本数： ", len(df_test))
print(df_test.head())

################################################################

class FakeNewsDataset(Dataset):
    """Dataset of title pairs encoded as token-id and segment-id tensors.

    Each item is a (tokens_tensor, segments_tensor, label_tensor) triple;
    label_tensor is None in "test" mode.
    """

    def __init__(self, mode, tokenizer):
        """Pick the module-level frame for `mode` and remember the tokenizer.

        Args:
            mode: one of "train", "valid" or "test"; selects df_train,
                df_valid or df_test respectively.
            tokenizer: object providing tokenize() and convert_tokens_to_ids().
        """
        assert mode in ["train", "valid", "test"]
        self.mode = mode
        if mode == "train":
            frame = df_train
        elif mode == "valid":
            frame = df_valid
        else:
            frame = df_test

        # Rows containing any missing value are discarded.
        self.df = frame.dropna()
        self.len = len(self.df)
        self.label_map = {'agreed': 0, 'disagreed': 1, 'unrelated': 2}
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Encode the row at position `idx` as (tokens, segments, label)."""
        if self.mode != "test":
            text_a, text_b, label = self.df.iloc[idx, :].values
            label_tensor = torch.tensor(self.label_map[label])
        else:
            # Test rows carry no label; only the first two columns are text.
            text_a, text_b = self.df.iloc[idx, :2].values
            label_tensor = None

        # First sentence: [CLS] tokens_a [SEP]
        first_segment = ["[CLS]"] + self.tokenizer.tokenize(text_a) + ["[SEP]"]
        # Second sentence: tokens_b [SEP]
        second_segment = self.tokenizer.tokenize(text_b) + ["[SEP]"]

        # Convert the combined token sequence to ids.
        token_ids = self.tokenizer.convert_tokens_to_ids(first_segment + second_segment)
        tokens_tensor = torch.tensor(token_ids)

        # Segment ids tell the two sentences apart: 0 for the first, 1 for the second.
        segment_ids = [0] * len(first_segment) + [1] * len(second_segment)
        segments_tensor = torch.tensor(segment_ids, dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        """Number of rows left after dropping missing values."""
        return self.len


def create_mini_batch(samples):
    """Collate (tokens, segments, label) samples into zero-padded batch tensors.

    Args:
        samples: non-empty list of (tokens_tensor, segments_tensor,
            label_tensor) triples; label_tensor may be None.

    Returns:
        (tokens_tensors, segments_tensors, masks_tensors, label_ids), where
        the first three are padded to the longest sample in the batch,
        masks_tensors is 1 wherever the token id is non-zero, and label_ids
        is None when the first sample carries no label.
    """
    # Labelled (train/valid) or unlabelled (test) batch?
    first_label = samples[0][2]
    if first_label is None:
        label_ids = None
    else:
        label_ids = torch.stack([sample[2] for sample in samples])

    # Zero-pad every sequence to the longest one in the batch.
    tokens_tensors = pad_sequence([sample[0] for sample in samples], batch_first=True)
    segments_tensors = pad_sequence([sample[1] for sample in samples], batch_first=True)

    # Attention mask: 1 for real tokens, 0 for padding.
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

################################################################

# Build the tokenizer and the training / validation datasets and loaders.
BATCH_SIZE = 64
PRETRAINED_MODEL_NAME = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

trainset = FakeNewsDataset("train", tokenizer=tokenizer)
# Reshuffle the training samples every epoch so the optimizer does not see
# the batches in the same fixed order each time.
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True,
                         collate_fn=create_mini_batch)

validset = FakeNewsDataset("valid", tokenizer=tokenizer)
# Evaluation does not depend on sample order, so the default order is kept.
validloader = DataLoader(validset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

# Load the pretrained BERT sequence-classification model with NUM_LABELS outputs.
NUM_LABELS = 3
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

# Move the model to the first GPU when CUDA is available, otherwise use the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("device:", device)
model = model.to(device)

# Print an overview of the model: for the "bert" child only the names of its
# sub-modules, for every other top-level child its name and full repr.
for name, module in model.named_children():
    if name != "bert":
        print("{:15} {}".format(name, module))
        continue
    for child_name, _ in module.named_children():
        print(f"{name}:{child_name}")

def get_predictions(model, dataloader, compute_acc=False):
    """Run `model` over `dataloader` and collect the predicted class indices.

    The model is switched to evaluation mode while predicting, so dropout
    does not perturb the outputs, and its previous train/eval mode is
    restored before returning. Batches are moved to whatever device the
    model's parameters live on.

    Args:
        model: module called as model(input_ids=..., token_type_ids=...,
            attention_mask=...) whose first output holds the logits.
        dataloader: iterable of (tokens, segments, masks, labels) batches;
            labels may be None unless compute_acc is True.
        compute_acc: when True, also compare predictions with the labels.

    Returns:
        A 1-D tensor with the predicted class index of every sample (None if
        the dataloader yields no batches), or a (predictions, accuracy)
        tuple when compute_acc is True. The accuracy of an empty dataloader
        is reported as 0.0.
    """
    batch_predictions = []
    correct = 0
    total = 0

    device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Walk through the whole dataset.
            for data in dataloader:
                # Move the batch to the model's device; a missing label stays None.
                data = [t.to(device) if t is not None else None for t in data]

                # Forward pass.
                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)

                # Predicted class = index of the largest logit.
                logits = outputs[0]
                _, pred = torch.max(logits, 1)

                # Track classification accuracy.
                if compute_acc:
                    labels = data[3]
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                batch_predictions.append(pred)
    finally:
        # Put the model back into the mode the caller had it in.
        model.train(was_training)

    predictions = torch.cat(batch_predictions) if batch_predictions else None

    if compute_acc:
        # Guard against an empty dataloader instead of dividing by zero.
        acc = correct / total if total else 0.0
        return predictions, acc
    return predictions
    
# Move the model to the GPU when one is available, otherwise the CPU
# (this repeats the device selection done right after the model was loaded).
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0") if cuda_available else torch.device("cpu")
print("device:", device)
model = model.to(device)

def get_learnable_params(module):
    """Return a list of the parameters of `module` that have requires_grad set."""
    return list(filter(lambda param: param.requires_grad, module.parameters()))
# Collect the trainable parameters of the whole model and of its classifier.
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(model.config)

# Report how many scalar weights each group contains.
n_model_weights = sum(p.numel() for p in model_params)
n_clf_weights = sum(p.numel() for p in clf_params)
print(f"""
分类模型的参数：{n_model_weights}
线性分类器的参数：{n_clf_weights}
""")

################################################################

# Fine-tune the model.

# Adam optimizer over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Checkpoints are written under ./outputs; make sure the folder exists
# before the first save.
os.makedirs("outputs", exist_ok=True)

# NOTE(review): start_logging() starts a new run on `experiment`; if this
# script is itself executed as a submitted run, logging to that run's own
# context may be what is intended - confirm.
run = experiment.start_logging()
EPOCHS = 6
for epoch in range(EPOCHS):
    # Enter training mode at the start of every epoch, because the metric
    # computation at the end of the epoch switches the model to eval mode.
    model.train()
    running_loss = 0.0
    for data in trainloader:

        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # Reset the gradients.
        optimizer.zero_grad()

        # Forward pass; with labels supplied, the first output is the loss.
        outputs = model(input_ids=tokens_tensors, token_type_ids=segments_tensors,
                        attention_mask=masks_tensors, labels=labels)
        loss = outputs[0]

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Accumulate the loss of this batch.
        running_loss += loss.item()

    # Compute the metrics in evaluation mode so dropout does not distort the
    # reported accuracies.
    model.eval()
    _, acc = get_predictions(model, trainloader, compute_acc=True)
    _, v_acc = get_predictions(model, validloader, compute_acc=True)
    print('[Epoch %d] loss: %.3f, acc: %.3f, valid acc: %.3f' %
          (epoch + 1, running_loss, acc, v_acc))
    run.log("Loss", running_loss)
    run.log("Accuracy", acc)
    run.log("Validation accuracy", v_acc)

    # Save a checkpoint of the weights after every epoch.
    model_name = "model_" + str(epoch+1) + ".pkl"
    filename = "outputs/" +  model_name
    torch.save(model.state_dict(), filename)
run.complete()

################################################################

# Predict on the test set and write the predictions to a CSV file.
testset = FakeNewsDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=128, collate_fn=create_mini_batch)

# Predict in evaluation mode so dropout does not randomise the predictions.
model.eval()
predictions = get_predictions(model, testloader)
# Map class indices back to their label names.
index_map = {v: k for k, v in testset.label_map.items()}

df = pd.DataFrame({"Category": [index_map[i] for i in predictions.tolist()]})

# testset.df keeps the original row labels of the rows that survived
# dropna(), while df is numbered 0..n-1. pd.concat(axis=1) aligns on the
# index, so the ids are renumbered first to pair each id with its prediction
# by position.
test_ids = testset.df.loc[:, ["Id"]].reset_index(drop=True)
df_pred = pd.concat([test_ids, df.loc[:, 'Category']], axis=1)
df_pred.to_csv('outputs/bert_prec_training_samples.csv', index=False)
# A bare expression shows nothing when run as a script, so print the preview.
print(df_pred.head())

Overwriting /mnt/azmnt/code/Users/wck593462925/ml_scripts/train.py


In [6]:
from azureml.train.dnn import PyTorch

# Command-line arguments intended for the training script.
# NOTE(review): script_params is not passed to the estimator below, and the
# train.py written by this notebook imports argparse but does not parse any
# arguments - confirm whether this dictionary is still needed.
script_params = {'--regularization': 0.5}

# Estimator that runs train.py from the script folder on one GPU node of the
# compute target.
estimator_settings = dict(
    source_directory=script_folder,
    compute_target=compute_target,
    entry_script='train.py',
    node_count=1,
    use_gpu=True,
)
estimator = PyTorch(**estimator_settings)



In [7]:
# Submit the estimator to the experiment. The bare `run` on the last line
# makes the notebook display the returned run object.
run = experiment.submit(config=estimator)
run

Experiment,Id,Type,Status,Details Page,Docs Page
FakeNewsDetectSmall,FakeNewsDetectSmall_1575446831_f59c9f3b,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation
