In [60]:
def unique(list1):
 
    # insert the list to the set
    list_set = set(list1)
    # convert the set to the list
    unique_list = (list(list_set))
    # for x in unique_list:
    #     print x,
    return(unique_list)

In [62]:
from datasets import load_dataset, DatasetDict
dataset = load_dataset("gretelai/symptom_to_diagnosis")

train = dataset["train"]
train_label = train[:]["output_text"]

In [63]:
labels=(unique(train_label))
labels_dict = {labels[i]: i for i in range(len(labels))}
print(labels_dict)

{'cervical spondylosis': 0, 'fungal infection': 1, 'bronchial asthma': 2, 'common cold': 3, 'urinary tract infection': 4, 'diabetes': 5, 'allergy': 6, 'varicose veins': 7, 'dengue': 8, 'malaria': 9, 'typhoid': 10, 'impetigo': 11, 'psoriasis': 12, 'hypertension': 13, 'jaundice': 14, 'migraine': 15, 'gastroesophageal reflux disease': 16, 'drug reaction': 17, 'pneumonia': 18, 'arthritis': 19, 'chicken pox': 20, 'peptic ulcer disease': 21}


In [64]:
def to_category(data):
    data_labels = []
    for i in range(len(data)):
        label = [0]*(len(labels))
        label_index = labels_dict[data[i]]
        label[label_index] = 1
        # train_label[i] = label
        data_labels.append(label)
    return {"label": data_labels}

In [76]:
batch_size = 110 # can experiment with different sizes

dataset = dataset.map(function=to_category, input_columns="output_text", remove_columns="output_text", batched=True, batch_size=batch_size)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'label'],
        num_rows: 853
    })
    test: Dataset({
        features: ['input_text', 'label'],
        num_rows: 212
    })
})


In [77]:
train = dataset["train"]
# train_label = train[:]["label"]
# print(train_label)

# Split the 10% test + valid in half test, half valid
# test_valid = dataset["test"].train_test_split(0.5)

# train_test_valid_dataset = DatasetDict({
#     'train': train,
#     'test': test_valid['test'],
#     'validation': test_valid['train']})

train_test_valid_dataset = dataset

print(train_test_valid_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'label'],
        num_rows: 853
    })
    test: Dataset({
        features: ['input_text', 'label'],
        num_rows: 212
    })
})


In [78]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)

def preprocess(sample):
    return tokenizer(sample["input_text"], truncation=True)

In [79]:
tokenized_dataset = train_test_valid_dataset.map(preprocess, batched=True, batch_size=batch_size)
print(tokenized_dataset)


Map:   0%|          | 0/212 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 853
    })
    test: Dataset({
        features: ['input_text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 212
    })
})


In [80]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# DataCollatorForTokenClassification

In [81]:
from evaluate import load
metric = load('accuracy')

In [82]:
import numpy as np

def compute_metrics(preds):
    logits, labels = preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [83]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# batch_size = 110 # can experiment with different sizes

args = TrainingArguments(
    "test", # directory to save the model
    evaluation_strategy = "epoch", # evaluate after each epoch
    save_strategy = "epoch", # save after each epoch
    learning_rate=2e-5, # the learning rate to use
    per_device_train_batch_size=batch_size, # the batch size
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5, # number of epochs; 5 took about 30 minutes
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy" # metric associated with COLA GLUE
)

In [84]:
# autoload a model from the base for sequence classification,
# we pass 22 labels
label2id = labels_dict
id2label = {v: k for k, v in labels_dict.items()}
print(id2label)
print(label2id)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(labels_dict), label2id=label2id, id2label=id2label)

trainer = Trainer(
    model,  # the pre-trained model
    args,  # the TrainingAgruments, defined above
    train_dataset=tokenized_dataset["train"], # the training dataset
    eval_dataset=tokenized_dataset["test"], # the validation dataset
    tokenizer=tokenizer, # our tokenizer
    data_collator=data_collator, # the collator we defined above
    compute_metrics=compute_metrics # our function for computing the metrics
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{0: 'cervical spondylosis', 1: 'fungal infection', 2: 'bronchial asthma', 3: 'common cold', 4: 'urinary tract infection', 5: 'diabetes', 6: 'allergy', 7: 'varicose veins', 8: 'dengue', 9: 'malaria', 10: 'typhoid', 11: 'impetigo', 12: 'psoriasis', 13: 'hypertension', 14: 'jaundice', 15: 'migraine', 16: 'gastroesophageal reflux disease', 17: 'drug reaction', 18: 'pneumonia', 19: 'arthritis', 20: 'chicken pox', 21: 'peptic ulcer disease'}
{'cervical spondylosis': 0, 'fungal infection': 1, 'bronchial asthma': 2, 'common cold': 3, 'urinary tract infection': 4, 'diabetes': 5, 'allergy': 6, 'varicose veins': 7, 'dengue': 8, 'malaria': 9, 'typhoid': 10, 'impetigo': 11, 'psoriasis': 12, 'hypertension': 13, 'jaundice': 14, 'migraine': 15, 'gastroesophageal reflux disease': 16, 'drug reaction': 17, 'pneumonia': 18, 'arthritis': 19, 'chicken pox': 20, 'peptic ulcer disease': 21}


In [85]:
trainer.train()

ValueError: Expected input batch_size (110) to match target batch_size (2420).