In [12]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import Dataset
import torch

In [13]:
# Read the symptom records from the CSV file and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='sample_data/data.csv')
data.head(n=5)

Unnamed: 0,Age,Gender,Ethnicity,Region,Socioeconomic Status,Symptom Description,Symptom Severity,Duration of Symptoms (days),Additional Symptoms,Chronic Condition,Allergies,Previous Visits,Potential Condition,Suggested Action,Insurance Status
0,25,female,caucasian,urban,low,persistent cough and shortness of breath,1.0,10.0,fatigue,asthma,penicillin,4.0,food poisoning,go to emergency,yes
1,53,male,asian,rural,low,fatigue and weakness,7.0,21.0,fatigue,diabetes,penicillin,6.0,muscle strain,go to emergency,yes
2,53,female,hispanic,suburban,low,frequent urination,9.0,23.0,fatigue,asthma,penicillin,8.0,diabetes,take home care measures,yes
3,79,female,other,rural,middle,chest pain,7.0,15.0,none,asthma,peanuts,7.0,muscle strain,take home care measures,no
4,79,male,other,urban,high,mild headache,8.0,22.0,none,diabetes,pollen,1.0,covid-19,visit doctor,no


In [14]:
# BioBERT v1.1 was pre-trained on the cased BERT-base vocabulary, so the
# tokenizer has to preserve case; lower-casing the input would mismatch the
# cased word-piece vocabulary.
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1', do_lower_case=False)
# NOTE(review): num_labels=2 builds a binary classification head -- confirm this
# matches the number of distinct classes in the target column used for training.
model = BertForSequenceClassification.from_pretrained('dmis-lab/biobert-v1.1', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Convert the pandas frame into a Hugging Face Dataset and display its summary.
dataset = Dataset.from_pandas(df=data)
dataset

Dataset({
    features: ['Age', 'Gender', 'Ethnicity', 'Region', 'Socioeconomic Status', 'Symptom Description', 'Symptom Severity', 'Duration of Symptoms (days)', 'Additional Symptoms', 'Chronic Condition', 'Allergies', 'Previous Visits', 'Potential Condition', 'Suggested Action', 'Insurance Status'],
    num_rows: 182
})

In [16]:
def tokenization_function(examples):
    """Tokenize the 'Symptom Description' field of a batch of examples.

    Args:
        examples: Batch passed in by ``Dataset.map``; indexed by column name.

    Returns:
        The tokenizer output for the batch, produced with
        ``padding='max_length'`` and ``truncation=True``.
    """
    symptom_texts = examples['Symptom Description']
    encoded = tokenizer(symptom_texts, truncation=True, padding='max_length')
    return encoded

# Apply the tokenizer across the whole dataset, one batch at a time.
tokenization_dataset = dataset.map(function=tokenization_function, batched=True)

Map:   0%|          | 0/182 [00:00<?, ? examples/s]

In [17]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the shuffle
# deterministic, so the same rows land in train/test on every fresh run.
train_test_split = tokenization_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [18]:
from transformers import TrainingArguments

# Hyper-parameters for fine-tuning on CPU.
# The dataset has 182 rows (about 145 for training after the 80/20 split), so
# with a batch size of 16 and 3 epochs the run has only about 30 optimizer
# steps. A fixed 500-step warmup would never finish and the learning rate would
# stay at a small fraction of its target, so the warmup is expressed as a
# fraction of the total steps instead.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    no_cuda=True
)


