In [1]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoConfig

In [3]:
from torch import cuda

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# Switch the working directory to the project folder so the relative paths used below resolve there.
# NOTE(review): the recorded run printed "[Errno 2] No such file or directory" for this path and the
# CSV load that follows failed - confirm the folder exists in the mounted Drive before running on.
%cd /content/gdrive/MyDrive/CS224NFinalProj/

[Errno 2] No such file or directory: '/content/gdrive/MyDrive/CS224NFinalProj/'
/content


In [6]:
# Read the raw records and keep an independent copy of just the three columns
# used downstream: record id, department label and free-text description.
df = pd.read_csv("patient_data.csv")
wanted_columns = ['id', 'doctor_faculty', 'description']
new_df = df.loc[:, wanted_columns].copy()
new_df.head()

FileNotFoundError: ignored

In [None]:
# Build lookup tables in both directions between department names and integer class indices.
departments = new_df['doctor_faculty'].unique()
d2ind = {dept: idx for idx, dept in enumerate(departments)}
ind2d = dict(enumerate(departments))

In [None]:
# One-hot encode every row's department in one vectorized step instead of a
# Python-level iterrows() loop: row i of the identity matrix is the one-hot
# vector for class i, so indexing it with the per-row class indices yields
# one float vector of length len(departments) per record.
dept_codes = np.asarray([d2ind[name] for name in new_df['doctor_faculty']], dtype=np.intp)
ground_truths = list(np.eye(len(departments))[dept_codes])
new_df['one_hot'] = ground_truths

In [None]:
# Keep only the model input text and its one-hot target. Re-binding (rather than
# dropping in place) and ignoring already-missing columns makes this cell safe
# to re-run; the in-place version raised KeyError on a second execution.
new_df = new_df.drop(columns=['id', 'doctor_faculty'], errors='ignore')
new_df.head()

In [None]:
## Sections of config

# Hyperparameters read by the dataset, loader and training cells below.
MAX_LEN = 200            # maximum number of tokens per encoded description
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 1e-05

# Pretrained Chinese RoBERTa checkpoint on the Hugging Face Hub that supplies the tokenizer.
# (An earlier experiment used BertTokenizer with 'uer/chinese_roberta_L-8_H-512'.)
MODEL_NAME = "uer/roberta-base-finetuned-chinanews-chinese"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that turns (description, one-hot label) rows into model-ready tensors.

    Args:
        dataframe: frame with a ``description`` column (text) and a ``one_hot``
            column (one label vector per row).
        tokenizer: callable Hugging Face-style tokenizer returning ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: fixed length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.description = dataframe.description
        self.targets = self.data.one_hot
        self.max_len = max_len

    def __len__(self):
        return len(self.description)

    def __getitem__(self, index):
        """Return the encoded example at positional ``index`` as a dict of tensors."""
        # Positional lookup: DataLoader samplers hand out positions 0..len-1, which
        # only coincide with index labels when the frame has a fresh RangeIndex.
        description = str(self.description.iloc[index])
        # Collapse every run of whitespace into a single space.
        description = " ".join(description.split())

        # padding='max_length' replaces the deprecated pad_to_max_length=True, and
        # truncation=True caps longer texts at max_len; without it, long
        # descriptions yield tensors of differing length that cannot be batched.
        inputs = self.tokenizer(
            description,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Creating the dataset and dataloader for the neural network

# Draw a reproducible 80% random sample of the rows for training (seed fixed at 200).
train_size = 0.8
train_dataset = new_df.sample(random_state=200, frac=train_size)

In [None]:
# Rows that were not sampled for training form the test split. Both frames are
# re-indexed from 0 afterwards; the training index labels must be read before
# that happens.
train_rows = train_dataset.index
test_dataset = new_df.drop(index=train_rows).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Report the (rows, columns) shape of each frame.
print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap each split so the loaders receive tokenized tensors.
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)

In [None]:
# Spot-check one encoded test example (the dict of tensors returned by CustomDataset.__getitem__).
testing_set[1]

In [None]:
# Training batches are reshuffled on every pass. Evaluation batches keep the
# frame order: shuffling does not change the metrics and only makes the printed
# predictions differ from run to run.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Training

In [None]:
def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between raw logits and 0/1 targets.

    The sigmoid is applied inside the loss, so ``outputs`` must be
    unnormalized scores with the same shape as ``targets``.
    """
    bce_with_logits = torch.nn.functional.binary_cross_entropy_with_logits
    return bce_with_logits(outputs, targets)

In [None]:
# The number of output classes follows the department mapping instead of a
# hard-coded 10, so the classifier width always matches the one-hot targets.
NUM_LABELS = len(ind2d)
# ignore_mismatched_sizes lets the classification head be re-initialised when its
# shape differs from the one stored in the checkpoint. label2id is passed together
# with id2label so the two mappings in the model config stay consistent.
model = AutoModelForSequenceClassification.from_pretrained(
    "uer/roberta-base-finetuned-chinanews-chinese",
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True,
    id2label=ind2d,
    label2id=d2ind,
)
model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
def train(epoch, log_every=5000):
    """Run one pass over ``training_loader``, updating ``model`` with ``optimizer``.

    Args:
        epoch: epoch number; used only in the progress message.
        log_every: print the loss every ``log_every`` batches (starting with
            batch 0). A falsy value disables the progress message.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Supplying labels makes the model compute the loss itself and return it
        # as the first output.
        # NOTE(review): with float one-hot labels transformers is expected to pick
        # its multi-label (BCE-with-logits) loss - confirm that is intended for a
        # task with exactly one department per record.
        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            labels=targets,
        )
        loss = outputs[0]

        if log_every and step % log_every == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear stale gradients once, then backpropagate and update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Fine-tune for EPOCHS full passes over the training loader.
for epoch in range(EPOCHS):
    train(epoch)

## Validation

In [None]:
def validation(epoch, net=None, loader=None, run_device=None):
    """Predict one department per example and collect predictions with their targets.

    Args:
        epoch: accepted for call compatibility; not used.
        net: model to evaluate; defaults to the notebook-level ``model``.
        loader: iterable of batches (dicts with ``ids``, ``mask``,
            ``token_type_ids`` and ``targets``); defaults to ``testing_loader``.
        run_device: device the inputs are moved to; defaults to ``device``.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists with one entry per example.
        Each prediction is a one-hot NumPy row marking the highest-scoring
        class; each target is the label vector as a plain list.
    """
    net = model if net is None else net
    loader = testing_loader if loader is None else loader
    run_device = device if run_device is None else run_device

    net.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in loader:
            ids = data['ids'].to(run_device, dtype=torch.long)
            mask = data['mask'].to(run_device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(run_device, dtype=torch.long)
            targets = data['targets'].to(dtype=torch.float)

            logits = net(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids).logits
            # Sigmoid is monotonic, so the arg-max of the logits is the arg-max of the
            # probabilities; using the logits also avoids ties between saturated sigmoids.
            idx = logits.argmax(dim=-1).cpu().numpy()
            # Size the one-hot matrix from the logits themselves so a final batch
            # holding a single example works (indexing row 1 used to fail there).
            one_hot = np.zeros(tuple(logits.shape))
            one_hot[np.arange(len(idx)), idx] = 1

            fin_targets.extend(targets.cpu().numpy().tolist())
            fin_outputs.extend(one_hot)
    return fin_outputs, fin_targets

In [None]:
# Score the model once on the held-out split. The EPOCHS config constant is no
# longer overwritten here, so re-running the training cell afterwards still
# trains for the configured number of epochs.
outputs, targets = validation(0)
# Show only a few rows; printing every prediction floods the notebook output.
print('outputs', outputs[:5])
print('targets', targets[:5])
# Predictions are already 0/1 rows; thresholding turns them into the boolean
# indicator matrix handed to the metrics.
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:
# Persist the model's weights (state dict only) to a file in the current working directory.
torch.save(model.state_dict(), "./roberta-base.bin")

In [None]:
# Sanity check: load a fresh copy of the checkpoint with a new classification head
# and confirm the shape of its output on a single training batch.
# NOTE: this rebinds `model`, replacing the fine-tuned network held in memory
# (its weights were written to disk in the previous cell).
num_labels = len(departments)  # matches the width of the one-hot targets
model = AutoModelForSequenceClassification.from_pretrained("uer/roberta-base-finetuned-chinanews-chinese", num_labels=num_labels, ignore_mismatched_sizes=True)
model.to(device)
my_config = AutoConfig.from_pretrained("uer/roberta-base-finetuned-chinanews-chinese", num_labels=num_labels)
print(my_config)

# One batch is enough to check shapes; no gradients are needed for inspection.
model.eval()
with torch.no_grad():
    data = next(iter(training_loader))
    ids = data['ids'].to(device, dtype=torch.long)
    mask = data['mask'].to(device, dtype=torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
    targets = data['targets'].to(device, dtype=torch.float)

    outputs = model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    print(predictions.shape)  # (batch size, number of labels)


In [None]:
# Thin wrapper around the pretrained sequence-classification checkpoint that returns only the logits.
# (A dropout + linear head on top was prototyped earlier but is not part of this module.)

class BERTClass(torch.nn.Module):
    """Sequence classifier exposing raw logits from the wrapped pretrained model.

    Args:
        num_labels: number of output classes for the classification head
            (defaults to 10, the value previously hard-coded).
    """

    def __init__(self, num_labels=10):
        super().__init__()
        # ignore_mismatched_sizes allows the head to be re-initialised when its
        # shape differs from the one stored in the checkpoint.
        self.l1 = AutoModelForSequenceClassification.from_pretrained(
            "uer/roberta-base-finetuned-chinanews-chinese",
            num_labels=num_labels,
            ignore_mismatched_sizes=True,
        )

    def forward(self, ids, mask, token_type_ids):
        """Return the classification logits for a batch of encoded inputs."""
        # return_dict=True yields an output object with a .logits attribute; the
        # previous return_dict=False produced a plain tuple, so `.logits` raised
        # AttributeError.
        output_1 = self.l1(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=True,
        )
        return output_1.logits

# Instantiate the wrapper and move it to the selected device.
# NOTE: this rebinds the notebook-level `model` name to the new, untrained-head wrapper.
model = BERTClass()
model.to(device)