# In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from transformers import BertForSequenceClassification, BertConfig
from sklearn.utils.class_weight import compute_class_weight
import csv

# Pick the compute device: the first CUDA GPU when one is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Labelled review data: source of the training and validation sets.
df = pd.read_csv("/kaggle/input/reviewdata/review_cpu_label_map.csv")
# Labelled "need" data: source of the fine-tuning and test sets.
dfn = pd.read_csv("/kaggle/input/needdata/need_cpu_label_map.csv")

# Hold out 10% of the reviews for validation, stratified on the label column.
review_labels = df['cpu_label']
(train_text, val_text,
 train_labels, val_labels) = train_test_split(
    df['review'],
    review_labels,
    test_size=0.1,
    random_state=2018,
    stratify=review_labels,
)

# Split the "need" data in half: one half for fine-tuning, one half for testing.
# NOTE: unlike the review split above, no `stratify` argument is passed here.
(finetune_text, test_text,
 finetune_labels, test_labels) = train_test_split(
    dfn['need'],
    dfn['cpu_label'],
    test_size=0.5,
    random_state=2018,
)

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Every encoded sequence is padded or truncated to exactly this many tokens.
max_seq_len = 512

# Tokenize and encode the training texts.
# `padding='max_length'` is the supported replacement for the deprecated
# `pad_to_max_length=True`: every sequence is padded up to `max_length`.
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length=max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
)

# Tokenize and encode the validation texts with the same settings.
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length=max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
)

# Number of examples per mini-batch for the loaders built below.
batch_size = 16

# Training set: token ids, attention mask and labels as tensors.
train_seq, train_mask = (
    torch.tensor(tokens_train[key]) for key in ('input_ids', 'attention_mask')
)
train_y = torch.tensor(train_labels.tolist())

# Validation set: same three tensors.
val_seq, val_mask = (
    torch.tensor(tokens_val[key]) for key in ('input_ids', 'attention_mask')
)
val_y = torch.tensor(val_labels.tolist())

# Training loader: examples are drawn through a RandomSampler.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation loader: examples are visited in order through a SequentialSampler.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

#--------------------------------------------------------------
#---------------------fine tune and test ----------------------
#---------------------------start------------------------------

# Tokenize and encode the fine-tuning texts.
# `padding='max_length'` is the supported replacement for the deprecated
# `pad_to_max_length=True`: every sequence is padded up to `max_length`.
tokens_finetune = tokenizer.batch_encode_plus(
    finetune_text.tolist(),
    max_length=max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
)

# Tokenize and encode the test texts with the same settings.
tokens_test = tokenizer.batch_encode_plus(
    test_text.tolist(),
    max_length=max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
)

# Fine-tuning set: token ids, attention mask and labels as tensors.
finetune_seq, finetune_mask = (
    torch.tensor(tokens_finetune[key]) for key in ('input_ids', 'attention_mask')
)
finetune_y = torch.tensor(finetune_labels.tolist())

# Test set: same three tensors.
test_seq, test_mask = (
    torch.tensor(tokens_test[key]) for key in ('input_ids', 'attention_mask')
)
test_y = torch.tensor(test_labels.tolist())

# Number of examples per mini-batch for the loaders built below.
batch_size = 16

# Fine-tuning loader: examples are drawn through a RandomSampler.
finetune_data = TensorDataset(finetune_seq, finetune_mask, finetune_y)
finetune_sampler = RandomSampler(finetune_data)
finetune_dataloader = DataLoader(
    finetune_data,
    batch_size=batch_size,
    sampler=finetune_sampler,
)

# Test loader: examples are visited in order through a SequentialSampler.
test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
    sampler=test_sampler,
)

#----------------------------end-------------------------------
#---------------------fine tune and test ----------------------
#--------------------------------------------------------------


# Pretrained 'bert-base-uncased' with a sequence-classification head that
# produces 4 output labels. Attention weights and the per-layer hidden states
# are not requested in the model output.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4,
    output_attentions=False,
    output_hidden_states=False,
).to(device)  # move the model onto the selected device

# Optimizer over all model parameters; it is reused by every training loop below.
optimizer = AdamW(model.parameters(), lr=1e-5)

# Train on the review data for a fixed number of epochs, printing the mean
# per-batch loss after each epoch.
epoch = 10
model.train()
for i in range(epoch):
    count = 0       # batches processed in this epoch
    loss_rec = 0.0  # running sum of batch losses, kept as a plain Python float
    for batch in train_dataloader:
        batch = [r.to(device) for r in batch]
        inputs, input_mask, labels = batch
        output = model(inputs,
                       token_type_ids=None,
                       attention_mask=input_mask,
                       labels=labels)
        loss = output['loss']
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        count += 1
        # .item() yields a Python number, so the running total does not keep
        # the loss tensor (and its autograd history) alive across iterations.
        loss_rec += loss.item()

    print('NO.',i,' epoch avg train loss: ',loss_rec/count)
    

# Score the model on the validation loader with gradients disabled.
model.eval()
preds = []   # predicted class index per example
labels = []  # true class index per example
with torch.no_grad():
    for batch in val_dataloader:
        inputs, input_mask, label = (r.to(device) for r in batch)
        output = model(
            inputs,
            token_type_ids=None,
            attention_mask=input_mask,
            labels=label,
        )
        logits = output['logits']
        labels += label.cpu().tolist()
        # The predicted class is the index of the largest logit.
        preds += logits.argmax(dim=-1).cpu().tolist()

# Accuracy: fraction of examples whose prediction equals the true label.
n_correct = sum(1 for guess, truth in zip(preds, labels) if guess == truth)
acc = n_correct / len(preds)

print("validation accuracy is : ",acc)

# Fine-tune on the "need" data. After selected epochs the model is scored on
# the test loader and the per-example results are written to a CSV file.
epoch = 20
# Zero-based epoch indices after which the test set is evaluated and saved.
eval_epochs = (0, 4, 9, 14, 19)
for i in range(epoch):
    count = 0       # batches processed in this epoch
    loss_rec = 0.0  # running sum of batch losses, kept as a plain Python float
    # Switch back to training mode every epoch: the evaluation below puts the
    # model into eval mode.
    model.train()
    for batch in finetune_dataloader:
        batch = [r.to(device) for r in batch]
        inputs, input_mask, labels = batch
        output = model(inputs,
                       token_type_ids=None,
                       attention_mask=input_mask,
                       labels=labels)
        loss = output['loss']
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        count += 1
        # .item() yields a Python number, so the running total does not keep
        # the loss tensor (and its autograd history) alive across iterations.
        loss_rec += loss.item()
    print('NO.',i,' epoch avg fine tune loss: ',loss_rec/count)

    if i in eval_epochs:
        model.eval()
        all_pred = []  # raw logits per example
        preds = []     # predicted class index per example
        labels = []    # true class index per example
        with torch.no_grad():
            for batch in test_dataloader:
                batch = [r.to(device) for r in batch]
                inputs, input_mask, label = batch
                output = model(inputs,
                               token_type_ids=None,
                               attention_mask=input_mask,
                               labels=label)
                logits = output['logits']
                all_pred.extend(logits.cpu().tolist())
                labels.extend(label.cpu().tolist())
                preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
        # Accuracy: fraction of examples whose prediction equals the true label.
        acc = sum(int(guess == truth) for guess, truth in zip(preds, labels)) / len(preds)

        # The file name uses the one-based epoch number.
        save_path = "/kaggle/working/baselinebert_cpu_epoch_"+str(i+1)+"_test_res.csv"
        # One row per test example, in loader order.
        record = [
            {"index": idx, "label": truth, "prediction": guess, "all_pred": raw}
            for idx, (truth, guess, raw) in enumerate(zip(labels, preds, all_pred))
        ]

        with open(save_path, 'w', newline='') as csvfile:
            fieldnames = ['index', 'label','prediction','all_pred']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            writer.writerows(record)
        print(save_path)
        print(i," epoch test accuracy is : ",acc)