In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from transformers import BertForSequenceClassification, BertConfig
from sklearn.utils.class_weight import compute_class_weight
import csv
import torch.nn.functional as F

# specify GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#device = 'cpu'
print(device)

df = pd.read_csv("/kaggle/input/reviewdata/review_cpu_label_map.csv")
dfn = pd.read_csv("/kaggle/input/needdata/need_cpu_label_map.csv")

train_text, val_text, train_labels, val_labels = train_test_split(df['review'], df['cpu_label'], 
                                                                    random_state=2018, 
                                                                    test_size=0.1, 
                                                                    stratify=df['cpu_label'])

finetune_text, test_text, finetune_labels, test_labels = train_test_split(dfn['need'], dfn['cpu_label'], 
                                                                    random_state=2018, 
                                                                          test_size=0.5)
                                                                    #test_size=0.5, 
                                                                    #stratify=dfn['screen_label'])

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

max_seq_len = 512

# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# for train set
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

batch_size = 16

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

#--------------------------------------------------------------
#---------------------fine tune and test ----------------------
#---------------------------start------------------------------

# tokenize and encode sequences in the training set
tokens_finetune = tokenizer.batch_encode_plus(
    finetune_text.tolist(),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the validation set
tokens_test = tokenizer.batch_encode_plus(
    test_text.tolist(),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# for train set
finetune_seq = torch.tensor(tokens_finetune['input_ids'])
finetune_mask = torch.tensor(tokens_finetune['attention_mask'])
finetune_y = torch.tensor(finetune_labels.tolist())

# for validation set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

batch_size = 16

# wrap tensors
finetune_data = TensorDataset(finetune_seq, finetune_mask, finetune_y)

# sampler for sampling the data during training
finetune_sampler = RandomSampler(finetune_data)

# dataLoader for train set
finetune_dataloader = DataLoader(finetune_data, sampler=finetune_sampler, batch_size=batch_size)

# wrap tensors
test_data = TensorDataset(test_seq, test_mask, test_y)

# sampler for sampling the data during training
test_sampler = SequentialSampler(test_data)

# dataLoader for validation set
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size=batch_size)

#----------------------------end-------------------------------
#---------------------fine tune and test ----------------------
#--------------------------------------------------------------

class CNN_Text(nn.Module):
    
    def __init__(self):
        super(CNN_Text, self).__init__()
        
        V = 28996 #args.embed_num
        D = 768#args.embed_dim
        C = 4#args.class_num
        Ci = 1
        Co = 32#args.kernel_num
        Ks = [4, 16, 64, 128]#args.kernel_sizes

        self.embed = nn.Embedding(30522, D)
        self.convs = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(len(Ks) * Co, C)

        #if self.args.static:
        #    self.embed.weight.requires_grad = False

    def forward(self, x):
        #print(x)
        x = self.embed(x)  # (N, W, D)
    
        x = x.unsqueeze(1)  # (N, Ci, W, D)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]  # [(N, Co, W), ...]*len(Ks)

        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)

        x = torch.cat(x, 1)

        x = self.dropout(x)  # (N, len(Ks)*Co)
        logit = self.fc1(x)  # (N, C)
        return logit
    
class CNN_NLP(nn.Module):
    """An 1D Convulational Neural Network for Sentence Classification."""
    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=30522,
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.5):
        """
        The constructor for CNN_NLP class.

        Args:
            pretrained_embedding (torch.Tensor): Pretrained embeddings with
                shape (vocab_size, embed_dim)
            freeze_embedding (bool): Set to False to fine-tune pretraiend
                vectors. Default: False
            vocab_size (int): Need to be specified when not pretrained word
                embeddings are not used.
            embed_dim (int): Dimension of word vectors. Need to be specified
                when pretrained word embeddings are not used. Default: 300
            filter_sizes (List[int]): List of filter sizes. Default: [3, 4, 5]
            num_filters (List[int]): List of number of filters, has the same
                length as `filter_sizes`. Default: [100, 100, 100]
            n_classes (int): Number of classes. Default: 2
            dropout (float): Dropout rate. Default: 0.5
        """

        super(CNN_NLP, self).__init__()
        # Embedding layer
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding,
                                                          freeze=freeze_embedding)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        # Conv Network
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=num_filters[i],
                      kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])
        # Fully-connected layer and Dropout
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Perform a forward pass through the network.

        Args:
            input_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                n_classes)
        """

        # Get embeddings from `input_ids`. Output shape: (b, max_len, embed_dim)
        x_embed = self.embedding(input_ids).float()

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]
        
        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)
        
        # Compute logits. Output shape: (b, n_classes)
        logits = self.fc(self.dropout(x_fc))

        return logits

model = CNN_Text()
#model=CNN_NLP(num_classes=4)

# push the model to GPU
model = model.to(device)

# define the optimizer
optimizer = AdamW(model.parameters(), lr = 1e-5)


# loss function
cross_entropy = nn.NLLLoss()

# train
epoch=10
model.train()
for i in range(epoch):
    count=0
    loss_rec=0
    for batch in train_dataloader:
        batch = [r.to(device) for r in batch]
        inputs, input_mask, labels=batch
        
        #print(next(model.parameters()).device)
        #print(inputs.get_device())
        
        output = model(inputs)
        
        loss = cross_entropy(output, labels)
        
        #logits = output['logits']
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        count+=1
        loss_rec+=loss
        
    print('NO.',i,' epoch avg train loss: ',loss_rec/count)
    

with torch.no_grad():
    model.eval()
    preds=[]
    labels=[]
    for batch in val_dataloader:
        batch = [r.to(device) for r in batch]
        inputs, input_mask, label=batch
        
        output = model(inputs)
        
        logits = output
        labels.extend(label.cpu().tolist())
        preds.extend(torch.argmax(logits,dim=-1).cpu().tolist())
    acc=sum([int(i==j) for i,j in zip(preds, labels)])/len(preds)

print("validation accuracy is : ",acc)

# fine tune
epoch=20
#model.train()
for i in range(epoch):
    count=0
    loss_rec=0
    model.train()
    for batch in finetune_dataloader:
        batch = [r.to(device) for r in batch]
        inputs, input_mask, labels=batch
        
        output = model(inputs)
        
        loss = cross_entropy(output, labels)
        
        #logits = output['logits']
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        count+=1
        loss_rec+=loss
    print('NO.',i,' epoch avg fine tune loss: ',loss_rec/count)

    if(i==0 or i==4 or i==9 or i==14 or i==19):
        with torch.no_grad():
            model.eval()
            all_pred=[]#add
            preds=[]
            labels=[]
            for batch in test_dataloader:
                batch = [r.to(device) for r in batch]
                inputs, input_mask, label=batch
                
                output = model(inputs)
                
                #loss = output['loss']
                logits = output
                all_pred.extend(logits.cpu().tolist())#add
                labels.extend(label.cpu().tolist())
                preds.extend(torch.argmax(logits,dim=-1).cpu().tolist())
            acc=sum([int(i==j) for i,j in zip(preds, labels)])/len(preds)
        
        save_path="/kaggle/working/baselinebert_cpu_epoch_"+str(i+1)+"_test_res.csv"
        n=len(labels)
        record=[]
        for j in range(0,n):
            tmp={"index":j, "label":labels[j], "prediction":preds[j], "all_pred": all_pred[j]}
            record.append(tmp)

        with open(save_path, 'w', newline='') as csvfile:
            fieldnames = ['index', 'label','prediction','all_pred']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            writer.writerows(record)
        print(save_path)
        print(i," epoch test accuracy is : ",acc)



cuda:0


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



NO. 0  epoch avg train loss:  tensor(-123.6794, device='cuda:0', grad_fn=<DivBackward0>)
NO. 1  epoch avg train loss:  tensor(-469.1632, device='cuda:0', grad_fn=<DivBackward0>)
NO. 2  epoch avg train loss:  tensor(-985.3215, device='cuda:0', grad_fn=<DivBackward0>)
NO. 3  epoch avg train loss:  tensor(-1684.6146, device='cuda:0', grad_fn=<DivBackward0>)
NO. 4  epoch avg train loss:  tensor(-2565.7676, device='cuda:0', grad_fn=<DivBackward0>)
NO. 5  epoch avg train loss:  tensor(-3620.8445, device='cuda:0', grad_fn=<DivBackward0>)
NO. 6  epoch avg train loss:  tensor(-4838.3188, device='cuda:0', grad_fn=<DivBackward0>)
NO. 7  epoch avg train loss:  tensor(-6197.4526, device='cuda:0', grad_fn=<DivBackward0>)
NO. 8  epoch avg train loss:  tensor(-7742.9497, device='cuda:0', grad_fn=<DivBackward0>)
NO. 9  epoch avg train loss:  tensor(-9445.3848, device='cuda:0', grad_fn=<DivBackward0>)
validation accuracy is :  0.49239280774550487
NO. 0  epoch avg fine tune loss:  tensor(-10264.9570, dev