In [None]:
#https://www.kaggle.com/datasets/prakharrathi25/google-play-store-reviews
#https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
!pip -q install transformers
!pip install sentencepiece

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.optim as opt
from transformers import AutoTokenizer, AutoModel, AutoConfig
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, auc, roc_curve
from copy import copy, deepcopy
import zipfile
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # specify which GPU(s) to be used
#torch.backends.cudnn.benchmark = True

In [None]:
# Read the review table from the Kaggle input directory (dataset link is in the first cell).
df = pd.read_csv('/kaggle/input/app-reviews/reviews.csv')
def to_sentiment(rating):
    """Map a numeric review score to a coarse sentiment label.

    The score is first truncated with ``int``.  A value of 2 or less yields
    ``'negative'``, exactly 3 yields ``'neutral'`` and anything larger yields
    ``'positive'``.
    """
    score = int(rating)
    if score > 3:
        return 'positive'
    if score == 3:
        return 'neutral'
    return 'negative'

# Derive the categorical target column from the numeric `score` column.
df['sentiment'] = df.score.apply(to_sentiment)
df

In [None]:
# Number of rows per derived sentiment label (class balance check).
df['sentiment'].value_counts()

In [None]:
# Per-column overview: dtype, number of distinct values and number of missing values.
pd.DataFrame({'Dtype': df.dtypes, 'Nunique': df.nunique(), 'Isnull': df.isnull().sum()}, index=df.columns)

In [None]:
# len() of every entry in `content`, drawn against its row position together
# with the mean and the median as horizontal reference lines.
seq_len = list(map(len, df['content']))
row_pos = np.arange(len(df))
n_entries = len(seq_len)
fig = go.Figure()
for y_values, draw_mode, trace_label in (
    (seq_len, 'markers', 'Seq len'),
    ([np.mean(seq_len)] * n_entries, 'lines', 'Avg seq len'),
    ([np.median(seq_len)] * n_entries, 'lines', 'Med seq len'),
):
    fig.add_trace(go.Scatter(x=row_pos, y=y_values, mode=draw_mode, name=trace_label))
fig.show()

In [None]:
text_column, out_column = 'content', 'sentiment'
# Work on an explicit copy of the two relevant columns so the label encoding
# below never writes into a slice of the original frame (this is what
# triggers pandas' SettingWithCopyWarning with an in-place replace).
df = df[[text_column, out_column]].copy()
# Integer-encode the target values in order of first appearance.
labels = dict(zip(df[out_column].unique(), range(df[out_column].nunique())))
df[out_column] = df[out_column].map(labels)
# 90 % of the rows go to training; the held-out 10 % is halved into a
# validation and a test split.  Both splits are seeded for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(df[text_column], df[out_column], test_size=0.1, random_state=42)
X_valid, X_test, Y_valid, Y_test = train_test_split(X_test, Y_test, test_size=0.5, random_state=42)
X_train.shape, Y_train.shape, X_valid.shape, Y_valid.shape, X_test.shape, Y_test.shape

In [None]:
# Preview a handful of training texts instead of dumping the whole list into the cell output.
X_train.head().tolist()

In [None]:
# One checkpoint name is shared by the tokenizer, the config and the encoder weights.
model_path = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model_config = AutoConfig.from_pretrained(model_path)
# Loaded through AutoModel; the classification layer is added later by TransSeqClassifier.
deberta = AutoModel.from_pretrained(model_path)
# Display the configuration and the module structure.
model_config, deberta

In [None]:
# len() and content of one sample entry (position 2777) of the text column.
len(df[text_column].values[2777]), df[text_column].values[2777]

In [None]:
# Tokenize that single entry, padded/truncated to max_length=512 and returned as PyTorch tensors.
tmp = tokenizer([df[text_column].values[2777]], add_special_tokens=True, max_length=512, padding = 'max_length', truncation=True, return_tensors='pt')
# Show the shape of the id tensor next to the full tokenizer output.
tmp['input_ids'].shape,tmp

In [None]:
# Turn the ids of the tokenized sample back into text for inspection.
tokenizer.decode(tmp['input_ids'][0])

In [None]:
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
# The classifier classes below read this module-level name.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SeqClsLoader(Dataset):
    """Map-style dataset pairing tokenizer output with optional labels.

    ``encoded_data`` must support ``['input_ids']`` and ``['attention_mask']``
    lookups whose values can be indexed per sample.  ``labels`` is either an
    indexable collection aligned with them or ``None``.
    """

    def __init__(self, encoded_data, labels):
        self.encoded_data = encoded_data
        self.labels = labels

    def classes(self):
        """Return the label collection passed at construction time."""
        return self.labels

    def __len__(self):
        """Number of samples, taken from the ``input_ids`` entry."""
        return len(self.encoded_data['input_ids'])

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for sample ``idx``.

        The label slot is ``None`` when the dataset was built without labels.
        """
        token_ids = self.encoded_data['input_ids'][idx]
        attention = self.encoded_data['attention_mask'][idx]
        if self.labels is None:
            return token_ids, attention, None
        return token_ids, attention, self.labels[idx]

class NNModel(nn.Module):
    """Configurable stack of fully connected blocks.

    Every block is ``Linear`` -> optional ``BatchNorm1d`` -> optional ``ELU``
    -> optional ``Dropout``.  With no ``units``/``factors`` the network is
    empty and ``forward`` returns its input unchanged.

    Args:
        input_shape: number of input features.
        units: output width of each block; entries smaller than 1 are skipped.
        factors: when given (non-empty), overrides ``units`` with
            ``round(input_shape * factor)`` per entry.  Lists and NumPy arrays
            are accepted.
        activ: append an ``ELU`` activation to each block.
        norm: insert ``BatchNorm1d`` after the linear layer; the linear layer
            is then created without a bias term.
        dropout: per-block dropout probabilities, a single probability applied
            to every block, or a falsy value for no dropout.
        slops: per-block ELU ``alpha`` values, a single value applied to every
            block, or ``None`` for ``alpha=1``.

    Attributes:
        network: ``nn.ModuleList`` holding the layers in call order.
        output_shape: width of the last block (``input_shape`` if empty).
    """

    def __init__(self, input_shape, units=None, factors=None, activ=True, norm=False, dropout=False, slops=None):
        super().__init__()
        self.input_shape = input_shape
        self.units = units
        self.factors = factors
        self.activ, self.norm = activ, norm
        self.network = nn.ModuleList()
        if self._is_given(self.factors):
            self.units = np.atleast_1d(np.round(self.input_shape * np.asarray(self.factors))).astype(int)
        if self.units is not None:
            n_blocks = len(self.units)
            if not self._is_given(dropout):
                self.dropout = np.zeros_like(self.units)
            elif np.ndim(dropout) == 0:
                # A single probability is broadcast to every block.
                self.dropout = [dropout] * n_blocks
            else:
                self.dropout = dropout
            if slops is None:
                self.slops = np.full(n_blocks, 1)
            elif np.ndim(slops) == 0:
                self.slops = [slops] * n_blocks
            else:
                self.slops = slops
            for i, j, k in zip(self.units, self.dropout, self.slops):
                if i >= 1:
                    width = int(i)  # plain int even when units came from NumPy
                    self.network.extend(self.__build_block__(input_shape, width, p=j, slop=k))
                    input_shape = width
        self.output_shape = input_shape
        self.reset_parameters()

    @staticmethod
    def _is_given(value):
        """Truthiness test that also works for NumPy arrays.

        ``None``, empty sequences and falsy scalars count as "not given".
        """
        if value is None:
            return False
        if np.ndim(value) == 0:
            return bool(value)
        return np.size(value) > 0

    def __build_block__(self, input_shape, units, p, slop):
        """Return the list of layers forming one block."""
        block = [nn.Linear(int(input_shape), int(units), bias=not self.norm)]
        if self.norm:
            block.append(nn.BatchNorm1d(int(units)))
        if self.activ:
            block.append(nn.ELU(float(slop)))
        if p > 0:
            block.append(nn.Dropout(float(p)))
        return block

    def forward(self, x):
        """Apply every layer of ``network`` in order."""
        for layer in self.network:
            x = layer(x)
        return x

    def reset_parameters(self):
        """Xavier-normal weights and a constant 0.1 bias for all linear layers."""
        for layer in self.network:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_normal_(layer.weight)
                # With norm=True the linear layers are built without a bias.
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0.1)
 

class TransSeqClassifier(nn.Module):
    """Encoder followed by an optional MLP and a linear classification layer.

    ``model`` is called with ``input_ids``, ``attention_mask`` and
    ``return_dict=False``; the first element of its output is indexed as
    ``[:, 0, :]`` to obtain one vector per sequence (the first token).
    When ``mlp_units`` is ``None`` no MLP is built and the classifier reads
    that vector directly (width ``emb_dim``).
    """

    def __init__(self, model, emb_dim, mlp_units, mlp_dropout, nb_class):
        super(TransSeqClassifier, self).__init__()
        self.model = model
        if mlp_units is None:
            self.mlp = None
            head_in = emb_dim
        else:
            # The same dropout probability is used for every MLP block.
            self.mlp = NNModel(emb_dim, units=mlp_units, factors=None, dropout=[mlp_dropout]*len(mlp_units))
            head_in = self.mlp.output_shape
        self.classifier = nn.Linear(head_in, nb_class)

    def forward(self, input_id, mask):
        """Return ``(z, cls_token, pred)``.

        ``cls_token`` is the first-token vector of the encoder output, ``z``
        is the MLP output (or ``cls_token`` itself without an MLP) and
        ``pred`` holds the classifier outputs computed from ``z``.
        """
        encoder_out = self.model(input_ids=input_id, attention_mask=mask, return_dict=False)
        cls_token = encoder_out[0][:, 0, :]
        z = cls_token if self.mlp is None else self.mlp(cls_token)
        return z, cls_token, self.classifier(z)

class BaseSeqClassifier:
    """Training / inference harness around a sequence-classification network.

    The wrapped ``model`` is called as ``model(input_ids, attention_mask)``
    and must return ``(z, cls_token, logits)``.  Subclasses supply the loss
    (``loss_function``) and the logits-to-probabilities mapping
    (``_to_probabilities``).  Tensors are moved to the module-level ``device``.

    Attributes:
        losses: history dictionary.  ``'Epoch'``, ``'Train'``, ``'Test'`` and
            ``'LR'`` are per-epoch lists; ``'LState'`` is a copy of the state
            dict after the most recent epoch and ``'BState'`` a copy of the
            state dict with the lowest validation loss seen so far.
    """

    def __init__(self, model, tokenizer, max_len):
        self.model = model.to(device)
        self.losses = {'Epoch': [], 'Train': [], 'Test': [], 'BState': [], 'LState': [], 'LR': []}
        self.tokenizer, self.max_len = tokenizer, max_len

    def loss_function(self, input_id, mask, Y, l2_reg):
        """Return the scalar loss of one batch; implemented by subclasses."""
        raise NotImplementedError('subclasses must implement loss_function')

    def _to_probabilities(self, logits):
        """Map raw classifier outputs to probabilities; implemented by subclasses."""
        raise NotImplementedError('subclasses must implement _to_probabilities')

    def _encode(self, texts):
        """Tokenize ``texts``, padded/truncated to ``max_len``, as PyTorch tensors."""
        return self.tokenizer(texts, add_special_tokens=True, max_length=self.max_len,
                              padding='max_length', truncation=True, return_tensors='pt')

    def train_model(self, optim, train_loader, grad_clip, l2_reg):
        """Run one optimisation pass over ``train_loader``.

        Returns the mean of the per-batch losses.  Raises ``ValueError`` when
        the loader yields no batch.
        """
        self.model = self.model.train()
        total_loss, n_batches = 0.0, 0
        for ids, mask, Y in train_loader:
            ids, mask, Y = ids.to(device), mask.to(device), Y.to(device)
            optim.zero_grad()
            loss = self.loss_function(ids, mask, Y, l2_reg)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), grad_clip)
            optim.step()
            total_loss += loss.item()
            n_batches += 1
        if n_batches == 0:
            raise ValueError('train_loader yielded no batches')
        return total_loss / n_batches

    def eval_model(self, test_loader):
        """Return the mean per-batch loss over ``test_loader`` without training.

        Runs in eval mode and with autograd disabled, so no graph is kept
        alive for the evaluation batches.  Raises ``ValueError`` when the
        loader yields no batch.
        """
        self.model = self.model.eval()
        total_loss, n_batches = 0.0, 0
        with torch.no_grad():
            for ids, mask, Y in test_loader:
                ids, mask, Y = ids.to(device), mask.to(device), Y.to(device)
                total_loss += self.loss_function(ids, mask, Y, l2_reg=0).item()
                n_batches += 1
        if n_batches == 0:
            raise ValueError('test_loader yielded no batches')
        return total_loss / n_batches

    def fit(self, X_train, Y_train, epoch, lr, opt_kwarg, batch_size=None, grad_clip=100, momentum=0.9, X_test=None, Y_test=None, l2_reg=0, verbose=True, save=True):
        """Fine-tune the model with AdamW and record the history in ``losses``.

        Args:
            X_train, Y_train: training texts (list of str) and their labels.
            epoch: number of passes over the training data.
            lr: AdamW learning rate.
            opt_kwarg: scheduler keyword arguments; unused while no scheduler
                is configured (see ``scheduler`` below).
            batch_size: samples per batch; defaults to the whole training set.
            grad_clip: max norm passed to ``clip_grad_norm_``.
            momentum: accepted for compatibility; not used by AdamW here.
            X_test, Y_test: optional validation texts and labels.
            l2_reg: forwarded to ``loss_function``.
            verbose: print per-epoch progress.
            save: accepted for compatibility; currently unused.
        """
        batch_size = len(Y_train) if batch_size is None else batch_size
        train_load = DataLoader(SeqClsLoader(self._encode(X_train), Y_train), batch_size=batch_size, shuffle=True)
        has_valid = X_test is not None
        if has_valid:
            # Evaluation needs no shuffling; a fixed order keeps the batch composition stable.
            test_load = DataLoader(SeqClsLoader(self._encode(X_test), Y_test), batch_size=batch_size, shuffle=False)

        optim = opt.AdamW(self.model.parameters(), lr=lr)
        # No learning-rate scheduler is active.  A ReduceLROnPlateau-style
        # scheduler built from ``opt_kwarg`` can be assigned here.
        scheduler = None
        best_loss = float('inf')

        for i in range(epoch):
            if verbose:
                print('##### EPOCH ' + str(i) + ' #####')

            train_loss = self.train_model(optim, train_load, grad_clip, l2_reg)
            self.losses['LState'] = deepcopy(self.model.state_dict())
            if verbose:
                print('train loss : ', train_loss)
            self.losses['Epoch'].append(i)
            self.losses['Train'].append(train_loss)

            if has_valid:
                valid_loss = self.eval_model(test_load)
                if verbose:
                    print('test loss : ', valid_loss)
                self.losses['Test'].append(valid_loss)

                if scheduler is not None:
                    scheduler.step(valid_loss)

                if valid_loss < best_loss:
                    self.losses['BState'] = deepcopy(self.model.state_dict())
                    best_loss = valid_loss
                    if verbose:
                        print('===========SAVE===========')

            # Recorded every epoch so the history always lines up with 'Epoch'.
            self.losses['LR'].append(optim.param_groups[0]['lr'])

        if not has_valid:
            # Without validation data the last state is the only candidate.
            self.losses['BState'] = self.losses['LState']

    def prdict(self, X, batch_size):
        """Run inference on the texts ``X`` in batches of ``batch_size``.

        Returns a dict with the keys ``'z'``, ``'cls_token'`` and ``'pred'``,
        each a list with one NumPy array per input text; ``'pred'`` holds the
        probabilities produced by ``_to_probabilities``.
        """
        self.model.eval()
        encoded_data = self._encode(X)
        data_load = DataLoader(TensorDataset(encoded_data['input_ids'], encoded_data['attention_mask']), batch_size=batch_size)
        outputs = {'z': [], 'cls_token': [], 'pred': []}
        with torch.no_grad():
            for ids, mask in data_load:
                ids, mask = ids.to(device), mask.to(device)
                z, cls_token, pred = self.model(ids, mask)
                pred = self._to_probabilities(pred)
                outputs['z'].extend(z.detach().cpu().numpy())
                outputs['cls_token'].extend(cls_token.detach().cpu().numpy())
                outputs['pred'].extend(pred.detach().cpu().numpy())
        return outputs

    def predict(self, X, batch_size):
        """Correctly spelled alias of ``prdict``."""
        return self.prdict(X, batch_size)


class Binaryclass(BaseSeqClassifier):
    """Binary (or multi-label) classification: BCE-with-logits loss, sigmoid outputs."""

    def loss_function(self, input_id, mask, Y, l2_reg):
        """Binary cross-entropy on the logits; ``l2_reg`` is currently unused."""
        _, _, pred = self.model(input_id, mask)
        # BCEWithLogitsLoss needs a floating target with the logits' shape,
        # while the data loader delivers integer labels of shape (batch,).
        target = Y.to(dtype=pred.dtype).reshape(pred.shape)
        return nn.BCEWithLogitsLoss()(pred, target)

    def _to_probabilities(self, logits):
        return torch.sigmoid(logits)


class Multiclass(BaseSeqClassifier):
    """Single-label multiclass classification: cross-entropy loss, softmax outputs."""

    def loss_function(self, input_id, mask, Y, l2_reg):
        """Cross-entropy on the logits; ``l2_reg`` is currently unused."""
        _, _, pred = self.model(input_id, mask)
        return nn.CrossEntropyLoss()(pred, Y)

    def _to_probabilities(self, logits):
        # Explicit class axis; the implicit-dim form of Softmax is deprecated.
        return torch.softmax(logits, dim=-1)

def gradient_clipper(model: nn.Module, val: float) -> nn.Module:
    """Register gradient hooks that replace NaNs and clamp values element-wise.

    For every trainable parameter of ``model`` a backward hook is installed
    that substitutes ``1e-10`` for NaN entries and then clamps the gradient
    to ``[-val, val]``.  Parameters with ``requires_grad=False`` are skipped,
    because a hook cannot be registered on them.

    Each call adds new hooks, so call this once per model.

    Returns:
        The same ``model`` instance, for chaining.
    """
    def process_grad(grad):
        # A hook must not modify its argument in place, so build a new tensor.
        cleaned = torch.where(torch.isnan(grad), torch.full_like(grad, 1e-10), grad)
        return torch.clamp(cleaned, -val, val)

    for parameter in model.parameters():
        if parameter.requires_grad:
            parameter.register_hook(process_grad)

    return model

In [None]:
# Run configuration: number of epochs, learning rate and batch size.
# NOTE(review): `d` and `mlp_d` are not referenced again in the visible cells.
# NOTE(review): every epoch is one full pass over the training loader, so 50000
# epochs is presumably meant to be interrupted manually - confirm.
epoch, lr, batch_size, d, mlp_d = 50000, 2e-5, 4, 0.000001, 1e-6
#cyclic_kwarg = {'base_lr': lr, 'max_lr': 1e-2, 'step_size_up':200, 'step_size_down':200}
# Passed to fit() as `opt_kwarg`; fit() currently sets `scheduler = None`, so these values have no effect.
plateau_kwarg = {'factor':0.5, 'patience':200, 'verbose':True, 'min_lr':1e-7, 'mode':'min'}


# Encoder plus a linear classifier (no MLP because mlp_units=None); one output per distinct label.
model = TransSeqClassifier(deberta, model_config.hidden_size, mlp_units=None, mlp_dropout=1e-6, nb_class=df[out_column].nunique())
# Element-wise gradient clamp via hooks; fit() additionally clips the gradient norm with grad_clip.
model = gradient_clipper(model, 10)
#model.load_state_dict(best_state)
print(device)
print(model)
# Number of trainable parameters.
print(sum(p.numel() for p in model.parameters() if p.requires_grad))
# 512 is the max_len used for tokenizer padding/truncation.
seq_bc = Multiclass(model, tokenizer, 512)
seq_bc.fit(X_train.tolist(), Y_train.values, epoch, lr, plateau_kwarg, batch_size=batch_size, grad_clip=10, momentum=0.9,
        X_test=X_valid.tolist(), Y_test=Y_valid.values, l2_reg=0, verbose=True)

In [None]:
# Only prints a message when more than one CUDA device is visible; it does not change how the model is run.
if torch.cuda.device_count() > 1:
  print("Let's use", torch.cuda.device_count(), "GPUs!")

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Restore the weights stored under 'BState' (lowest validation loss during fit).
best_state = deepcopy(seq_bc.losses['BState'])
seq_bc.model.load_state_dict(best_state)
print(np.min(seq_bc.losses['Test']))

# One stacked panel per recorded series.  `add_trace(..., row=, col=)` is the
# supported replacement for the deprecated `append_trace`.
fig = make_subplots(rows=3, cols=1)
s = 0  # first epoch to show
for panel_row, series_key in enumerate(('Train', 'Test', 'LR'), start=1):
    fig.add_trace(
        go.Scatter(x=seq_bc.losses['Epoch'][s:], y=seq_bc.losses[series_key][s:], mode='lines', name=series_key),
        row=panel_row, col=1,
    )
fig.update_layout(height=1000, width=1500, title_text="Stacked Subplots")
fig.show()

In [None]:
# Validation split: predict in batches of 2, take the argmax over the class axis as the predicted label.
output = seq_bc.prdict(X_valid.tolist(), 2)
pred = np.argmax(np.asarray(output['pred']), 1)
acc = accuracy_score(Y_valid.values, pred)
# Accuracy next to the confusion matrix.
acc, confusion_matrix(Y_valid.values, pred)

In [None]:
# Test split: same procedure as for the validation split; overwrites `output`, `pred` and `acc`.
output = seq_bc.prdict(X_test.tolist(), 2)
pred = np.argmax(np.asarray(output['pred']), 1)
acc = accuracy_score(Y_test.values, pred)
# Accuracy next to the confusion matrix.
acc, confusion_matrix(Y_test.values, pred)

In [None]:
# Side-by-side table of test texts, their labels and `pred` (whatever the most recently run evaluation cell produced).
pd.concat((X_test.reset_index(drop=True), Y_test.reset_index(drop=True), pd.DataFrame({'Pred': pred.ravel()})), axis=1)