In [1]:
import os
import gc
import copy
import time
import random
import string
import joblib

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW
from transformers import DataCollatorWithPadding
import datasets
# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# For reproducibility. Same as when the model was trained.
# This tag is stored in CONFIG ("hash_name", and as the prefix of "group") and
# is the prefix of the per-fold checkpoint file names loaded in the inference loop.
HASH_NAME = "kmyc4k23vc47"

In [3]:
# Run-wide settings, kept in one dictionary.
# The cells visible in this notebook read: model_name, max_length,
# valid_batch_size, n_fold, num_classes, device and tokenizer.
# The remaining keys are not referenced by the cells shown here.
CONFIG = {
    "seed": 2022,
    "epochs": 3,
    "model_name": "microsoft/deberta-v3-base",
    # Batch sizes and the tokenizer truncation length.
    "train_batch_size": 8,
    "valid_batch_size": 16,
    "max_length": 512,
    # Optimizer / scheduler settings.
    "learning_rate": 1e-5,
    "scheduler": 'CosineAnnealingLR',
    "min_lr": 1e-6,
    "T_max": 500,
    "weight_decay": 1e-6,
    # Number of folds (one checkpoint per fold) and size of the output layer.
    "n_fold": 3,
    "n_accumulate": 1,
    "num_classes": 2,
    # First CUDA device when one is available, otherwise the CPU.
    "device": torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu"),
    "hash_name": HASH_NAME,
    "_wandb_kernel": "deb",
}

# Entries derived from the values above.
CONFIG.update({
    "tokenizer": AutoTokenizer.from_pretrained(CONFIG['model_name']),
    'group': f'{HASH_NAME}-Baseline',
})


In [6]:
# Read the dataset from a path relative to the current working directory.
# The rendered output shows the columns 'Unnamed: 0', 'text_clean' and 'label'.
mongodb_df = pd.read_csv(filepath_or_buffer="../csv/clean_MongoDB_balanced.csv")
mongodb_df

Unnamed: 0,text_clean,label
0,info ccdashboardauthauthproperties no jwt secr...,0
1,first logged failure here note that this commi...,1
2,i perform following tutorial for my knowledgei...,1
3,description yes it would be a new component of...,1
4,createuser docs are wrong the name of the user...,1
...,...,...
1811,dbcollectioncountor is extremmely slow i comma...,0
1812,we must make sure that there are no more users...,0
1813,paneltitleissue status as of june issue summar...,1
1814,the topology coordinator is the repository for...,0


In [7]:

# Expose the text under the column name 'text', which HP_TestDataset reads.
mongodb_df = mongodb_df.rename({'text_clean': 'text'}, axis="columns")
mongodb_df

Unnamed: 0,text,label
0,info ccdashboardauthauthproperties no jwt secr...,0
1,first logged failure here note that this commi...,1
2,i perform following tutorial for my knowledgei...,1
3,description yes it would be a new component of...,1
4,createuser docs are wrong the name of the user...,1
...,...,...
1811,dbcollectioncountor is extremmely slow i comma...,0
1812,we must make sure that there are no more users...,0
1813,paneltitleissue status as of june issue summar...,1
1814,the topology coordinator is the repository for...,0


In [8]:
class MeanPooling(nn.Module):
    """Mask-weighted average of token embeddings along dim 1.

    Positions whose mask value is 0 contribute nothing to the average.
    """

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average `last_hidden_state` over dim 1, weighted by `attention_mask`.

        Args:
            last_hidden_state: embeddings; dim 1 is the axis that is averaged
                (presumably (batch, seq_len, hidden) -- confirm with caller).
            attention_mask: mask with one fewer trailing dim than
                `last_hidden_state`; it is broadcast over the last dim.

        Returns:
            Tensor with dim 1 of `last_hidden_state` reduced away.
        """
        # Give the mask the same shape as the embeddings so it can weight them.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(1)
        # Clamp the count so a fully masked row divides by 1e-9 instead of 0.
        kept_count = mask.sum(1).clamp(min=1e-9)
        return masked_total / kept_count

In [9]:
class TD_Model(nn.Module):
    """Pretrained transformer backbone, masked mean pooling, dropout, linear head.

    The attribute names (`model`, `drop`, `pooler`, `fc`) are part of the
    state_dict keys, so they must match the checkpoints that are loaded later.
    """

    def __init__(self, model_name):
        """Build the network for the Hugging Face model id `model_name`."""
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.pooler = MeanPooling()
        # The head maps the backbone's hidden size to CONFIG['num_classes'] outputs.
        self.fc = nn.Linear(
            self.config.hidden_size,
            CONFIG['num_classes'],
        )

    def forward(self, ids, mask):
        """Return the head's output for token ids `ids` and attention mask `mask`."""
        backbone_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        pooled = self.pooler(backbone_out.last_hidden_state, mask)
        pooled = self.drop(pooled)
        return self.fc(pooled)

<h2> Testing Inference </h2>


In [10]:
import logging
import warnings

import torch
import transformers

# Ignore Python warnings, and drop every log record at WARNING level or below.
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

In [11]:
# Work on an independent copy. Later cells add prediction columns to test_df;
# with a plain assignment both names would refer to the same DataFrame and
# those columns would silently appear in mongodb_df too.
test_df = mongodb_df.copy()

In [12]:
test_df

Unnamed: 0,text,label
0,info ccdashboardauthauthproperties no jwt secr...,0
1,first logged failure here note that this commi...,1
2,i perform following tutorial for my knowledgei...,1
3,description yes it would be a new component of...,1
4,createuser docs are wrong the name of the user...,1
...,...,...
1811,dbcollectioncountor is extremmely slow i comma...,0
1812,we must make sure that there are no more users...,0
1813,paneltitleissue status as of june issue summar...,1
1814,the topology coordinator is the repository for...,0


In [13]:
class HP_TestDataset(Dataset):
    """Inference dataset that tokenizes the 'text' column one row at a time.

    No padding is requested from the tokenizer here, so items can differ in
    length (presumably padded later by the DataLoader's collate function).
    """

    def __init__(self, df, tokenizer, max_length):
        """Keep the frame, the tokenizer, the truncation length and the raw texts."""
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.text = df['text'].values

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize row `index` and return its model inputs as a dict.

        The dict always has 'input_ids' and 'attention_mask'; 'token_type_ids'
        is copied over only when the tokenizer produced it.
        """
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
        )

        samples = {key: encoded[key] for key in ('input_ids', 'attention_mask')}
        if 'token_type_ids' in encoded:
            samples['token_type_ids'] = encoded['token_type_ids']
        return samples

In [14]:
# Objects shared by the inference cells below:
#   model      - the classifier, built from the pretrained backbone
#   softmax    - softmax over dim 1
#   collate_fn - batch collator that pads with the configured tokenizer
model = TD_Model(model_name=CONFIG['model_name'])
softmax = nn.Softmax(dim=1)
collate_fn = DataCollatorWithPadding(tokenizer=CONFIG['tokenizer'])

In [15]:
def prepare_test_loader(test_df):
    """Wrap `test_df` in an HP_TestDataset and return a DataLoader over it.

    The loader keeps row order (no shuffling), keeps the final partial batch,
    and collates with the module-level `collate_fn`.
    """
    dataset = HP_TestDataset(
        test_df,
        tokenizer=CONFIG['tokenizer'],
        max_length=CONFIG['max_length'],
    )
    loader_options = dict(
        batch_size=CONFIG['valid_batch_size'],
        collate_fn=collate_fn,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )
    return DataLoader(dataset, **loader_options)


test_loader = prepare_test_loader(test_df)

In [16]:
@torch.no_grad()
def inference(test_loader, model, device):
    """Run `model` over every batch of `test_loader` and collect its predictions.

    Args:
        test_loader: sized iterable of batches; each batch is a mapping that
            holds 'input_ids' and 'attention_mask' tensors.
        model: module called as ``model(ids, mask)``; its output is treated as
            per-class scores with the classes along dim 1.
        device: device the model and the batch tensors are moved to.

    Returns:
        Tuple ``(predictions, predictions_label)`` of numpy arrays: the softmax
        (over dim 1) of the model output for every row, and the argmax class
        index for every row, both concatenated in loader order.

    Raises:
        ValueError: if `test_loader` yields no batches.
    """
    model.eval()
    model.to(device)

    probs_per_batch = []
    labels_per_batch = []

    for data in tqdm(test_loader, total=len(test_loader)):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)

        # The output is already a tensor: move it with .cpu() instead of
        # re-wrapping it in torch.tensor(), which copy-constructs and warns.
        logits = model(ids, mask).cpu()

        # torch.softmax keeps the function self-contained; it no longer relies
        # on a `softmax` object defined in another notebook cell.
        probs_per_batch.append(torch.softmax(logits, dim=1).numpy())
        labels_per_batch.append(torch.argmax(logits, dim=-1).flatten().numpy())

    if not probs_per_batch:
        raise ValueError("test_loader yielded no batches; nothing to predict")

    predictions = np.concatenate(probs_per_batch)
    predictions_label = np.concatenate(labels_per_batch)
    return predictions, predictions_label

In [17]:
# Per-fold inference: load each fold's checkpoint into the shared `model`,
# predict on the test loader, and keep that fold's probabilities and labels.
deberta_predictions = []
deberta_predictions_labels = []

for fold in range(0, CONFIG['n_fold']):
    print("Fold {}".format(fold))

    # The checkpoint path is relative to the current working directory.
    # map_location="cpu": without it torch.load restores tensors onto the device
    # they were saved from, which fails on a host lacking that device (e.g. a
    # GPU checkpoint on a CPU-only machine). load_state_dict then copies the
    # values into the model's parameters on whatever device they live on.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state = torch.load(f'{HASH_NAME}-Loss-Fold-{fold}.bin', map_location="cpu")
    model.load_state_dict(state)

    prediction, predictions_label = inference(test_loader, model, CONFIG['device'])
    deberta_predictions.append(prediction)
    deberta_predictions_labels.append(predictions_label)

    # Drop the per-fold references and release cached GPU memory between folds.
    del state, prediction
    gc.collect()
    torch.cuda.empty_cache()

# NOTE(review): after this, `model` no longer exists; re-running this cell
# requires re-running the cell that builds the model first.
del model

Fold 0


FileNotFoundError: [Errno 2] No such file or directory: 'kmyc4k23vc47-Loss-Fold-0.bin'

In [None]:
deberta_predictions


In [None]:
# Ensemble the folds: element-wise mean of the per-fold probability arrays.
predictions = np.asarray(deberta_predictions).mean(axis=0)
predictions

In [None]:
len(predictions)

In [None]:
predictions

In [None]:
# Fold-averaged softmax value for class index 0 (column 0 of `predictions`).
test_df["prediction_0"] = predictions[:, 0] 

In [None]:
# Fold-averaged softmax value for class index 1 (column 1 of `predictions`).
test_df["prediction_1"] = predictions[:, 1] 

In [None]:
# Add one column per fold holding that fold's argmax class labels.
for i in range(CONFIG['n_fold']):
    fold_labels = deberta_predictions_labels[i]
    test_df[f"prediction_label_fold_{i}"] = fold_labels


In [None]:
test_df

In [None]:
len(test_df)

In [None]:
# For every row, store the NAME of the column ('prediction_0' or 'prediction_1')
# that holds the larger averaged probability.
test_df['Max'] = test_df[['prediction_0','prediction_1']].idxmax(axis=1)

In [None]:
# First half of turning the winning column name into a class id:
# 'prediction_0' becomes '0'; any other value passes through unchanged.
test_df["pred"] = test_df['Max'].map(
    lambda col_name: col_name.replace("prediction_0", "0")
)

In [None]:
# Second half of the conversion: 'prediction_1' becomes '1';
# any other value passes through unchanged.
test_df["pred"] = test_df['pred'].map(
    lambda col_name: col_name.replace("prediction_1", "1")
)


In [None]:
# 'pred' was built with str.replace, so it holds strings; cast it to integers
# before it is compared with the `label` column in the metric cells below.
test_df["pred"] = test_df["pred"].astype(int)

In [None]:
test_df

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the ensemble prediction against the true labels
# (rows: true class, columns: predicted class).
cf_matrix = confusion_matrix(
    y_true=test_df["label"].values,
    y_pred=test_df["pred"].values,
)

print(cf_matrix)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Divide by the grand total so each cell shows its share of all samples.
cf_share = cf_matrix / np.sum(cf_matrix)
ax = sns.heatmap(cf_share, annot=True, fmt='.2%', cmap='Blues')

ax.set_title('NonTD vs TD \n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

# Tick labels, listed in the order of the matrix rows/columns
# (sorted class labels: 0 first, then 1).
tick_names = ['False(NonTD)', 'True(TD)']
ax.xaxis.set_ticklabels(tick_names)
ax.yaxis.set_ticklabels(tick_names)

# Display the visualization of the confusion matrix.
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Text report of the ensemble prediction against the true labels.
report_text = classification_report(
    y_true=test_df["label"].values,
    y_pred=test_df["pred"].values,
)
print(report_text)

In [None]:
# Save text, true label and predicted label to the current working directory.
# With to_csv's defaults the DataFrame index is written as an unnamed first column.
# NOTE(review): the file name says "Jira" although the data was read from the
# MongoDB CSV; confirm the intended output name.
test_df[["text","label","pred"]].to_csv("Jira_Test_inference.csv")

In [None]:
test_df