In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import re
import nltk
from nltk.stem import WordNetLemmatizer

from tqdm import tqdm
tqdm.pandas()
from nltk.corpus import stopwords, wordnet
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, AdamW, AutoTokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

pd.options.display.float_format = '{:,.2f}'.format
RS = 12345

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device == torch.device('cpu'):
    print('Using cpu')
else:
    n_gpu = torch.cuda.device_count()
    print('Using {} GPUs'.format(torch.cuda.get_device_name(0)))

Using cpu


## Data preparation

In [4]:
# links to training and test samples
url_train = "https://raw.githubusercontent.com/PolyAI-LDN/task-specific-datasets/master/banking_data/train.csv"
url_test = "https://raw.githubusercontent.com/PolyAI-LDN/task-specific-datasets/master/banking_data/test.csv"

In [5]:
# Load the CSV file into a DataFrame
df_train = pd.read_csv(url_train)
df_test = pd.read_csv(url_test)

In [6]:
# Let's explore the data
df_train.head()

Unnamed: 0,text,category
0,I am still waiting on my card?,card_arrival
1,What can I do if my card still hasn't arrived ...,card_arrival
2,I have been waiting over a week. Is the card s...,card_arrival
3,Can I track my card while it is in the process...,card_arrival
4,"How do I know if I will get my card, or if it ...",card_arrival


In [7]:
# Information about features
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10003 entries, 0 to 10002
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      10003 non-null  object
 1   category  10003 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [8]:
# Exploring gaps in a dataset
df_train.isna().mean()*100

text       0.00
category   0.00
dtype: float64

In [9]:
# Find duplicates
df_train.duplicated().sum()

0

In [10]:
# function for analysis categorical features
def unique(colomns, data):
    for column in colomns:
        print(f'Number of unique values ​​in a column {column}: {data[column].nunique()}')
        print(data[column].unique())
        print('-----------------------------')

In [11]:
# Let's check the correctness of the toxic column category
unique(['category'], df_train)

Number of unique values ​​in a column category: 77
['card_arrival' 'card_linking' 'exchange_rate'
 'card_payment_wrong_exchange_rate' 'extra_charge_on_statement'
 'pending_cash_withdrawal' 'fiat_currency_support'
 'card_delivery_estimate' 'automatic_top_up' 'card_not_working'
 'exchange_via_app' 'lost_or_stolen_card' 'age_limit' 'pin_blocked'
 'contactless_not_working' 'top_up_by_bank_transfer_charge'
 'pending_top_up' 'cancel_transfer' 'top_up_limits'
 'wrong_amount_of_cash_received' 'card_payment_fee_charged'
 'transfer_not_received_by_recipient' 'supported_cards_and_currencies'
 'getting_virtual_card' 'card_acceptance' 'top_up_reverted'
 'balance_not_updated_after_cheque_or_cash_deposit'
 'card_payment_not_recognised' 'edit_personal_details'
 'why_verify_identity' 'unable_to_verify_identity' 'get_physical_card'
 'visa_or_mastercard' 'topping_up_by_card' 'disposable_card_limits'
 'compromised_card' 'atm_support' 'direct_debit_payment_not_recognised'
 'passcode_forgotten' 'declined_ca

In [12]:
# Let's analyze how many records we have for each category
category_counts = df_train['category'].value_counts()

# Create a table with the number of records and the percentage of the total
category_table = pd.DataFrame({
    'Count': category_counts,
    'Percentage': (category_counts / category_counts.sum() * 100).round(2) 
})

In [13]:
print(category_table)

                                                  Count  Percentage
category                                                           
card_payment_fee_charged                            187        1.87
direct_debit_payment_not_recognised                 182        1.82
balance_not_updated_after_cheque_or_cash_deposit    181        1.81
wrong_amount_of_cash_received                       180        1.80
cash_withdrawal_charge                              177        1.77
transaction_charged_twice                           175        1.75
declined_cash_withdrawal                            173        1.73
transfer_fee_charged                                172        1.72
transfer_not_received_by_recipient                  171        1.71
balance_not_updated_after_bank_transfer             171        1.71
request_refund                                      169        1.69
card_payment_not_recognised                         168        1.68
card_payment_wrong_exchange_rate                

The classes are not balanced, balancing is needed

In [14]:
# Let's explore the data
df_test.head()

Unnamed: 0,text,category
0,How do I locate my card?,card_arrival
1,"I still have not received my new card, I order...",card_arrival
2,I ordered a card but it has not arrived. Help ...,card_arrival
3,Is there a way to know when my card will arrive?,card_arrival
4,My card has not arrived yet.,card_arrival


In [15]:
# POS-тег 
def get_wordnet_pos(word: str) -> str:
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    
    return tag_dict.get(tag, wordnet.NOUN)

In [16]:
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    text = text.lower()
    words = nltk.word_tokenize(text)
    lemm_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
    lemm_text = " ".join(lemm_words)
    cleared_text = re.sub(r'[^a-zA-Z\s]', '', lemm_text) 
    return cleared_text

In [17]:
df_train['lemm_text'] = df_train['text'].progress_apply(lemmatize)
print(df_train['lemm_text'].head(3))

100%|███████████████████████████████████████████████████████████████████████████| 10003/10003 [00:51<00:00, 195.42it/s]

0                          i be still wait on my card 
1    what can i do if my card still have nt arrive ...
2    i have be wait over a week  be the card still ...
Name: lemm_text, dtype: object





In [18]:
df_train =df_train.drop(['text'], axis=1)
df_train.sample()

Unnamed: 0,category,lemm_text
8036,top_up_by_cash_or_cheque,i try to deposit a cheque into my account and ...


## Training model BERT

In [19]:
features = df_train['lemm_text'].values 

features = ["[CLS] " + feature + " [SEP]" for feature in features]

target = df_train['category'].values

In [20]:
target

array(['card_arrival', 'card_arrival', 'card_arrival', ...,
       'country_support', 'country_support', 'country_support'],
      dtype=object)

In [21]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(target)

In [22]:
print(f'Training sample size:')
display(df_train.shape)

print(f'Test sample size:')
display(df_test.shape)

Training sample size:


(10003, 2)

Test sample size:


(3080, 2)

In [23]:
%%time
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(i) for i in features]
print (tokenized_texts[0])

['[CLS]', 'i', 'be', 'still', 'wait', 'on', 'my', 'card', '[SEP]']
CPU times: total: 1.69 s
Wall time: 5.04 s


In [24]:
MAX_LEN = 155
BATCH_SIZE = 16
input_ids = [tokenizer.convert_tokens_to_ids(x[:150]) for x in tokenized_texts]
input_ids = pad_sequences(
    input_ids,
    maxlen = MAX_LEN,
    dtype = "long",
    truncating = "post",
    padding = "post"
)
attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]

In [25]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, target, 
                                                                                    random_state=RS, test_size=0.1)

train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=RS, test_size=0.1)

In [26]:
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

In [27]:
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

In [28]:
from torch.utils.data import WeightedRandomSampler
from sklearn.utils.class_weight import compute_class_weight

# Вычисляем веса классов
class_weights = compute_class_weight(
    class_weight='balanced', 
    classes=np.unique(target), 
    y=target
)

# Вычисляем веса для каждого примера
sample_weights = [class_weights[label] for label in train_labels]

# Создаем WeightedRandomSampler
sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)

In [29]:
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(
    train_data,
    sampler = sampler,
    batch_size = BATCH_SIZE 
)

In [30]:
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(
    validation_data,
    sampler = SequentialSampler(validation_data),
    batch_size = BATCH_SIZE 
)

In [31]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=77)
model.to(device) 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [32]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters,lr=1e-5)

In [33]:
%%time
train_loss_set = []
train_loss = 0

model.train()

for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    b_input_ids = torch. tensor(b_input_ids). to (torch. int64)
    b_labels = b_labels.to(torch.long)
    optimizer.zero_grad()
    
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

    train_loss_set.append(loss[0].item())  
    
    loss[0].backward()
    
    optimizer.step()

    train_loss += loss[0].item()
    
print("Loss on train sample: {0:.5f}".format(train_loss / len(train_dataloader)))

model.eval()

valid_preds, valid_labels = [], []

for batch in validation_dataloader:   
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    b_input_ids = torch. tensor(b_input_ids). to (torch. int64)
    b_labels = b_labels.to(torch.long)
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids = None, attention_mask = b_input_mask)

    logits = logits[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    batch_preds = np.argmax(logits, axis=1)
    batch_labels = np.array(label_ids)     
    valid_preds.extend(batch_preds)
    valid_labels.extend(batch_labels)

print("Percentage of correct predictions on the validation set: {0:.2f}%".format(
    accuracy_score(valid_labels, valid_preds) * 100))

Loss on train sample: 3.85794
Percentage of correct predictions on the validation set: 48.25%
CPU times: total: 5h 16min 59s
Wall time: 1h 4min 19s


In [34]:
f1_BERT = round(f1_score(valid_labels, valid_preds, average='macro'), 2)

In [35]:
print("F1_score BERT: {0:.2f}".format(f1_score(valid_labels, valid_preds, average='macro')))

F1_score BERT: 0.46
