In [1]:
import pandas as pd
import numpy as np

Подключим диск для загрузки данных

In [2]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


Загружаем данные

In [3]:
df = pd.read_csv('gdrive/MyDrive/test_task/train.csv')

In [4]:
df.category.value_counts()

card_payment_fee_charged                            187
direct_debit_payment_not_recognised                 182
balance_not_updated_after_cheque_or_cash_deposit    181
wrong_amount_of_cash_received                       180
cash_withdrawal_charge                              177
                                                   ... 
lost_or_stolen_card                                  82
card_swallowed                                       61
card_acceptance                                      59
virtual_card_not_working                             41
contactless_not_working                              35
Name: category, Length: 77, dtype: int64

Расставим метки

In [5]:
for num, name_class in enumerate(df.category.unique()):
  df.loc[df.category==name_class, 'class'] = num

In [None]:
df

Unnamed: 0,text,category,class
0,I am still waiting on my card?,card_arrival,0.0
1,What can I do if my card still hasn't arrived ...,card_arrival,0.0
2,I have been waiting over a week. Is the card s...,card_arrival,0.0
3,Can I track my card while it is in the process...,card_arrival,0.0
4,"How do I know if I will get my card, or if it ...",card_arrival,0.0
...,...,...,...
9998,You provide support in what countries?,country_support,76.0
9999,What countries are you supporting?,country_support,76.0
10000,What countries are getting support?,country_support,76.0
10001,Are cards available in the EU?,country_support,76.0


Предобработка текста

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Удалим знаки припинания

In [7]:
import re
df['text_new'] = [re.sub(r'[^\w\s]','', w.lower()) for w in df.text]
df.head(2)

Unnamed: 0,text,category,class,text_new
0,I am still waiting on my card?,card_arrival,0.0,i am still waiting on my card
1,What can I do if my card still hasn't arrived ...,card_arrival,0.0,what can i do if my card still hasnt arrived a...


In [8]:
from nltk.corpus import stopwords
from nltk import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
stop_words = stopwords.words('english')
df['text_preprocess'] = [' '.join([w for w in tokenizer.tokenize(text.lower()) if w not in stop_words]) for text in df.text_new]

In [9]:
df.head(2)

Unnamed: 0,text,category,class,text_new,text_preprocess
0,I am still waiting on my card?,card_arrival,0.0,i am still waiting on my card,still waiting card
1,What can I do if my card still hasn't arrived ...,card_arrival,0.0,what can i do if my card still hasnt arrived a...,card still hasnt arrived 2 weeks


Построим baseLine на TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
train, test = train_test_split(df, test_size=0.3, random_state=241)

In [None]:
test.category.value_counts()

transaction_charged_twice                           62
balance_not_updated_after_cheque_or_cash_deposit    60
wrong_exchange_rate_for_cash_withdrawal             59
balance_not_updated_after_bank_transfer             55
card_payment_not_recognised                         54
                                                    ..
top_up_limits                                       22
contactless_not_working                             17
card_swallowed                                      17
card_acceptance                                     17
virtual_card_not_working                             9
Name: category, Length: 77, dtype: int64

In [None]:
vec = TfidfVectorizer(max_features=5000)
X_train = vec.fit_transform(train.text)
X_test = vec.transform(test.text)

In [None]:
lr = LogisticRegression(C=2.5)
lr.fit(X_train, train['class'])

LogisticRegression(C=2.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Оценка качества

In [None]:
from sklearn.metrics import f1_score

In [None]:
predict = lr.predict(X_test)
print("f1 score", f1_score(predict, test['class'], average='macro'))

f1 score 0.8670151739732231


Напишем сеть  со своим мешком слов

In [11]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.functional as F

Посчитаем количество вхождения слов в полном тексте

In [12]:
from collections import Counter
token_counts = Counter()

In [13]:
for line in df.text_preprocess:
  token_counts.update(line.split())

UNK и PAD для обозначения начала и конца предложения

Можно убрать во время сортировки маленькое кол-во слов

In [14]:
tokens = sorted(t for t, c in token_counts.items())

tokens = ['UNK', 'PAD'] + tokens

создадим мешок слов по словам, закодировав слова по порядку

In [15]:
bow = {t: i for i, t in enumerate(tokens)}

In [16]:
UNK_IX, PAD_IX = bow['UNK'], bow['PAD'] # 0 и 1 для начала и конца предложения
print(UNK_IX, PAD_IX)

0 1


Функция для создания матрицы мешка слов

In [17]:
def get_matrix_bow(lines, bow):
  seq = [i.split() for i in lines]
  max_len = max(map(len, seq))

  matrix = np.full((len(seq), max_len), PAD_IX)

  for i, s in enumerate(seq):
    row_ix = [bow.get(word, UNK_IX) for word in s[:max_len]]
    matrix[i, :len(row_ix)] = row_ix
  return matrix

In [18]:
train, test = train_test_split(df, test_size=0.25, random_state=241)

In [19]:
def make_batch(data, columns: list, device):
  batch = {}
  for col in columns:
    if col.rfind('text')!=-1:
      batch[col] = torch.tensor(get_matrix_bow(data[col], bow), device=device, dtype=torch.int64)
    else: 
      batch[col] = torch.tensor(data[col].values, device=device, dtype=torch.int64)
  return batch

In [20]:
columns = ['text_preprocess', 'class']

train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')
DEVICE = torch.device("cpu")

CUDA is not available.  Training on CPU ...


In [21]:
make_batch(test, columns, device=DEVICE)

{'class': tensor([61, 72, 62,  ..., 46, 45, 17]),
 'text_preprocess': tensor([[2019,  330,  106,  ...,    1,    1,    1],
         [ 350,  767, 2245,  ...,    1,    1,    1],
         [1427, 1963, 1562,  ...,    1,    1,    1],
         ...,
         [2052, 1897, 1390,  ...,    1,    1,    1],
         [2062, 1378,  321,  ...,    1,    1,    1],
         [1427,  310, 1093,  ...,    1,    1,    1]])}

In [31]:
BATCH_SIZE =  64
EPOCHS = 4

n_class = len(df['class'].unique())


In [23]:
class Classifier(nn.Module):
    def __init__(self, n_tokens=len(tokens), emb_size=64, hid_size=128, n_class=n_class):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=n_tokens, embedding_dim=emb_size)
        self.cnn1 = nn.Sequential(
                            nn.Conv1d(emb_size, hid_size, kernel_size=2),
                            nn.Dropout(p=0.25),
                            nn.ReLU()
                            )
        self.cnn2 = nn.Sequential(
                            nn.Conv1d(hid_size, hid_size, kernel_size=2),
                            nn.Dropout(p=0.25),
                            nn.ReLU()
                            )
        self.cnn3 = nn.Sequential(
                            nn.Conv1d(hid_size, hid_size, kernel_size=2),
                            nn.Dropout(p=0.25),
                            nn.ReLU(),
                            nn.AdaptiveMaxPool1d(output_size=1)
                            )
        self.linear = nn.Sequential(
                            nn.Linear(hid_size, 4*hid_size),
                            nn.ReLU(),
                            nn.Linear(hid_size*4, hid_size*4),
                            nn.ReLU(),
                            nn.Linear(4*hid_size, n_class)
                            )
        
    def __call__(self, input_ix):
        x = self.emb(input_ix).transpose(1, 2)
        x = self.cnn1(x)
        x = self.cnn2(x)
        x = self.cnn3(x).transpose(1, 2)
        # x = self.emb(input_ix)
        # x = self.LSTM(x)[0]
        # x = self.LSTM2(x)[0]
        # x = self.LSTM3(x)[0][:, -1]
        # x = self.relu(self.linear(x))
        # x = self.relu(self.linear2(x))
        # x = self.relu(self.linear3(x))
        # x = self.linear4(x)
        return self.linear(x).squeeze()
        

Проверка

In [None]:
make_batch(df[:10], columns=['text_preprocess', 'class'], device=DEVICE)['text_preprocess'].una

tensor([[1882, 2190,  321,    1,    1,    1],
        [ 321, 1882,  920,  174,   11, 2218],
        [2190, 2213,  321, 1882,  401,    1],
        [2031,  321, 1477,  533,    1,    1],
        [1083,  860,  321, 1155,    1,    1],
        [1763, 1279,  321,    1,    1,    1],
        [1016,  321,  533,    1,    1,    1],
        [1882, 1563, 1279,  321,    1,    1],
        [1360,  321, 2033,    1,    1,    1],
        [1338,  321, 1882, 1058,    1,    1]])

In [None]:
model = Classifier().to(DEVICE)
model(make_batch(df[:10], columns=['text_preprocess', 'class'], device=DEVICE)['text_preprocess'])

torch.Size([10, 64, 6])
torch.Size([10, 128, 5])
torch.Size([10, 128, 4])
torch.Size([10, 1, 128])


tensor([[0.0125, 0.0124, 0.0141, 0.0126, 0.0128, 0.0125, 0.0141, 0.0132, 0.0134,
         0.0133, 0.0137, 0.0136, 0.0126, 0.0126, 0.0129, 0.0133, 0.0132, 0.0125,
         0.0124, 0.0127, 0.0133, 0.0133, 0.0121, 0.0117, 0.0140, 0.0138, 0.0133,
         0.0130, 0.0134, 0.0131, 0.0124, 0.0133, 0.0134, 0.0129, 0.0124, 0.0132,
         0.0134, 0.0127, 0.0129, 0.0127, 0.0127, 0.0133, 0.0132, 0.0127, 0.0131,
         0.0132, 0.0140, 0.0129, 0.0119, 0.0125, 0.0135, 0.0131, 0.0131, 0.0128,
         0.0125, 0.0133, 0.0130, 0.0126, 0.0121, 0.0126, 0.0130, 0.0119, 0.0123,
         0.0138, 0.0137, 0.0144, 0.0128, 0.0126, 0.0132, 0.0121, 0.0128, 0.0135,
         0.0127, 0.0127, 0.0127, 0.0135, 0.0132],
        [0.0126, 0.0124, 0.0139, 0.0125, 0.0126, 0.0126, 0.0139, 0.0132, 0.0136,
         0.0133, 0.0135, 0.0134, 0.0126, 0.0126, 0.0130, 0.0133, 0.0131, 0.0126,
         0.0126, 0.0125, 0.0133, 0.0136, 0.0122, 0.0120, 0.0140, 0.0139, 0.0134,
         0.0135, 0.0131, 0.0129, 0.0124, 0.0131, 0.0132, 0.

In [24]:
def iterate_minibatches(data, batch_size=256, shuffle=True, cycle=False, device=torch.device('cuda')):
    """ iterates minibatches of data in random order """
    while True:
        indices = np.arange(len(data))
        if shuffle:
            indices = np.random.permutation(indices)
        
        for start in range(0, len(indices), batch_size):
            batch = make_batch(data.iloc[indices[start : start + batch_size]], columns=['text_preprocess', 'class'], device=device)
            yield batch
        
        if not cycle: break

In [32]:
import tqdm
from torch.optim import lr_scheduler
model = Classifier().to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.1)

for epoch in range(EPOCHS):
    print(f"epoch: {epoch}")
    model.train()
    for i, batch in enumerate(
            tqdm.tqdm_notebook(iterate_minibatches(train, batch_size=BATCH_SIZE, device=DEVICE))):
        pred = model(batch['text_preprocess'])
        target = batch['class'].long()
        loss = criterion(pred, batch['class'])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        if i % 10 == 0:
          print('loss = %f'%loss.item())

epoch: 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':


0it [00:00, ?it/s]

loss = 4.344515
loss = 4.338206
loss = 4.344456
loss = 4.165216
loss = 4.187551
loss = 3.991929
loss = 4.088512
loss = 4.002235
loss = 3.925064
loss = 4.021451
loss = 3.988570
loss = 3.824642
epoch: 1


0it [00:00, ?it/s]

loss = 3.673860
loss = 3.710098
loss = 3.291657
loss = 3.581052
loss = 3.403355
loss = 3.209296
loss = 3.355736
loss = 3.391734
loss = 3.406785
loss = 3.212821
loss = 3.024900
loss = 2.824464
epoch: 2


0it [00:00, ?it/s]

loss = 2.771381
loss = 2.761221
loss = 2.982860
loss = 2.878363
loss = 2.752702
loss = 2.776664
loss = 2.891356
loss = 2.841861
loss = 2.968832
loss = 2.861952
loss = 2.833422
loss = 2.801727
epoch: 3


0it [00:00, ?it/s]

loss = 2.912614
loss = 2.850012
loss = 2.960395
loss = 2.533790
loss = 2.861487
loss = 2.554631
loss = 2.762882
loss = 2.756321
loss = 2.797820
loss = 2.607953
loss = 2.876592
loss = 2.720827


In [33]:
model.eval()
with torch.no_grad():
  pred = model(make_batch(test, columns=['text_preprocess', 'class'], device=DEVICE)['text_preprocess'])
  pred = torch.argmax(pred, dim=1).cpu().detach().numpy()

In [34]:
from sklearn.metrics import f1_score

In [35]:
f1_score(test['class'], pred, average='macro')

0.15443694815919606