In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 10.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 1.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 23.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalli

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AdamW

from torch.utils.data import DataLoader, TensorDataset, RandomSampler


In [None]:
# Load the training texts; the third column (position 2) holds the author name.
df = pd.read_csv('train_texts.csv')

# Collect the distinct author names once and number them in the set's own
# iteration order.
# NOTE(review): iteration order of a set of strings is not guaranteed to be the
# same in another interpreter session, so the author -> id mapping may differ
# between runs; anything that needs the inverse should be derived from `d`.
author_set = set(df.values[:, 2])
num_labels = len(author_set)
d = dict((author, idx) for idx, author in enumerate(author_set))

num_labels, d


(8,
 {'Akunin': 0,
  'Bulychev': 6,
  'Chehov': 5,
  'Dostoevsky': 2,
  'Gogol': 1,
  'King': 3,
  'Pratchett': 4,
  'Remark': 7})

In [None]:
# Show the loaded training frame as the cell output for a quick visual check.
df

Unnamed: 0,id,text,author
0,0,-Бабушка!- вскричала малютка.- Возьми меня с с...,Dostoevsky
1,1,"Знал ли Скрудж об этом? Разумеется, знал. Да и...",Dostoevsky
2,2,"-С праздником, дядя, с радостью! Дай вам Бог в...",Dostoevsky
3,3,Мы высказали только главную передовую мысль на...,Dostoevsky
4,4,"I. Отдел литературный. Повести, романы, расска...",Dostoevsky
...,...,...,...
1729,1729,"-Хотелось бы мне, чтоб он был здесь.\nКанторек...",Remark
1730,1730,Но для нас в этом-то и заключается их несостоя...,Remark
1731,1731,"-Как дела, Франц?- спрашивает Кропп.\nКеммерих...",Remark
1732,1732,"Мюллер наклоняется:\n-Мы принесли твои вещи, Ф...",Remark


In [None]:
# Load the pretrained RuBERT encoder with a classification head that has one
# output per author, and switch the module to training mode.
model = BertForSequenceClassification.from_pretrained(
  'DeepPavlov/rubert-base-cased',
  num_labels=num_labels,
)
model.train()

# Optimizer over all model parameters.
trainable_params = model.parameters()
optimizer = AdamW(trainable_params, lr=5e-6)

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

In [None]:
# Load the tokenizer from the 'DeepPavlov/rubert-base-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

Downloading:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

In [None]:
# Tokenize every training text and cut it into chunks of at most CHUNK_SIZE
# tokens. Each chunk becomes its own training sample and inherits the label of
# the text it came from.
#
# Outputs:
#   abstracts -- list of token lists, one per chunk
#   labels_   -- list of integer label ids, parallel to `abstracts`
CHUNK_SIZE = 510  # leaves room for the [CLS] and [SEP] tokens added later

labels = df.values[:,2]
abstracts = []
labels_ = []
for text, author in zip(df.values[:,1], labels):
  tok = tokenizer.tokenize(text)
  # Stepping by CHUNK_SIZE never produces a trailing empty chunk when the
  # token count is an exact multiple of CHUNK_SIZE (the previous slicing
  # appended an empty sample in that case). `or [tok]` keeps the earlier
  # behaviour of emitting a single sample for a text with no tokens.
  chunks = [tok[start:start + CHUNK_SIZE] for start in range(0, len(tok), CHUNK_SIZE)] or [tok]
  abstracts.extend(chunks)
  labels_.extend([d[author]] * len(chunks))

In [None]:
# Convert each token chunk into three parallel lists of length 512:
#   token ids      -- [CLS] + chunk + [SEP], right-padded with 0
#   attention mask -- 1 for real tokens, 0 for padding
#   segment ids    -- all 0 (single-segment input)
all_input = []
for chunk in abstracts:
  ids = tokenizer.convert_tokens_to_ids(["[CLS]", *chunk, "[SEP]"])
  pad = [0] * (512 - len(ids))
  all_input.append([ids + pad, [1] * len(ids) + pad, [0] * len(ids) + pad])

# Stack each of the three columns into its own long tensor.
all_input_ids, all_input_mask, all_segment_ids = (
  torch.tensor([row[col] for row in all_input], dtype=torch.long)
  for col in range(3)
)

# One integer label per chunk, in the same order as the inputs.
all_label_ids = torch.tensor(list(labels_), dtype=torch.long)

In [None]:
# Bundle the input tensors with their labels and serve them in randomly
# ordered mini-batches of 6 samples.
train_data = TensorDataset(
  all_input_ids,
  all_input_mask,
  all_segment_ids,
  all_label_ids,
)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data, batch_size=6, sampler=train_sampler)

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

# Move the model's parameters to the chosen device.
model.to(device)

In [None]:
# Fine-tune the classifier for one pass over the training batches.
model.train()
optimizer = AdamW(model.parameters(), lr=5e-7)

for epoch in range(1):
  # Wrap the batches rather than the epoch range so the bar advances per step.
  progress = tqdm(train_dataloader, desc=f"epoch {epoch}")
  for batch in progress:
    optimizer.zero_grad()
    # Batch layout follows the TensorDataset: ids, attention mask, segment
    # ids, labels. A distinct name is used for the label tensor so the
    # notebook-level `labels` array is not overwritten.
    input_ids, input_mask, segment_ids, batch_labels = (t.to(device) for t in batch)

    outputs = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=batch_labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

    # The loss returned by the model is already averaged over the batch, so
    # it is reported as-is (dividing by the batch size again understated it).
    # `.item()` yields a plain float instead of a tensor with grad_fn.
    progress.set_postfix(loss=f"{loss.item():.4f}")


  0%|          | 0/1 [00:00<?, ?it/s]

tensor(0.0206, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0146, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0120, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0645, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0592, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0366, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0173, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0093, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0556, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0505, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0192, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0162, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0492, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0318, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0046, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0351, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0974, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0150, device='cuda:0', grad_fn=<DivBack

  0%|          | 0/1 [03:34<?, ?it/s]


KeyboardInterrupt: ignored

In [None]:
# Switch to inference mode (disables dropout).
model.eval()

# Map class ids back to author names by inverting the mapping that was used to
# encode the training labels. A hand-copied table is only valid for the session
# it was copied from, because `d` is numbered in set iteration order.
rev = {idx: name for name, idx in d.items()}

In [None]:
# Classify every test text (second column of the test file). Texts longer than
# 512 tokens are truncated by the tokenizer before being fed to the model.
df_test = pd.read_csv('test_texts.csv')
predictions = []

for text in df_test.values[:,1]:
  inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
  # Move exactly the three tensors the model consumes onto the model's device.
  model_inputs = {
    key: inputs[key].to(device)
    for key in ('input_ids', 'token_type_ids', 'attention_mask')
  }
  with torch.no_grad():
    logits = model(**model_inputs)[0]
    probs = logits.softmax(1)
  # Index of the most probable class, translated back to the author name.
  best_class = probs.detach().cpu().numpy().argmax()
  predictions.append(rev[best_class])

predictions

['Dostoevsky',
 'Pratchett',
 'Akunin',
 'Remark',
 'Remark',
 'Pratchett',
 'Bulychev',
 'King',
 'Akunin',
 'Gogol',
 'Dostoevsky',
 'Chehov',
 'Bulychev',
 'Akunin',
 'Pratchett',
 'Dostoevsky',
 'Pratchett',
 'Akunin',
 'Dostoevsky',
 'Pratchett',
 'Bulychev',
 'King',
 'Bulychev',
 'King',
 'Remark',
 'Bulychev',
 'Pratchett',
 'Pratchett',
 'Pratchett',
 'Pratchett',
 'Pratchett',
 'Bulychev',
 'Bulychev',
 'Chehov',
 'Chehov',
 'Gogol',
 'Bulychev',
 'King',
 'King',
 'Bulychev',
 'Bulychev',
 'Remark',
 'Remark',
 'Pratchett',
 'Pratchett',
 'King',
 'Remark',
 'Remark',
 'Bulychev',
 'Bulychev',
 'Pratchett',
 'Pratchett',
 'Remark',
 'Bulychev',
 'King',
 'Bulychev',
 'Dostoevsky',
 'Remark',
 'Remark',
 'Dostoevsky',
 'Bulychev',
 'King',
 'Gogol',
 'Pratchett',
 'Chehov',
 'King',
 'Chehov',
 'Pratchett',
 'Remark',
 'Chehov',
 'Bulychev',
 'Bulychev',
 'Akunin',
 'Pratchett',
 'Pratchett',
 'Remark',
 'Remark',
 'Remark',
 'Pratchett',
 'Remark',
 'Bulychev',
 'Akunin',
 '

In [None]:
# Build the submission frame in this cell so that displaying it does not rely
# on `data` having been assigned by a cell further down (which fails with
# NameError on a fresh kernel). Rows are indexed so that they continue right
# after the last training row.
data = pd.DataFrame(data=predictions, index=range(len(df), len(df) + len(predictions)))
data

Unnamed: 0,0
1734,Dostoevsky
1735,Bulychev
1736,Akunin
1737,King
1738,Remark
...,...
2059,Bulychev
2060,Chehov
2061,Bulychev
2062,Bulychev


In [None]:
# Collect the predictions into a single-column frame whose index continues
# right after the last training row. A plain `range` is used for the index so
# this cell does not need numpy, which is not imported at this point.
first_test_idx = len(df)
data = pd.DataFrame(data=predictions, index=range(first_test_idx, first_test_idx + len(predictions)))

In [None]:
import numpy as np
# Write the prediction frame (index plus its single column) to result.csv.
data.to_csv('result.csv')

'Dostoevsky'