In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 61.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 45.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [2]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AdamW

from torch.utils.data import DataLoader, TensorDataset, RandomSampler


In [3]:
df = pd.read_csv('train_texts.csv')
num_labels = len(set(list(df.values[:,2])))
d = {name:i for i,name in enumerate(set(list(df.values[:,2])))}

num_labels,d

(8,
 {'Akunin': 7,
  'Bulychev': 6,
  'Chehov': 0,
  'Dostoevsky': 5,
  'Gogol': 3,
  'King': 4,
  'Pratchett': 2,
  'Remark': 1})

In [4]:
df

Unnamed: 0,id,text,author
0,0,-Бабушка!- вскричала малютка.- Возьми меня с с...,Dostoevsky
1,1,"Знал ли Скрудж об этом? Разумеется, знал. Да и...",Dostoevsky
2,2,"-С праздником, дядя, с радостью! Дай вам Бог в...",Dostoevsky
3,3,Мы высказали только главную передовую мысль на...,Dostoevsky
4,4,"I. Отдел литературный. Повести, романы, расска...",Dostoevsky
...,...,...,...
1729,1729,"-Хотелось бы мне, чтоб он был здесь.\nКанторек...",Remark
1730,1730,Но для нас в этом-то и заключается их несостоя...,Remark
1731,1731,"-Как дела, Франц?- спрашивает Кропп.\nКеммерих...",Remark
1732,1732,"Мюллер наклоняется:\n-Мы принесли твои вещи, Ф...",Remark


In [5]:
model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=num_labels)
model.train()
optimizer = AdamW(model.parameters(), lr=5e-6)

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

In [6]:
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

Downloading:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

In [7]:
# abs = df.values[:,1][0]
labels = df.values[:,2]
abstracts = []
labels_ = []
for i,text in enumerate(df.values[:,1]):
  tok = tokenizer.tokenize(text)
  l = len(tok)
  lab = d[labels[i]]
  if l > 510:
    t = [tok[510*i:510*(i+1)] for i in range(l//510)]
    t.append(tok[510*(l//510):])
    lab = [d[labels[i]]]*len(t)
    tok = t 
    for j in range(len(tok)):
      abstracts.append(tok[j])
      labels_.append(lab[j])
  else:
    abstracts.append(tok)
    labels_.append(lab)

In [8]:
all_input = []
for i in abstracts:

  article = ["[CLS]"] + i + ["[SEP]"]
  input_ids = tokenizer.convert_tokens_to_ids(article)
  input_mask = [1] * len(input_ids)
  segment_ids = [0] * len(article)

  padding = [0] * (512 - len(input_ids))
  input_ids += padding
  input_mask += padding
  segment_ids += padding

  all_input.append([input_ids,input_mask,segment_ids])

all_input_ids = torch.tensor([i[0] for i in all_input], dtype=torch.long)
all_input_mask = torch.tensor([i[1] for i in all_input], dtype=torch.long)
all_segment_ids = torch.tensor([i[2] for i in all_input], dtype=torch.long)

all_label_ids = torch.tensor([i for i in labels_], dtype=torch.long)

In [9]:
train_data = TensorDataset(all_input_ids,all_input_mask,all_segment_ids,all_label_ids)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=6)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [13]:
model.train()
optimizer = AdamW(model.parameters(), lr=5e-6)
mean_loss = 0 
num_steps = 0

for i in tqdm(range(4)):
  for j,batch in enumerate(train_dataloader):
    optimizer.zero_grad()
    input_ids = batch[0].to(device)
    input_mask = batch[1].to(device)
    segment_ids = batch[2].to(device)
    labels = batch[3].to(device)

    outputs = model(input_ids, attention_mask=input_mask,token_type_ids=segment_ids, labels=labels)
    loss = outputs.loss

    mean_loss+=loss
    num_steps+=len(input_ids)
    if j%20 == 0:
      print(mean_loss/num_steps)
      mean_loss = 0
      num_steps = 0


    loss.backward()
    optimizer.step()


  0%|          | 0/4 [00:00<?, ?it/s]

tensor(0.3001, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.3216, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.3125, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2964, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.3104, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.3028, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2822, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2702, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2600, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2425, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2161, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2516, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2267, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2134, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2130, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1806, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1848, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1842, device='cuda:0', grad_fn=<DivBack

 25%|██▌       | 1/4 [03:45<11:15, 225.09s/it]

tensor(0.1734, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1681, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1541, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1412, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1565, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1505, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1478, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1385, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1134, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1128, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1388, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1237, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1249, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1369, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1332, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1183, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1253, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1122, device='cuda:0', grad_fn=<DivBack

 50%|█████     | 2/4 [07:37<07:38, 229.11s/it]

tensor(0.1079, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1010, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0812, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0997, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0959, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0731, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0760, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0805, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0906, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0722, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0678, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1175, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0769, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0653, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0798, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0609, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0813, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0755, device='cuda:0', grad_fn=<DivBack

 75%|███████▌  | 3/4 [11:28<03:50, 230.37s/it]

tensor(0.0709, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.1649, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0638, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0678, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0578, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0420, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0414, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0431, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0575, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0534, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0463, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0617, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0644, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0397, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0465, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0472, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0486, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.0718, device='cuda:0', grad_fn=<DivBack

100%|██████████| 4/4 [15:20<00:00, 230.23s/it]

tensor(0.0484, device='cuda:0', grad_fn=<DivBackward0>)





In [14]:
model.eval()
rev = { 0:'Akunin',
 6:'Bulychev',
 5:'Chehov',
  2:'Dostoevsky',
  1:'Gogol',
  3:'King',
  4:'Pratchett',
  7:'Remark'}

In [15]:
df_test = pd.read_csv('test_texts.csv')
predictions = []

for text in df_test.values[:,1]:
  inputs = tokenizer(text, return_tensors="pt",max_length=512,truncation=True)
  with torch.no_grad():
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)
    outputs = model(input_ids,token_type_ids=token_type_ids, attention_mask=attention_mask)
    # print(outputs)
    logits = outputs[0]
    probs = logits.softmax(1)
  probs = probs.detach().cpu().numpy()
  predictions.append(rev[probs.argmax()])



In [16]:
import numpy as np
data = pd.DataFrame(data=predictions,index=np.arange(len(df.values[:,2]),len(df.values[:,2])+len(predictions)))

In [None]:
data.to_csv('result.csv')