In [None]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

import torch
import transformers
import torch.nn as nn
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification, AutoConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords

tqdm.pandas()

# Prefer the GPU when one is present, but fall back to the CPU so the notebook
# still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
stop_words = stopwords.words('russian')

In [None]:
bert_config = AutoConfig.from_pretrained('rubert_cased_L-12_H-768_A-12_v2/bert_config.json')

In [None]:
tokenizer = BertTokenizer.from_pretrained('rubert_cased_L-12_H-768_A-12_v2/vocab.txt')#, stopwords=stop_words)

In [None]:
tokenizer

In [None]:
bert_config

In [None]:
# AutoModel.from_config() only builds the architecture with randomly
# initialised weights; the pretrained checkpoint has to be loaded explicitly,
# otherwise the (frozen) encoder below is a random feature extractor.
# NOTE(review): assumes the checkpoint directory also contains the PyTorch
# weights file - confirm; from_pretrained raises OSError when it is missing.
bert = AutoModel.from_pretrained('rubert_cased_L-12_H-768_A-12_v2', config=bert_config)

In [None]:
# Freeze the encoder: none of its parameters will receive gradient updates.
bert.requires_grad_(False)

class BERT_Arch(nn.Module):
    """Classification head on top of a BERT-style encoder.

    The second element of the encoder output (taken with ``return_dict=False``)
    is passed through Linear -> ReLU -> Dropout -> Linear -> LogSoftmax.

    Args:
        bert: encoder module; called as ``bert(sent_id, attention_mask=mask,
            return_dict=False)`` and expected to return a 2-tuple.
        hidden_dim: width of the intermediate fully connected layer.
        num_classes: number of output classes.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, bert, hidden_dim=512, num_classes=2, dropout=0.1):
        super().__init__()
        self.bert = bert
        # Read the encoder width from its config when it has one instead of
        # hard-coding 768, so encoders of other sizes work as well.
        encoder_dim = getattr(getattr(bert, 'config', None), 'hidden_size', 768)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(encoder_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-softmax scores (normalised over dim 1) for a batch.

        Args:
            sent_id: token ids, passed to the encoder as its first argument.
            mask: passed to the encoder as ``attention_mask``.
        """
        # Only the second element of the encoder output is used by the head.
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)

In [None]:
df = pd.read_csv('mosgorsud.csv')

In [None]:
df.info()

In [None]:
# Remove fully duplicated rows; the de-duplicated frame replaces `df`.
df = df.drop_duplicates()

In [None]:
df.info()

In [None]:
def verdict_to_int(verdict_str):
    """Map a verdict string to a binary label.

    Matching is done on whole, case-sensitive, whitespace-separated words.

    Args:
        verdict_str: verdict text; any non-string value (e.g. NaN) is treated
            as unknown.

    Returns:
        0 if the text contains the word 'Оставить' ("leave"), 1 if it contains
        'Отменить' ("cancel") or 'Изменить' ("change"), otherwise None.
        'Оставить' takes precedence when several of these words are present.
    """
    if not isinstance(verdict_str, str):
        return None
    # Split once and reuse the result for every membership test.
    words = set(verdict_str.split())
    if 'Оставить' in words:
        return 0
    if words & {'Отменить', 'Изменить'}:
        return 1
    return None

In [None]:
df['target'] = df['verdict_up'].apply(verdict_to_int)

In [None]:
df.sample(5)

In [None]:
for_cloud = ' '.join(df['text'])

In [None]:
len(for_cloud)

In [None]:
for_cloud = for_cloud.lower()

In [None]:
# Strip boilerplate legal phrases and full stops from the word-cloud text.
for _fragment in ('соответствии', 'российской федерации', 'размере', '.'):
    for_cloud = for_cloud.replace(_fragment, '')

In [None]:
# Drop short function words / abbreviations as whole words. Substituting a
# single space (rather than '') keeps the neighbouring words separated, and the
# word-boundary pattern also catches consecutive occurrences.
for_cloud = re.sub(r'\b(?:в|с|ст|гк|рф)\b', ' ', for_cloud)

In [None]:
len(for_cloud)

In [None]:
cloud = WordCloud().generate(for_cloud)

In [None]:
cloud_stop = WordCloud(stopwords=stop_words).generate(for_cloud)

In [None]:
# Show both clouds, each in its own titled figure so they can be told apart.
for _image, _title in ((cloud, 'Word cloud'),
                       (cloud_stop, 'Word cloud (stop words removed)')):
    plt.figure(figsize=(12, 8))
    plt.imshow(_image)
    plt.title(_title)
    plt.axis('off')
# Render explicitly so the cell does not end with a stray axis-limits repr.
plt.show()

In [None]:
def clear_text(text):
    """Lower-case `text` and keep only Cyrillic letters and digits.

    Every other character is replaced by a space and runs of whitespace are
    collapsed into a single space.

    Args:
        text: raw document text; non-string values (e.g. NaN) yield ''.

    Returns:
        The cleaned, single-space-separated string.
    """
    if not isinstance(text, str):
        return ''
    # The text is lower-cased first, so only the lower-case range is needed.
    # 'ё' lies outside the а-я code-point range and must be listed explicitly,
    # otherwise words containing it are split in two.
    return ' '.join(re.sub(r'[^а-яё0-9]', ' ', text.lower()).split())


In [None]:
df['clear_text'] = df['text'].apply(clear_text)

In [None]:
# Boilerplate words and phrases removed by clear_words(). Longer phrases come
# first so that, for example, 'а также' is matched before 'также'.
_BOILERPLATE = (
    'российской федерации',
    'р е ш е н и е',
    'соответствии',
    'а также',
    'также',
    'размере',
    'решение',
    'именем',
    'уид',
    'ст',
    'гк',
    'рф',
    'в',
    'с',
)
_BOILERPLATE_RE = re.compile(
    r'\b(?:' + '|'.join(re.escape(item) for item in _BOILERPLATE) + r')\b'
)


def clear_words(text):
    """Remove boilerplate words and phrases from already-cleaned text.

    Each entry of `_BOILERPLATE` is removed only where it appears as a whole
    word or phrase. A match is replaced by a space, so the words on either side
    stay separated, and whitespace is collapsed afterwards.

    Args:
        text: lower-cased, space-separated text.

    Returns:
        The text without the boilerplate entries, single-space-separated.
    """
    without_boilerplate = _BOILERPLATE_RE.sub(' ', text)
    return ' '.join(without_boilerplate.split())

In [None]:
df['clear_word'] = df['clear_text'].apply(clear_words)

In [None]:
df['clear_word']

In [None]:
df['target'].isna().sum()

In [None]:
# Keep only the rows with a recognised verdict. astype() returns a new frame,
# so later column assignments do not write into a slice of the original, and
# the labels become plain integers once the missing values are gone.
df = df[df['target'].notna()].astype({'target': 'int64'})

In [None]:
df.info()

In [None]:
#df['target'] = df['target'].astype('int')

In [None]:
df_prep = pd.DataFrame()

In [None]:
df_prep['text'] = df['clear_word']
df_prep['target'] = df['target']

In [None]:
train_df, temp_df = train_test_split(df_prep, test_size=0.3, random_state=17)

In [None]:
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=17)

In [None]:
# Pull the model inputs (text, forced to str) and the labels out of each split.
_splits = (train_df, val_df, test_df)
train_text, val_text, test_text = [part['text'].astype('str') for part in _splits]
train_labels, val_labels, test_labels = [part['target'] for part in _splits]

In [None]:
test_labels

In [None]:
list_train = list(train_labels.value_counts()/train_labels.shape[0])
list_val = list(val_labels.value_counts()/val_labels.shape[0])
list_test = list(test_labels.value_counts()/test_labels.shape[0])

In [None]:
x_title = list(train_labels.value_counts().index)
x_title

In [None]:
# Plot the class shares of every split. The shares are computed per split and
# ordered by class label here, so each bar is drawn at its own class even when
# the splits rank the classes differently by frequency.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(8, 3))
for _ax, (_title, _split_labels) in zip(axes, (('train_target', train_labels),
                                               ('val_target', val_labels),
                                               ('test_target', test_labels))):
    _shares = _split_labels.value_counts(normalize=True).sort_index()
    _ax.bar(_shares.index, _shares.values, width=0.6)
    _ax.set_title(_title)

plt.show()

In [None]:
test_labels.value_counts()

In [None]:
# Peek at one training sample. Index label 6442 only exists for a particular
# data/split combination, so fall back to the first row when it is absent.
train_text.get(6442, train_text.iloc[0])

In [None]:
# Histogram of the number of whitespace-separated words per training text.
seq_len = train_text.map(lambda sample: len(str(sample).split())).tolist()
pd.Series(seq_len).hist(bins=60)

In [None]:
train_text.values

In [None]:
def _encode(texts):
    """Tokenize `texts`, padding and truncating every sequence to 500 tokens."""
    return tokenizer.batch_encode_plus(
        texts,
        max_length=500,
        padding='max_length',
        truncation=True,
    )


# Same tokenizer settings for all three splits.
tokens_train = _encode(train_text.values)
tokens_val = _encode(val_text.values)
tokens_test = _encode(test_text.values)


In [None]:

def _as_tensors(encoded, labels):
    """Return (input_ids, attention_mask, labels) as torch tensors."""
    return (
        torch.tensor(encoded['input_ids']),
        torch.tensor(encoded['attention_mask']),
        torch.tensor(labels.values),
    )


train_seq, train_mask, train_y = _as_tensors(tokens_train, train_labels)
val_seq, val_mask, val_y = _as_tensors(tokens_val, val_labels)
test_seq, test_mask, test_y = _as_tensors(tokens_test, test_labels)

batch_size = 8

# Training batches are drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read sequentially.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [None]:
# Freeze the encoder: none of its parameters will receive gradient updates.
bert.requires_grad_(False)

# NOTE(review): BERT_Arch is also defined in an earlier cell of this notebook;
# this later definition is the one in effect when the model is built. Consider
# keeping a single definition.
class BERT_Arch(nn.Module):
    """Classification head on top of a BERT-style encoder.

    The second element of the encoder output (taken with ``return_dict=False``)
    is passed through Linear -> ReLU -> Dropout -> Linear -> LogSoftmax.

    Args:
        bert: encoder module; called as ``bert(sent_id, attention_mask=mask,
            return_dict=False)`` and expected to return a 2-tuple.
        hidden_dim: width of the intermediate fully connected layer.
        num_classes: number of output classes.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, bert, hidden_dim=512, num_classes=2, dropout=0.1):
        super().__init__()
        self.bert = bert
        # Read the encoder width from its config when it has one instead of
        # hard-coding 768, so encoders of other sizes work as well.
        encoder_dim = getattr(getattr(bert, 'config', None), 'hidden_size', 768)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(encoder_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-softmax scores (normalised over dim 1) for a batch.

        Args:
            sent_id: token ids, passed to the encoder as its first argument.
            mask: passed to the encoder as ``attention_mask``.
        """
        # Only the second element of the encoder output is used by the head.
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)

In [None]:
model = BERT_Arch(bert).to(device)

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the values the transformers
# version used by default, so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, eps=1e-6, weight_decay=0.0)

In [None]:
list(np.unique(train_labels))

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Balanced class weights from the training labels. `classes` is passed as an
# ndarray, which is the type scikit-learn documents for this argument.
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)

print(class_weights)
#[0.8086199  1.31005794]

weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# The weights were computed but never handed to the loss. The model already
# outputs log-softmax scores, so the weighted negative log-likelihood is the
# matching criterion. The variable keeps its name because train() and
# evaluate() refer to it.
cross_entropy = nn.NLLLoss(weight=weights)
epochs = 20

In [None]:
def train():
    """Run one training epoch over `train_dataloader`.

    Uses the notebook-level `model`, `optimizer`, `cross_entropy` and `device`.

    Returns:
        (avg_loss, total_preds): the mean of the per-batch losses, and the
        model outputs of all batches concatenated along axis 0 (NumPy array).
    """
    model.train()
    total_loss = 0.0
    total_preds = []

    for batch in tqdm(train_dataloader):
        sent_id, mask, labels = (item.to(device) for item in batch)
        # Cast in place on the current device; .type(torch.LongTensor) would
        # first copy the labels back to the CPU.
        labels = labels.long()

        model.zero_grad()
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss += loss.item()

        loss.backward()
        # Limit the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_preds.append(preds.detach().cpu().numpy())

    avg_loss = total_loss / len(train_dataloader)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
def evaluate():
    """Compute the loss and outputs of `model` over `val_dataloader`.

    Uses the notebook-level `model`, `cross_entropy` and `device`. The model is
    put into eval mode and no gradients are tracked.

    Returns:
        (avg_loss, total_preds): the mean of the per-batch losses, and the
        model outputs of all batches concatenated along axis 0 (NumPy array).
    """
    model.eval()
    total_loss = 0.0
    total_preds = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            sent_id, mask, labels = (item.to(device) for item in batch)
            # Cast in place on the current device; .type(torch.LongTensor)
            # would first copy the labels back to the CPU.
            labels = labels.long()

            preds = model(sent_id, mask)
            total_loss += cross_entropy(preds, labels).item()
            total_preds.append(preds.cpu().numpy())

    avg_loss = total_loss / len(val_dataloader)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# Lowest validation loss seen so far and the per-epoch loss history.
best_valid_loss = float('inf')
train_losses, valid_losses = [], []

for epoch in range(epochs):
    print(f'\n Epoch{epoch + 1} / {epochs}')

    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    print(f'\nTraining loss: {train_loss:.3f}')
    print(f'Validation loss: {valid_loss:.3f}')

In [None]:
path = 'saved_weights.pt'
# map_location makes the checkpoint loadable on whichever device is in use,
# e.g. a CPU-only machine when the weights were saved from a GPU run.
model.load_state_dict(torch.load(path, map_location=device))

In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()

# Make sure dropout is disabled for inference regardless of what ran before.
model.eval()

# Walk the test set in chunks of `batch_size` rows. torch.split operates on
# tensors directly, and the chunk size no longer grows with the test set as it
# did when splitting into a fixed number of 50 parts.
list_seq = torch.split(test_seq, batch_size)
list_mask = torch.split(test_mask, batch_size)


predictions = []
with torch.no_grad():
    for _seq_chunk, _mask_chunk in zip(list_seq, list_mask):
        preds = model(_seq_chunk.to(device), _mask_chunk.to(device))
        predictions.append(preds.cpu().numpy())

In [None]:
# Column 1 of the model output (the class-1 score) for every test row, in
# test-set order.
_class1_scores = np.concatenate(predictions, axis=0)[:, 1]

# Min-max scale the scores to [0, 1]; a constant score vector maps to zeros
# instead of dividing by zero.
# NOTE(review): the scaling depends on the test set itself, so "confidence" is
# a relative ranking score rather than a probability - confirm this is intended.
_score_range = _class1_scores.max() - _class1_scores.min()
if _score_range > 0:
    flat_preds = (_class1_scores - _class1_scores.min()) / _score_range
else:
    flat_preds = np.zeros_like(_class1_scores)

# assign() builds a new frame instead of writing into the frame returned by
# train_test_split, which can trigger pandas' chained-assignment warning.
test_df = test_df.assign(confidence=flat_preds)

In [None]:
# Rows whose confidence exceeds 0.82 are labelled 1, all others 0.
test_df['pred'] = (test_df['confidence'] > 0.82).astype('int64')


print(classification_report(test_df['target'], test_df['pred']))

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
print(roc_auc_score(test_df['target'], test_df['confidence'] ))