In [None]:
#pip install pip --upgrade >> None
#conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
#pip install --no-cache-dir transformers sentencepiece >> None

In [1]:
import math
import ast
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import os
from IPython.display import clear_output

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch

In [2]:
# Check that PyTorch can see a CUDA device; the model is later moved to 'cuda'.
torch.cuda.is_available()

True

In [3]:
# Number of CUDA devices visible to PyTorch.
torch.cuda.device_count()

1

In [4]:
# Name of the CUDA device with index 0.
torch.cuda.get_device_name(0)

'NVIDIA A100 80GB PCIe'

In [5]:
def load_model(model_checkpoint, device='cuda'):
    """Load a sequence-classification model and move it to ``device``.

    Args:
        model_checkpoint: Checkpoint identifier or path handed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        device: Target passed to ``.to()``; defaults to ``'cuda'``.

    Returns:
        The model object returned by the ``.to(device)`` call.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        return_dict=True,
    )
    classifier = classifier.to(device)
    # Clear whatever the current cell has displayed so far (e.g. loader output).
    clear_output()
    return classifier

In [6]:
def load_tokenizer(model_checkpoint, use_fast=True):
    """Load the tokenizer that belongs to ``model_checkpoint``.

    Args:
        model_checkpoint: Checkpoint identifier or path handed to
            ``AutoTokenizer.from_pretrained``.
        use_fast: Forwarded unchanged as the ``use_fast`` keyword.

    Returns:
        The tokenizer object created by ``from_pretrained``.
    """
    loaded = AutoTokenizer.from_pretrained(
        model_checkpoint,
        use_fast=use_fast,
    )
    # Clear whatever the current cell has displayed so far (e.g. loader output).
    clear_output()
    return loaded

In [7]:
def predict(data, text_col, model, tokenizer, batch_size, max_length=200, padding=True, truncation=True, return_tensors='pt', device='cuda', order=[0, 1, 2],
            exception_max_length=514):
  """Run batched inference over the non-null texts of ``data[text_col]``.

  Args:
    data: DataFrame that holds the texts.
    text_col: Name of the text column; rows where it is null are skipped.
    model: Callable invoked as ``model(**inputs)``; its result must expose ``.logits``.
    tokenizer: Callable invoked on a list of texts; its result must support ``.to(device)``.
    batch_size: Number of texts per forward pass.
    max_length: Value forwarded to the tokenizer as ``max_length``, or the string
      ``'max_per_batch'`` to use the largest whitespace word count of each batch,
      capped at ``exception_max_length``.
    padding, truncation, return_tensors: Forwarded unchanged to the tokenizer.
    device: Device the tokenizer output is moved to.
    order: Column indices used to reorder the softmax output before the argmax.
      The list is only read, never mutated.
    exception_max_length: Upper bound for the per-batch length in ``'max_per_batch'`` mode.

  Returns:
    Tuple ``(probs, labels)``: a 2-D array with one row of reordered softmax
    probabilities per non-null text (in the row order of ``data``) and a 1-D array
    with the argmax of each row. Both arrays are empty when there is no non-null text.
  """
  # Select with a boolean mask: this keeps the positional row order and, unlike a
  # label-based lookup, does not duplicate rows when index labels repeat.
  texts = data.loc[data[text_col].notnull(), text_col].tolist()
  if not texts:
    # np.vstack / np.hstack raise on an empty list, so return empty results explicitly.
    return np.empty((0, len(order)), dtype=np.float32), np.empty((0,), dtype=np.int64)

  per_batch_limit = max_length == 'max_per_batch'
  if per_batch_limit:
    # NOTE(review): whitespace word counts are used as the token budget. Sub-word
    # tokenizers and special tokens usually need more positions than that, so the
    # longest texts of a batch may be truncated -- confirm this is intended.
    word_counts = [len(text.split(' ')) for text in texts]

  predicted_probs = []
  predicted_labels = []
  with torch.no_grad():
    for start in tqdm(range(0, len(texts), batch_size)):
      stop = start + batch_size
      if per_batch_limit:
        batch_max_length = min(max(word_counts[start:stop]), exception_max_length)
      else:
        batch_max_length = max_length
      inputs = tokenizer(texts[start:stop], max_length=batch_max_length, padding=padding,
                         truncation=truncation, return_tensors=return_tensors).to(device)
      outputs = model(**inputs)
      probs = torch.nn.functional.softmax(outputs.logits, dim=1).to('cpu')[:, order]

      predicted_probs.append(probs.numpy())
      predicted_labels.append(torch.argmax(probs, dim=1).numpy())
  return np.vstack(predicted_probs), np.hstack(predicted_labels)

In [8]:
def merge(data, probs, labels, id_of_model):
  """Attach class probabilities and predicted labels to ``data`` in place.

  Args:
    data: DataFrame that receives the new columns (it is modified and returned).
    probs: Values assigned to the three probability columns, in the order
      negative, neutral, positive.
    labels: Values assigned to the predicted-label column.
    id_of_model: Suffix appended to every new column name.

  Returns:
    The same ``data`` object with the four columns set.
  """
  probability_columns = [
    f'negative_{id_of_model}',
    f'neutral_{id_of_model}',
    f'positive_{id_of_model}',
  ]
  label_column = f'pred_label_{id_of_model}'

  data[probability_columns] = probs
  data[label_column] = labels
  return data

In [9]:
def predictions_to_df(probs, labels, id_of_model, text_type, data, df=None):
  """Spread predictions for the non-null rows of ``data`` over a full-length frame.

  Args:
    probs: One row of (negative, neutral, positive) probabilities per non-null text,
      in the row order of ``data``.
    labels: One predicted label per non-null text, in the same order.
    id_of_model: Model tag used inside the generated column names.
    text_type: Name of the text column in ``data``; also used inside the column names.
    data: DataFrame the predictions were computed from.
    df: Optional existing result frame to extend in place. It must contain the index
      labels of ``data``. When ``None`` a new frame is created.

  Returns:
    Frame with the columns ``negative_*``, ``neutral_*``, ``positive_*`` and
    ``pred_label_*`` filled for the rows whose text is not null; other rows stay NaN.
    A newly created frame carries the index of ``data``.
  """
  suffix = f'{id_of_model}_{text_type}'
  prob_columns = [f'negative_{suffix}', f'neutral_{suffix}', f'positive_{suffix}']
  label_column = f'pred_label_{suffix}'

  created_here = df is None
  if created_here:
    # Temporary placeholder column so the frame has one row per row of `data`.
    # Building it on data.index (not a fresh 0..n-1 range) keeps the label-based
    # assignments below valid when `data` does not have a default index.
    df = pd.DataFrame({'index': np.arange(len(data))}, index=data.index)

  # Index labels of the rows that were actually scored (null texts are skipped).
  not_null_indexes = data.index[data[text_type].notnull().to_numpy()]
  df.loc[not_null_indexes, prob_columns] = probs
  df.loc[not_null_indexes, label_column] = labels

  if created_here:
    df = df.drop('index', axis=1)
  return df

In [10]:
def save_data(data, path, filename):
    """Write ``data`` to ``path/filename`` in the format given by the file extension.

    Args:
        data: DataFrame to persist.
        path: Directory the file is written to.
        filename: Target file name; the text after its last ``.`` selects the
            format (``csv``, ``json`` or ``xlsx``, matched case-insensitively).

    Raises:
        ValueError: If the extension is not one of the supported formats, so a
            failed save is reported instead of silently writing nothing.
    """
    # Same rule as before (text after the last dot), evaluated once.
    extension = filename.rsplit('.', 1)[-1].lower()
    target = os.path.join(path, filename)

    if extension == 'csv':
        data.to_csv(target, index=False)
    elif extension == 'json':
        data.to_json(target)
    elif extension == 'xlsx':
        data.to_excel(target, index=False)
    else:
        raise ValueError(
            f"Unsupported file extension for {filename!r}; expected one of: csv, json, xlsx"
        )

In [19]:
# Load the two input tables; later cells read their 'text' and 'id' columns.
# NOTE(review): the absolute paths tie the notebook to this machine -- consider a configurable data directory.
data1 = pd.read_csv("/home/ubuntu/summary.csv")
data2 = pd.read_csv("/home/ubuntu/title.csv")

In [22]:
def reform(x):
    """Turn the string form of a list of words into one space-separated string.

    Args:
        x: Text containing a Python literal, e.g. ``"['a', 'b']"``.

    Returns:
        The evaluated items joined with single spaces, or ``None`` when ``x``
        cannot be parsed as a literal or its items cannot be joined as strings.
    """
    try:
        return ' '.join(ast.literal_eval(x))
    # Only the failures that parsing and joining can produce are treated as
    # "no text"; KeyboardInterrupt and SystemExit are no longer swallowed.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None

In [23]:
# Keep the unparsed values in 'text_raw' and always parse from there, so the cell
# can be re-run safely: feeding already-joined strings back into reform() would
# fail to parse and replace the texts with None.
if 'text_raw' not in data1.columns:
    data1['text_raw'] = data1['text']
data1['text'] = [reform(raw) for raw in tqdm(data1['text_raw'], total=len(data1))]

  0%|          | 0/3737450 [00:00<?, ?it/s]

In [None]:
# Keep the unparsed values in 'text_raw' and always parse from there, so the cell
# can be re-run safely: feeding already-joined strings back into reform() would
# fail to parse and replace the texts with None.
if 'text_raw' not in data2.columns:
    data2['text_raw'] = data2['text']
data2['text'] = [reform(raw) for raw in tqdm(data2['text_raw'], total=len(data2))]

# sismetanin/sbert-ru-sentiment-krnd

In [None]:
# Load the classifier onto the GPU and its tokenizer with use_fast=False.
model_checkpoint = "sismetanin/sbert-ru-sentiment-krnd"
model = load_model(model_checkpoint, 'cuda')
tokenizer = load_tokenizer(model_checkpoint, use_fast=False)

## Summary

In [33]:
# Score the summaries in batches of 128. In 'max_per_batch' mode predict() uses the
# largest whitespace word count of each batch as max_length, capped here at 128.
probs1, labels1 = predict(data1, 'text', model, tokenizer, batch_size=128, max_length='max_per_batch',
                          padding=True, truncation=True, return_tensors='pt', device='cuda', order=[0, 1, 2],
                          exception_max_length=128)

  0%|          | 0/29199 [00:00<?, ?it/s]

In [None]:
# Build a frame with one row per row of data1 (rows with null text are not filled)
# and attach the record ids. With id_of_model='' the columns are named like 'negative__text'.
df1 = predictions_to_df(probs1, labels1, id_of_model='', text_type='text', data=data1, df=None)
df1['id'] = data1['id']

In [None]:
# Write the summary predictions as CSV (save_data writes CSV files without the index).
save_data(df1, path='/home/ubuntu/', filename='summary_pred.csv')

## Title

In [None]:
# Score the titles in batches of 100. In 'max_per_batch' mode predict() uses the
# largest whitespace word count of each batch as max_length, capped here at 512.
probs2, labels2 = predict(data2, 'text', model, tokenizer, batch_size=100, max_length='max_per_batch',
                          padding=True, truncation=True, return_tensors='pt', device='cuda', order=[0, 1, 2],
                          exception_max_length=512)

In [None]:
# Build a frame with one row per row of data2 (rows with null text are not filled)
# and attach the record ids. With id_of_model='' the columns are named like 'negative__text'.
df2 = predictions_to_df(probs2, labels2, id_of_model='', text_type='text', data=data2, df=None)
df2['id'] = data2['id']

In [None]:
# Write the title predictions as CSV (save_data writes CSV files without the index).
save_data(df2, path='/home/ubuntu/', filename='title_pred.csv')