In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForSequenceClassification
import re
import pickle
import string
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('DeepPavlov/rubert-base-cased', do_lower_case=True)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

Downloading:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

In [4]:
df=pd.read_excel('/content/drive/MyDrive/filmsbooksclean2.xlsx')

In [12]:
DEVICE='cpu'

In [6]:
model = torch.load('/content/drive/MyDrive/modelwithoutstrangehidden.pth',map_location='cpu')

In [7]:
from tqdm import tqdm
from torch.utils.data import TensorDataset

In [8]:
X_test=df[['Descr1','Descr2']]
y_test=df['is_duplicate']

In [9]:
def convert_to_dataset_torch(data: pd.DataFrame, labels: pd.Series) -> TensorDataset:
    input_ids = []
    attention_masks = []
    token_type_ids = []
    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        encoded_dict = tokenizer.encode_plus(row["Descr1"], row["Descr2"], max_length=512,
                                             pad_to_max_length=True,
                                             return_attention_mask=True, return_tensors='pt', truncation=True)
        # Add the encoded sentences to the list.
        input_ids.append(encoded_dict['input_ids'])
        token_type_ids.append(encoded_dict["token_type_ids"])
        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    token_type_ids = torch.cat(token_type_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels.values)
    input_ids.to(DEVICE, dtype=torch.long)
    token_type_ids.to(DEVICE, dtype=torch.long)
    attention_masks.to(DEVICE, dtype=torch.long)
    labels.to(DEVICE, dtype=torch.long)
    return TensorDataset(input_ids, attention_masks, token_type_ids, labels)


In [10]:
def eval_batch(dataloader, model, metric=accuracy_score):
    total_eval_accuracy = 0
    total_eval_loss = 0
    embs=[]

    for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
        # Unpack batch from dataloader.
        input_ids, attention_masks, token_type_ids, labels = batch

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        input_ids = input_ids.to(DEVICE, dtype=torch.long)
        token_type_ids = token_type_ids.to(DEVICE, dtype=torch.long)
        attention_masks = attention_masks.to(DEVICE, dtype=torch.long)
        labels = labels.to(DEVICE, dtype=torch.long)
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            m = (model(input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=attention_masks,
                       labels=labels)).hidden_states[12][:,0,:]

            embs.append(m.detach().cpu().numpy())



    return embs

In [13]:
test = convert_to_dataset_torch(X_test, y_test)

100%|██████████| 30/30 [00:00<00:00, 631.78it/s]


In [14]:
test_dataloader = DataLoader(test,  sampler=SequentialSampler(test), batch_size=1)

In [15]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [16]:
embeddings = eval_batch(test_dataloader, model)
df1=pd.DataFrame([])

Evaluating: 100%|██████████| 30/30 [00:51<00:00,  1.71s/batch]


In [17]:
for i,k in enumerate(embeddings):
    #df1=df1.append(pd.concat([pd.DataFrame(k[0]).T,pd.Series(df['is_duplicate'].iloc[i])],axis=1))
     df1=df1.append(pd.DataFrame(k[0]).T)

In [18]:
df1.reset_index(drop=True,inplace=True)

In [19]:
with open('/content/drive/MyDrive/logreg512.pkl', 'rb') as file:
 lr = pickle.load(file)



In [20]:
df1['duplicate']=pd.Series(lr.predict_proba(df1)[:,1])
df1['is_duplicate']=df['is_duplicate']
df1['Descr1']=df['Descr2']
df1['Descr2']=df['Descr1']
df2=df1[['Descr1','Descr2','duplicate','is_duplicate']]

In [21]:
df2.sort_values('duplicate',ascending=False,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
df2


Unnamed: 0,Descr1,Descr2,duplicate,is_duplicate
6,Волшебник Гэндальф обманом вовлекает Бильбо в ...,Хоббит Бильбо Бэггинс пускается в грандиозный ...,0.999296,1
8,Гарри Поттер ни разу даже не слышал о «Хогварц...,Жизнь десятилетнего Гарри Поттера нельзя назва...,0.999047,1
9,Автор этого повествования двадцатипятилетний А...,За убийство которого он не совершал взломщик с...,0.998903,1
4,Обгрызенная трость исчезнувший ботинок тайна ф...,Труп Чарльза Баскервиля обнаруживают неподалек...,0.9989,1
0,Энди Дюфренс был успешным банкиром пока в его ...,Бухгалтер Энди Дюфрейн обвинён в убийстве собс...,0.998616,1
10,Советский разведчик Максим Исаев под видом шта...,Германия канун окончания Второй мировой войны ...,0.99856,1
1,В центре сюжета находится герой страдающий бес...,Сотрудник страховой компании страдает хроничес...,0.998372,1
7,Тодд Андерсон и его друзья привыкшие нехотя сл...,Джон Китинг — новый преподаватель английской с...,0.998344,1
5,ШарикШариков профессор Преображенский доктор Б...,Москва год В результате одного из своих сложн...,0.99832,1
11,Доктор Ганнибал Лектер блестящий психиатр но ...,Заключенный доктор психиатрии Ганнибал Лектер ...,0.998264,1
