In [1]:
import pandas as pd
from transformers import BertTokenizerFast, BertForSequenceClassification, get_linear_schedule_with_warmup, AdamW
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm 
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import os
import numpy as np
import random
from sklearn.metrics import accuracy_score
import re
import string

In [2]:
# Load the labelled examples used for fine-tuning; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../data/rescforbert.csv')

In [3]:
# Remove the 'Unnamed: 0' column (presumably an index written out by an earlier
# to_csv -- confirm). Rebinding instead of inplace=True together with
# errors='ignore' keeps the cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Cast the label column to integers in place of a Python-level loop.
# astype keeps the existing index, whereas wrapping a list in pd.Series creates
# a fresh 0..n-1 index and only lines up when `df` has the default index.
# 'int64' is spelled out so the dtype does not depend on the platform.
df['recommend'] = df['recommend'].astype('int64')

In [5]:
def seed_everything(seed = 1234):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and every CUDA device), and configures cuDNN for repeatable kernels.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point; the running kernel
    # fixed its hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe without a GPU: the call is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run, so switch it off
    # explicitly in case something enabled it earlier in the session.
    torch.backends.cudnn.benchmark = False

In [6]:
# Fix all RNG seeds (default seed 1234) before the model, the data split and
# the samplers are created.
seed_everything()

In [None]:
# One checkpoint name feeds both the tokenizer and the classifier.
raw_model = 'DeepPavlov/rubert-base-cased'

# NOTE(review): do_lower_case=True is requested although the checkpoint name
# says "cased" -- confirm that lower-casing the input is intended.
tokenizer = BertTokenizerFast.from_pretrained(raw_model, do_lower_case=True)

# Two-class sequence classifier; every forward pass also returns the hidden
# states, while attention weights are not returned.
model = BertForSequenceClassification.from_pretrained(
    raw_model,
    output_hidden_states=True,
    output_attentions=False,
    num_labels=2,
)

In [8]:
def convert_to_dataset_torch(data: pd.DataFrame, labels: pd.Series) -> TensorDataset:
    """Tokenize (keywords, description) pairs and bundle them with their labels.

    Uses the module-level ``tokenizer``. Each pair is encoded as one sequence
    that is truncated or padded to exactly 512 tokens.

    Args:
        data: frame with a "keywords" and a "description" text column.
        labels: one class label per row of ``data``, in the same row order.

    Returns:
        TensorDataset yielding ``(input_ids, attention_mask, token_type_ids,
        label)`` tuples, every tensor of dtype ``torch.long``.
    """
    # A single batched call replaces the per-row loop; padding='max_length' is
    # the current spelling of the deprecated pad_to_max_length=True.
    encoded = tokenizer(
        data["keywords"].tolist(),
        data["description"].tolist(),
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='pt',
    )

    # Tensor.to() returns a new tensor rather than converting in place, so the
    # converted tensors have to be the ones handed to the dataset.
    input_ids = encoded['input_ids'].to(dtype=torch.long)
    attention_masks = encoded['attention_mask'].to(dtype=torch.long)
    token_type_ids = encoded['token_type_ids'].to(dtype=torch.long)
    label_tensor = torch.tensor(labels.values, dtype=torch.long)

    return TensorDataset(input_ids, attention_masks, token_type_ids, label_tensor)

In [9]:
# 70/30 split, stratified on the label so both parts keep the class balance;
# random_state pins the shuffle.
X_train, X_validation, y_train, y_validation = train_test_split(
    df[["keywords", "description"]],
    df["recommend"],
    stratify=df["recommend"],
    test_size=0.3,
    random_state=21,
)

In [None]:
# Tokenize both splits once up front so the training loop only moves ready-made
# tensors to the device.
train_data = convert_to_dataset_torch(X_train, y_train)
validation_data = convert_to_dataset_torch(X_validation, y_validation)

In [11]:
# Number of examples per batch for both the training and validation loaders.
batch_size = 4

In [12]:
# Training batches are drawn in random order; a trailing incomplete batch is
# discarded (drop_last=True).
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=RandomSampler(train_data),
    num_workers=0,
    drop_last=True,
)

# Validation batches keep the dataset order. drop_last=True applies here too,
# so up to batch_size - 1 trailing validation rows are never scored.
validation_dataloader = DataLoader(
    validation_data,
    batch_size=batch_size,
    sampler=SequentialSampler(validation_data),
    num_workers=0,
    drop_last=True,
)

In [None]:
# Use PyTorch's own AdamW: the implementation shipped with transformers is
# deprecated in favour of it. weight_decay=0.0 keeps the default of the
# transformers class (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr = 2e-5, eps = 1e-8, weight_decay = 0.0)

In [14]:
epochs = 5
# The scheduler is stepped once per training batch, so the schedule has to
# cover batches-per-epoch * epochs steps.
total_steps = len(train_dataloader) * epochs

In [15]:
# Learning-rate schedule over `total_steps` optimizer steps with no warm-up
# phase (num_warmup_steps = 0).
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [16]:
# Prefer the first GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at model.to(DEVICE).
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# Move the model's parameters to the target device; the batches are moved to
# the same device inside the training / evaluation loops.
model.to(DEVICE)

In [18]:
def fit_batch(dataloader, model, optimizer, epoch):
    """Run one training epoch and return the summed batch loss.

    Relies on the module-level ``DEVICE`` and ``scheduler``. The function does
    not switch the model to training mode itself; ``train`` does that before
    calling it.

    Args:
        dataloader: yields ``(input_ids, attention_masks, token_type_ids,
            labels)`` batches.
        model: classifier whose output exposes ``.loss`` when labels are given.
        optimizer: optimizer updating ``model``'s parameters.
        epoch: epoch number, used only in the progress-bar caption.

    Returns:
        float: sum of the per-batch losses over the whole epoch.
    """
    total_train_loss = 0.0

    for batch in tqdm(dataloader, desc=f"Training epoch:{epoch}", unit="batch"):
        input_ids, attention_masks, token_type_ids, labels = (
            tensor.to(DEVICE) for tensor in batch
        )

        optimizer.zero_grad()
        loss = model(input_ids=input_ids,
                     token_type_ids=token_type_ids,
                     attention_mask=attention_masks,
                     labels=labels).loss
        loss.backward()

        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # .item() extracts a plain Python number. Summing the loss tensor
        # itself would keep every batch's autograd graph referenced until the
        # epoch ends and steadily grow memory use.
        total_train_loss += loss.item()

    return total_train_loss

In [28]:
def eval_batch(dataloader, model, metric=accuracy_score):
    """Evaluate ``model`` on every batch of ``dataloader`` without gradients.

    Relies on the module-level ``DEVICE``. The function does not switch the
    model to evaluation mode itself; ``train`` does that before calling it.

    Args:
        dataloader: yields ``(input_ids, attention_masks, token_type_ids,
            labels)`` batches.
        model: classifier whose output exposes ``.loss`` and ``.logits``.
        metric: callable ``metric(y_true, y_pred)`` evaluated once per batch.

    Returns:
        tuple: ``(total_eval_accuracy, total_eval_loss, predictions,
        predicted_labels)`` where the first two are sums over batches (divide
        by the number of batches for a mean), ``predictions`` is the list of
        raw logits per example and ``predicted_labels`` the arg-max class per
        example.
    """
    total_eval_accuracy = 0
    total_eval_loss = 0.0
    predictions, predicted_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
        input_ids, attention_masks, token_type_ids, labels = (
            tensor.to(DEVICE) for tensor in batch
        )

        with torch.no_grad():
            output = model(input_ids,
                           token_type_ids=token_type_ids,
                           attention_mask=attention_masks,
                           labels=labels)

        # Accumulate a Python float rather than a device tensor.
        total_eval_loss += output.loss.item()

        # Bring the logits to the host once and reuse them for both outputs.
        logits = output.logits.detach().cpu()
        y_pred = logits.argmax(dim=1).numpy()
        total_eval_accuracy += metric(labels.cpu().numpy(), y_pred)

        predictions.extend(logits.tolist())
        predicted_labels.extend(y_pred.tolist())

    return total_eval_accuracy, total_eval_loss, predictions, predicted_labels

In [20]:
def train(train_dataloader, validation_dataloader, model, optimizer, epochs):
    """Fine-tune ``model`` for ``epochs`` epochs, validating after each one.

    After every epoch the whole model object is pickled to ``modelnew.pth``
    in the working directory, overwriting the file from the previous epoch.

    Args:
        train_dataloader: batches passed to ``fit_batch``.
        validation_dataloader: batches passed to ``eval_batch``.
        model: model to train; switched between train and eval mode here.
        optimizer: optimizer passed through to ``fit_batch``.
        epochs: number of passes over ``train_dataloader``.

    Returns:
        list of dicts, one per epoch, with the keys ``'epoch'``,
        ``'Training Loss'``, ``'Valid. Loss'`` and ``'Valid. Accur.'``; the
        losses are plain Python floats.
    """
    checkpoint_path = 'modelnew.pth'
    training_stats = []

    for epoch in range(epochs):
        model.train()
        total_train_loss = fit_batch(train_dataloader, model, optimizer, epoch)

        # float() accepts both a 0-dim tensor and a Python number, so the
        # statistics are plain numbers whatever fit_batch / eval_batch return.
        avg_train_loss = float(total_train_loss) / len(train_dataloader)
        print(f"  Train Loss: {avg_train_loss}")

        model.eval()
        total_eval_accuracy, total_eval_loss, _, _ = eval_batch(validation_dataloader, model)

        # Saved every epoch regardless of the validation result.
        torch.save(model, checkpoint_path)

        # eval_batch returns sums over batches; turn them into per-batch means.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print(f"  Accuracy: {avg_val_accuracy}")

        avg_val_loss = float(total_eval_loss) / len(validation_dataloader)
        print(f"  Validation Loss: {avg_val_loss}")

        training_stats.append(
            {
                'epoch': epoch,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
            }
        )

    print("")
    print("Training complete!")
    return training_stats

In [None]:
# Run the full fine-tuning loop; returns one statistics record per epoch.
training_stats = train(train_dataloader, validation_dataloader, model, optimizer, epochs)

In [None]:
# Build the inference frame: every description is paired with the same
# free-text query. This rebinds `df`, which until here held the training data.
# NOTE(review): directory and file spelling differ from the CSV loaded at the
# top of the notebook ('recsforbert' here vs 'rescforbert') -- confirm the path.
df = pd.read_csv('../input/forbert/recsforbert.csv')

# Held in `query`, not `str`: rebinding `str` shadows the builtin for every
# cell executed afterwards (e.g. str(seed) in seed_everything would break).
query = 'фильм для двоих'
# Lower-case and replace punctuation with spaces once, on the constant itself,
# instead of once per row.
query = re.sub('[%s]' % re.escape(string.punctuation), ' ', query.lower())

df1 = pd.DataFrame({'Description': df['description'], 'title': df['title']})
# Scalar assignment broadcasts the query to every row.
df1['Query'] = query
# Placeholder column; overwritten with the model's scores after inference.
df1['recommend'] = np.nan

In [27]:
def convert_to_dataset(data: pd.DataFrame) -> TensorDataset:
    """Tokenize (Query, Description) pairs into an unlabeled dataset.

    Uses the module-level ``tokenizer``. Each pair is encoded as one sequence
    that is truncated or padded to exactly 512 tokens.

    Args:
        data: frame with a "Query" and a "Description" text column.

    Returns:
        TensorDataset yielding ``(input_ids, attention_mask, token_type_ids)``
        tuples, every tensor of dtype ``torch.long``.
    """
    # A single batched call replaces the per-row loop; padding='max_length' is
    # the current spelling of the deprecated pad_to_max_length=True.
    encoded = tokenizer(
        data["Query"].tolist(),
        data["Description"].tolist(),
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='pt',
    )

    # Tensor.to() returns a new tensor rather than converting in place, so the
    # converted tensors have to be the ones handed to the dataset.
    input_ids = encoded['input_ids'].to(dtype=torch.long)
    attention_masks = encoded['attention_mask'].to(dtype=torch.long)
    token_type_ids = encoded['token_type_ids'].to(dtype=torch.long)

    return TensorDataset(input_ids, attention_masks, token_type_ids)



In [28]:
# Only the two text columns are fed to the tokenizer.
X_test=df1[['Query','Description']]

In [None]:
# Tokenize every (Query, Description) pair into an unlabeled TensorDataset.
test = convert_to_dataset(X_test)

In [None]:
# One example per batch, in dataset order. The later score extraction indexes
# each batch result with [0][1], i.e. it relies on batch_size=1.
test_dataloader = DataLoader(test, sampler=SequentialSampler(test), batch_size=1)
# Switch to evaluation mode before scoring.
model.eval()

In [36]:
def infer(dataloader, model):
    """Score every batch and return the per-batch class probabilities.

    Relies on the module-level ``DEVICE``. The function does not switch the
    model to evaluation mode itself.

    Args:
        dataloader: yields ``(input_ids, attention_masks, token_type_ids)``
            batches.
        model: classifier whose output exposes ``.logits``.

    Returns:
        list with one tensor per batch holding the softmax of the logits over
        the class dimension; the tensors stay on ``DEVICE``.
    """
    embs = []

    for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
        input_ids, attention_masks, token_type_ids = (
            tensor.to(DEVICE, dtype=torch.long) for tensor in batch
        )

        with torch.no_grad():
            logits = model(input_ids,
                           token_type_ids=token_type_ids,
                           attention_mask=attention_masks).logits
            # dim=1 is what Softmax() picked implicitly for 2-D logits; naming
            # it avoids the implicit-dimension deprecation warning.
            embs.append(torch.softmax(logits, dim=1))

    return embs

In [None]:
# One probability tensor per (Query, Description) pair.
emn = infer(test_dataloader, model)

In [42]:
# Convert each probability tensor to a host-side NumPy array. The name `emn` is
# rebound, so re-running this cell alone fails (NumPy arrays have no .detach()).
emn = [i.detach().cpu().numpy() for i in emn]

In [66]:
# Keep, for every batch, the class-1 probability of its first (only) example.
em = [batch_probs[0][1] for batch_probs in emn]

In [68]:
# Attach the scores to the frame. pd.Series(em) carries a 0..n-1 index and the
# assignment aligns on index labels, so this assumes df1 has the default index.
df1['recommend'] = pd.Series(em)

In [72]:
# Display the rows ranked from highest to lowest class-1 probability.
df1.sort_values('recommend', ascending=False)

Unnamed: 0,Description,title,Query,recommend
1587,Судьбоносная встреча может случиться где угодн...,Билет на двоих,фильм для двоих,0.999944
13147,У взрослых тоже есть свои любимые игры на двои...,Невинные желания 2,фильм для двоих,0.999935
6472,"Тина Фэй («Дрянные девчонки», «Мегамозг», «Сту...",Экзамен для двоих,фильм для двоих,0.999922
8350,Горячие развлечения на троих.,Подруга моего лучшего друга,фильм для двоих,0.999920
9804,"Один — высокопоставленный чиновник, другой — о...",Обет на крови,фильм для двоих,0.999906
...,...,...,...,...
9861,В 16 веке население Маркизских островов достиг...,Тысячелетняя цивилизация. Французская Полинезия,фильм для двоих,0.000024
2727,Выращивание кофе является основной частью экон...,Выращивание кофе. Гондурас,фильм для двоих,0.000024
642,"Плоды какао растут на какао-дереве. Говорят, ч...",Какао. Гондурас,фильм для двоих,0.000023
4435,Какао происходит от какао-дерева. В Гондурасе ...,[4К] Какао. Гондурас,фильм для двоих,0.000023
