In [None]:
!pip install --upgrade scikit-learn

In [None]:
!pip install transformers
!pip install tensorboardx
!pip install simpletransformers

In [None]:
!pip install nltk



In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

import time
import datetime

In [None]:
import torch

from transformers import BertTokenizer
from transformers import BertTokenizer, BertModel
from transformers import BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup

import random

In [None]:
X_train = pd.read_csv('/content/ru/train.csv', sep=';')
X_test = pd.read_csv('/content/ru/test.csv', sep=';')
y_train = pd.read_csv('/content/ru/target.csv', sep=';')

for i in y_train.columns[1:]:
  y_train[str(i)] /= 10

y_train

X_train, X_val = train_test_split(X_train, test_size=0.2, shuffle=False)
y_train = y_train.set_index('id') 

In [None]:
print('Loading BERT tokenizer...')

tokenizer = BertTokenizer.from_pretrained('sberbank-ai/sbert_large_nlu_ru', do_lower_case=False)

Loading BERT tokenizer...


Downloading:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/323 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/655 [00:00<?, ?B/s]

In [None]:
max_len = 0

for sent in X_train['text']:

    input_ids = tokenizer.encode(sent, add_special_tokens=False)

    if max_len < len(input_ids):

      print('New max_len:', len(input_ids))

    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

In [None]:
def tokenize_sentence(text):

  encoded_dict = tokenizer.encode_plus(
                        text,                     
                        add_special_tokens = True, 
                        max_length = 64,          
                        truncation = True,
                        pad_to_max_length = True,
                        return_tensors = 'pt',     
                   )
  return encoded_dict['input_ids']

def tokenize_sentences(sentences_in, labels_in):


  input_ids = []
  labels = []

  for i in range(0, len(sentences_in)):

    input_id = tokenize_sentence(sentences_in[i])
    input_ids.append(input_id)

  input_ids = torch.cat(input_ids, dim=0)
  labels = torch.tensor(labels_in)

  return input_ids, labels

In [None]:
X_train, X_val = X_train.loc[X_train['text'] != '-'], X_val.loc[X_val['text'] != '-']

In [None]:
labels_train = list(map(lambda x: y_train.loc[x].tolist(), X_train['id']))
labels_val = list(map(lambda x: y_train.loc[x].tolist(), X_val['id']))

In [None]:
X_train['labels'] = labels_train
X_val['labels'] = labels_val

In [None]:
input_ids_train, labels_train = tokenize_sentences(X_train['text'].tolist(), labels_train)
input_ids_val, labels_val = tokenize_sentences(X_val['text'].tolist(), labels_val)



In [None]:
model = BertForSequenceClassification.from_pretrained(
    
    "sberbank-ai/sbert_large_nlu_ru", 
    num_labels = 6, 
    output_attentions = False, 
    output_hidden_states = False, 
)

if torch.cuda.is_available():
  
  model.cuda()

2021-07-20 20:07:56.428041: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


Downloading:   0%|          | 0.00/655 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/sbert_large_nlu_ru and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Data preprocessing


In [None]:
train_dataset = TensorDataset(input_ids_train, labels_train)
val_dataset = TensorDataset(input_ids_val, labels_val)

batch_size = 16

train_dataloader = DataLoader(
            train_dataset, 
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

validation_dataloader = DataLoader(
            val_dataset, .
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size 
        )


In [None]:
optimizer = AdamW(model.parameters(),
                  lr = 0.000001, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [None]:
epochs = 1
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
def format_time(elapsed):
 
    elapsed_rounded = int(round((elapsed)))
    
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


In [None]:
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)

torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    t0 = time.time()

    total_train_loss = 0

    model.train()


    for step, batch in enumerate(train_dataloader):
        
        if step % 40 == 0 and not step == 0:
            
            elapsed = format_time(time.time() - t0)
            
          
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

   
        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids, 
                        labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits

       
        total_train_loss += loss.item()
       
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    
.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0


    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)

        with torch.no_grad():        

            outputs = model(b_input_ids, 
                            labels=b_labels)
            (loss, logits) = (outputs.loss, outputs.logits)
            

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of    272.    Elapsed: 0:00:09.
  Batch    80  of    272.    Elapsed: 0:00:19.
  Batch   120  of    272.    Elapsed: 0:00:28.
  Batch   160  of    272.    Elapsed: 0:00:37.
  Batch   200  of    272.    Elapsed: 0:00:46.
  Batch   240  of    272.    Elapsed: 0:00:56.

  Average training loss: 0.67
  Training epcoh took: 0:01:03

Running Validation...
  Accuracy: 0.00
  Validation Loss: 0.70
  Validation took: 0:00:04

Training complete!
Total training took 0:01:07 (h:mm:ss)


In [None]:
def tokenize_sentences_out(sentences_in):

  input_ids = []
  for i in range(0, len(sentences_in)):

    input_id = tokenize_sentence(sentences_in[i])
    input_ids.append(input_id)
    
  input_ids = torch.cat(input_ids, dim=0)

  return input_ids

In [None]:
pred_input_ids = tokenize_sentences_out(X_val['text'].tolist())

In [None]:
def predict(input_ids_in):

    model.eval()
    dataset = TensorDataset(input_ids_in)

    dataloader = DataLoader(
            dataset,  # The training samples.
            batch_size = batch_size # Trains with this batch size.
        )
    predictions = []

    for batch in dataloader:

        input_ids = batch[0].to(device)
        with torch.no_grad():
  
            outputs = model(input_ids)
        
        logits = outputs[0]

     
        logits = logits.detach().cpu().numpy()
        predictions.extend(logits)
    return predictions

def predict_by_dataloader(dataloader):


    model.eval()

    predictions = []
    
    for batch in dataloader:

        input_ids = batch[0].to(device)
        
        with torch.no_grad():
        
            outputs = model(input_ids)
        
        logits = outputs[0]

        logits = logits.detach().cpu().numpy()
        predictions.extend(logits)
    return predictions

In [None]:
torch.cuda.empty_cache()

In [None]:
predictions_val = predict_by_dataloader(validation_dataloader)

In [None]:
for target_num in range(6):

  print(f'MAE on {target_num + 1}:', mae(list(map(lambda x: x[target_num], X_val['labels'].tolist())), list(map(lambda x: x[target_num], predictions_val))))
  print(f'MAPE on {target_num + 1}:', mape(list(map(lambda x: x[target_num], X_val['labels'].tolist())), list(map(lambda x: x[target_num], predictions_val))))
  
print('MAE:', mae(X_val['labels'].tolist(), predictions_val))
print('MAPE:', mape(X_val['labels'].tolist(), predictions_val))

In [None]:
predictions_train = predict_by_dataloader(train_dataloader)
for target_num in range(6):

  print(f'MAE on {target_num + 1}:', mae(list(map(lambda x: x[target_num], X_train['labels'].tolist())), list(map(lambda x: x[target_num], predictions_train))))
  print(f'MAPE on {target_num + 1}:', mape(list(map(lambda x: x[target_num], X_train['labels'].tolist())), list(map(lambda x: x[target_num], predictions_train))))
  
print('MAE:', mae(X_train['labels'].tolist(), predictions_train))
print('MAPE:', mape(X_train['labels'].tolist(), predictions_train))

MAE on 1: 0.3154591343840087
MAPE on 1: 0.5309141178298376
MAE on 2: 0.10231118951958003
MAPE on 2: 0.1559506301120517
MAE on 3: 0.13832044911930433
MAPE on 3: 0.2114317431139802
MAE on 4: 0.29243016461721705
MAPE on 4: 0.5065477662931697
MAE on 5: 0.11744360489795629
MAPE on 5: 0.17715877619376064
MAE on 6: 0.34679943549886194
MAPE on 6: 0.6221650250806797
MAE: 0.21879399633948438
MAPE: 0.3673613431039133


In [None]:
predictions_train

In [None]:
X_train.id.unique()

In [None]:
 train.values.tolist()

# **Ranking**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
from sklearn import svm
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

file = '/content/data_ranking.json'
with open(file) as train_file:
    data = json.load(train_file)


train = pd.DataFrame.from_dict(data, orient='index', columns=[ "id", "target"])
data = pd.DataFrame.from_dict(data, orient='index', columns=[ "id", "data", "target"])

train["id"] = train.index
data["id"] = data.index

train.index = [i for i in range(151)]

train_df = pd.DataFrame()

train_df["id"] = X_train.id
train_df["targets"] = np.nan


tak = train.values.tolist()
blet = train_df.values.tolist()
for i in blet:

  for j in tak:

    if i[0] == j[0]:
      i[1] = j[1]

      
train_df["targets"] = [i[1] for i in blet]

train_df["labels"] = [i for i in np.vstack(data.data)]

In [None]:
train_df["MAE"] = [i for i in list_of_mae[:4346]]
train_df_min_MAE = train_df.loc[train_df.groupby(["id"])["MAE"].idxmin()]

In [None]:
X = np.array([i for i in train_df["labels"]]) 
y = np.array([i for i in train_df["targets"]])

my_alpha = 0.1
my_l1ratio = 0.5

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42, shuffle=False)

reg =  LinearRegression(Ridge(alpha=0.005))
reg.fit(X_train, y_train)




LinearRegression(fit_intercept=Ridge(alpha=0.005))

In [None]:
pred = reg.predict(X_test)
for i in pred:
  for j in y_test:
    print("MAE", mae(j, i))
    print("MAPE", mape(j, i))

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import max_error

pred = reg.predict(X_test)
r2_score(y_test, pred,  multioutput='variance_weighted')

-0.14890279472154497

In [None]:
pred

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate


scoring = {'mae': make_scorer(mean_absolute_error), 
               'mape': make_scorer(mean_absolute_percentage_error)}

scores = cross_validate(reg,
                        X_train,
                        y_train,
                        cv=6,
                        scoring=scoring)
scores

In [None]:
import itertools
from pprint import pprint

def ranking_pred(ID_name):


  list_of_mae = []

  list_pred = list([i for i in X_val[X_val.id == str(ID_name)].predictions])
  combination_list = list(itertools.combinations(itertools.chain(*list_pred), 6))

  for i in combination_list:

    mae_value = mae([Xval_min_MAE.id == str(ID_name)].labels, i)

    if mae_value < Xval_min_MAE[X_val.id == str(ID_name)].MAE:

      dict_mae = {mae_value : i}
      list_of_mae.append(dict_mae)

  return list_of_mae 