# Импорты

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from transformers import BertModel, BertTokenizer
from torch import nn
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score
from catboost import CatBoostClassifier, Pool, cv, sum_models
from tqdm import tqdm

%matplotlib inline

# Читаем данные

In [10]:
# Product details table; read from a path relative to the notebook's working directory.
df_det = pd.read_csv('data/details.csv')

In [11]:
# Customer reviews table; read from a path relative to the notebook's working directory.
df_rev = pd.read_csv('data/reviews.csv')

# Отбираем признаки

In [12]:
# Product attributes from the details table that are treated as categorical.
cat_col_det = [
    'brandName', 'modelName', 'Серия', 'SIM карта', 'Поддержка стандартов', 'Порт USB',
    'Материал корпуса', 'Поддержка беспроводной зарядки', 'Цвет', 'Операционная система',
    'Разъем для наушников', 'Экран', 'Процессор', 'Оперативная память (RAM)',
    'Основная камера МПикс', 'Встроенная память (ROM)',
]

# Product attributes from the details table that are treated as numeric.
num_col_det = ['basePrice', 'salePrice']

# Join key first, then every modelling column (categorical, then numeric).
col_for_mod_det = ['id_prod', *cat_col_det, *num_col_det]

# Remaining details columns that are neither categorical nor numeric features.
inf_col_det = [f for f in df_det.columns if not (f in cat_col_det or f in num_col_det)]

In [13]:
# Free-text review columns that are later turned into embeddings.
emb_col_rev = ['benefits', 'drawbacks', 'text']
# Categorical review columns.
cat_col_rev = ['name']
# Review columns carried into the modelling frame: features, target, join key and date.
col_for_mod_rev = [*emb_col_rev, *cat_col_rev, 'score', 'product_id', 'date']
# Remaining review columns that are neither categorical nor embedding inputs.
inf_col_rev = [f for f in df_rev.columns if not (f in cat_col_rev or f in emb_col_rev)]
# Name of the target column.
target = 'score'

In [14]:
# Replace missing categorical values with the literal string 'NaN'.
df_det[cat_col_det] = df_det[cat_col_det].fillna('NaN')

In [15]:
# Replace missing review texts with the literal string 'NaN'.
# NOTE(review): this placeholder is later tokenised and embedded like a real review — confirm that is intended.
df_rev[emb_col_rev] = df_rev[emb_col_rev].fillna('NaN')

In [16]:
# Parse the review date column into pandas datetimes.
df_rev['date'] = pd.to_datetime(df_rev['date'])

In [17]:
# Peek at one row of the details table.
df_det.head(1)

Unnamed: 0,id_prod,brandName,modelName,basePrice,salePrice,Гарантия,Серия,Состояние,Разрешение экрана,Тип процессора,...,Защитная пленка для экрана,Обратная беспроводная зарядка,Важная особенность,Удачное решение,Время в режиме разговора,Тип вилки,Сканер отпечатка пальца на задней панели,Датчики,Сканер отпечатка пальца на передней панели,Кабель Type-C - USB тип А
0,30063534,Apple,iPhone 13 128GB Midnight,79999,65999,1 год,iPhone 13,новый,2532x1170 Пикс,A15 Bionic,...,,,,,,,,,,


In [18]:
# Peek at one row of the reviews table.
df_rev.head(1)

Unnamed: 0,id_rev,product_id,date,score,name,benefits,drawbacks,text,Power,Working_hours,Functionality
0,rr61009250,30063534,2023-01-03,5.0,Динара,,,,5.0,5.0,5.0


# Сборка общего DF

In [19]:
# Left-join the selected review columns with the selected product columns
# (reviews.product_id == details.id_prod); every review row is kept.
df_all_data = df_rev[col_for_mod_rev].merge(df_det[col_for_mod_det], how='left', left_on='product_id', right_on='id_prod')

In [20]:
# The join keys are not used as features, so drop them after the merge.
df_all_data = df_all_data.drop(labels=['product_id', 'id_prod'], axis=1)

In [21]:
# Peek at one row of the merged frame.
df_all_data.head(1)

Unnamed: 0,benefits,drawbacks,text,name,score,date,brandName,modelName,Серия,SIM карта,...,Цвет,Операционная система,Разъем для наушников,Экран,Процессор,Оперативная память (RAM),Основная камера МПикс,Встроенная память (ROM),basePrice,salePrice
0,,,,Динара,5.0,2023-01-03,Apple,iPhone 13 128GB Midnight,iPhone 13,nano-SIM/eSim,...,темная ночь,iOS,,,,,,128,79999,65999


In [22]:
# Binarise the target: every rating below 5 becomes class 0.
df_all_data.loc[df_all_data['score'] < 5, 'score'] = 0

In [23]:
# Binarise the target: a rating of exactly 5 becomes class 1.
df_all_data.loc[df_all_data['score'] == 5, 'score'] = 1

In [24]:
# Cast RAM to string; this column ends up in the categorical feature list built further down.
df_all_data['Оперативная память (RAM)'] = df_all_data['Оперативная память (RAM)'].astype(str)

# Подготовка текстовых фичей

## Так как у BERT ограничение на длину входа (512 токенов), длинные тексты придётся обрезать

In [25]:
# Character lengths of the 'benefits' entries longer than 512 characters.
# Row mask and column are selected in one .loc call instead of chained indexing.
df_all_data.loc[df_all_data['benefits'].str.len() > 512, 'benefits'].str.len()

163      917
340      721
484      729
496      712
518      713
527      580
763      624
942     1719
983      586
985      542
1105     602
1118     646
1129     537
1532     522
2002     889
2850     582
3035     615
3449     908
3726     541
3858    1298
4206     564
5363     727
5375     554
5407     766
5430     557
5510     569
5531     791
5541     853
5776     595
5833     518
5879     655
5938     514
6330    1190
6811    2000
6923     548
7168     745
7169     747
7307    1357
8121     612
8171     863
Name: benefits, dtype: int64

In [26]:
def get_norm_df(df, col, max_len=512):
    """Truncate over-long strings of one column in place.

    Every string in ``df[col]`` longer than ``max_len`` characters is cut to its
    first ``max_len`` characters. Shorter strings and non-string cells are left
    untouched.

    Args:
        df: DataFrame to modify; it is mutated in place.
        col: Name of the text column to trim.
        max_len: Maximum number of characters to keep (default 512).

    Returns:
        The same ``df`` object, to allow call chaining.
    """
    lengths = df[col].str.len()
    # Non-string cells have a NaN length and therefore never count as "too long".
    too_long = lengths.gt(max_len).fillna(False).astype(bool)
    # One vectorised pass instead of a whole-column replace() per long row.
    df[col] = df[col].where(~too_long, df[col].str.slice(0, max_len))
    return df

In [30]:
# Trim over-long 'benefits' strings in place; the returned frame is displayed as the cell output.
# NOTE(review): no visible cell trims 'drawbacks' or 'text', and execution counts jump
# from 26 to 30 here — confirm whether those calls were removed.
get_norm_df(df_all_data, 'benefits')

100%|█████████████████████████████████████████████████████████████████████████| 8185/8185 [00:00<00:00, 1637040.59it/s]


Unnamed: 0,benefits,drawbacks,text,name,score,date,brandName,modelName,Серия,SIM карта,...,Цвет,Операционная система,Разъем для наушников,Экран,Процессор,Оперативная память (RAM),Основная камера МПикс,Встроенная память (ROM),basePrice,salePrice
0,,,,Динара,1.0,2023-01-03,Apple,iPhone 13 128GB Midnight,iPhone 13,nano-SIM/eSim,...,темная ночь,iOS,,,,,,128,79999,65999
1,"ПО топовое, железо, камера и дизайн",Нет,Перешел с Андроида на Эпл. Скажу сразу приятно...,Лев,1.0,2022-07-14,Apple,iPhone 13 128GB Midnight,iPhone 13,nano-SIM/eSim,...,темная ночь,iOS,,,,,,128,79999,65999
2,"Телефон хороший, а магазин - нет.",Сам магазин Мвидео и отношение как к нелюдям. ...,"Вечером приезжаю в магазин, оформляю кредит, п...",Александр,0.0,2024-03-18,Apple,iPhone 13 128GB Midnight,iPhone 13,nano-SIM/eSim,...,темная ночь,iOS,,,,,,128,79999,65999
3,"Бренд, стабильность, мощность, дизайн.","Отсутствие зарядки, закрытая система, нет 120H...",В общем-то наверное лучший Iphone по соотношен...,Сергей,1.0,2024-01-13,Apple,iPhone 13 128GB Midnight,iPhone 13,nano-SIM/eSim,...,темная ночь,iOS,,,,,,128,79999,65999
4,,,,Злата,0.0,2023-04-21,Apple,iPhone 13 128GB Midnight,iPhone 13,nano-SIM/eSim,...,темная ночь,iOS,,,,,,128,79999,65999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8180,,,,Аркадий,1.0,2024-01-20,realme,C67 6/128GB Green Oasis (RMX3890),C67,2 nano-SIM,...,зеленый,Android 14,3.5 мм,"6.72""/2400x1080 Пикс",Qualcomm Snapdragon 685 2.8 ГГц,6.0,108/2,128,18999,16999
8181,,,,Владимир,1.0,2024-03-19,realme,C67 6/128GB Green Oasis (RMX3890),C67,2 nano-SIM,...,зеленый,Android 14,3.5 мм,"6.72""/2400x1080 Пикс",Qualcomm Snapdragon 685 2.8 ГГц,6.0,108/2,128,18999,16999
8182,телефон просто огонь,нет,"за 14 тысяч купил и очень доволен, за эти день...",Роман,1.0,2024-03-07,realme,C67 6/128GB Green Oasis (RMX3890),C67,2 nano-SIM,...,зеленый,Android 14,3.5 мм,"6.72""/2400x1080 Пикс",Qualcomm Snapdragon 685 2.8 ГГц,6.0,108/2,128,18999,16999
8183,,,Я этот телефон еще когда он в анонсах был выпр...,Юлька Е,1.0,2024-03-28,realme,C67 6/128GB Green Oasis (RMX3890),C67,2 nano-SIM,...,зеленый,Android 14,3.5 мм,"6.72""/2400x1080 Пикс",Qualcomm Snapdragon 685 2.8 ГГц,6.0,108/2,128,18999,16999


In [31]:
# Checkpoint name used for both the tokenizer and the encoder below.
# NOTE(review): 'bert-base-uncased' is, as far as I know, an English-only checkpoint, while the
# review rows displayed above are in Russian — consider a multilingual or Russian BERT; confirm before changing.
model_name = 'bert-base-uncased'

In [32]:
# Load the tokenizer that matches the chosen checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [33]:
# Load the pretrained encoder; it is used only for feature extraction below.
bert = BertModel.from_pretrained(model_name)

In [34]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [35]:
# Move the encoder to the selected device; the cell output is the module repr.
bert.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [36]:
def predict_embed(input_ids, bert_model):
    """Encode one token sequence and max-pool it into a single vector.

    Args:
        input_ids: Token-id tensor passed straight to ``bert_model``; only the
            first sequence of the batch is used.
        bert_model: Callable whose result exposes ``last_hidden_state``.

    Returns:
        1-D tensor holding, for each hidden unit, the maximum over all token
        positions of the first sequence.
    """
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        token_states = bert_model(input_ids).last_hidden_state[0]
    # Max over the sequence axis; same result as a pooling window spanning the whole sequence.
    return token_states.max(dim=0).values

In [47]:
# Tokenise every review text into a (1, seq_len) id tensor on the target device.
# Sequences are truncated to the encoder's maximum number of positions; without this,
# a text longer than that limit overflows the position-embedding table in the forward pass.
tokens = []
for txt in tqdm(df_all_data['text'].values):
    ids = tokenizer.encode(
        txt,
        add_special_tokens=True,
        truncation=True,
        max_length=bert.config.max_position_embeddings,
    )
    tokens.append(torch.tensor([ids], dtype=torch.int, device=device))

100%|████████████████████████████████████████████████████████████████████████████| 8185/8185 [00:05<00:00, 1486.45it/s]


In [48]:
# One pooled embedding tensor per tokenised review, in the same order as `tokens`.
res = [predict_embed(tks, bert) for tks in tqdm(tokens)]

100%|██████████████████████████████████████████████████████████████████████████████| 8185/8185 [02:29<00:00, 54.90it/s]


In [49]:
# Move every embedding to host memory and convert it to a NumPy array.
embeds = [emb.cpu().numpy() for emb in tqdm(res)]

100%|███████████████████████████████████████████████████████████████████████████| 8185/8185 [00:00<00:00, 15384.18it/s]


In [50]:
# Replace the raw review text with its embedding vector.
df_all_data['text'] = embeds

# 'benefits' and 'drawbacks' are declared as embedding features when the Pools are built,
# so they must hold vectors too. Embed them here so the notebook runs top to bottom on a
# fresh kernel.
for emb_col in ['benefits', 'drawbacks']:
    # Skip columns that already hold vectors, so re-running this cell is harmless.
    if not df_all_data[emb_col].map(lambda v: isinstance(v, str)).all():
        continue
    df_all_data[emb_col] = [
        predict_embed(
            torch.tensor(
                [tokenizer.encode(
                    raw_text,
                    add_special_tokens=True,
                    truncation=True,
                    max_length=bert.config.max_position_embeddings,
                )],
                dtype=torch.int,
                device=device,
            ),
            bert,
        ).cpu().numpy()
        for raw_text in tqdm(df_all_data[emb_col].values)
    ]

In [52]:
# Check that the text columns now hold embedding vectors.
df_all_data.head(1)

Unnamed: 0,benefits,drawbacks,text,name,score,date,brandName,modelName,Серия,SIM карта,...,Цвет,Операционная система,Разъем для наушников,Экран,Процессор,Оперативная память (RAM),Основная камера МПикс,Встроенная память (ROM),basePrice,salePrice
0,"[0.6981506, 0.0368617, 0.039393686, 0.40808898...","[0.6981506, 0.0368617, 0.039393686, 0.40808898...","[0.6981506, 0.0368617, 0.039393686, 0.40808898...",Динара,1.0,2023-01-03,Apple,iPhone 13 128GB Midnight,iPhone 13,nano-SIM/eSim,...,темная ночь,iOS,,,,,,128,79999,65999


In [53]:
# Column dtypes and non-null counts of the modelling frame.
df_all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8185 entries, 0 to 8184
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   benefits                        8185 non-null   object        
 1   drawbacks                       8185 non-null   object        
 2   text                            8185 non-null   object        
 3   name                            8185 non-null   object        
 4   score                           8185 non-null   float64       
 5   date                            8185 non-null   datetime64[ns]
 6   brandName                       8185 non-null   object        
 7   modelName                       8185 non-null   object        
 8   Серия                           8185 non-null   object        
 9   SIM карта                       8185 non-null   object        
 10  Поддержка стандартов            8185 non-null   object        
 11  Порт

# Строим модель

In [54]:
# Target column name.
target = 'score'
# Columns kept for information only; they are never fed to the model as features.
col_for_inf = ['score', 'date']
# Numeric features.
num_feat = ['basePrice', 'salePrice']
# Columns holding embedding vectors.
embed_feat = ['benefits', 'drawbacks', 'text']
# Every remaining column is treated as categorical.
cat_feat = [
    f for f in df_all_data.columns
    if not (f in embed_feat or f in num_feat or f in col_for_inf)
]

In [55]:
# Model inputs: every column except the target and the date.
features = [f for f in df_all_data.columns if f not in col_for_inf]

In [56]:
# Hold out 20% of the rows as an out-of-sample set, stratified by the target.
dev, oos = train_test_split(df_all_data, stratify=df_all_data[target], random_state=42, test_size=0.2)

In [58]:
# Split the development part again: 80% train / 20% validation, stratified by the target.
train, valid = train_test_split(dev, stratify=dev[target], random_state=42, test_size=0.2)

In [61]:
def _make_pool(frame):
    """Wrap one data split into a CatBoost Pool using the shared feature configuration."""
    return Pool(
        frame[features],
        label=frame[target],
        cat_features=cat_feat,
        embedding_features=embed_feat,
    )


# All three splits are wrapped identically; only the source frame differs.
train_pool = _make_pool(train)
oos_pool = _make_pool(oos)
valid_pool = _make_pool(valid)

In [63]:
# Depth-4 trees, up to 1000 boosting rounds, AUC as the tracked metric;
# training stops after 100 rounds without improvement.
cbc = CatBoostClassifier(eval_metric='AUC', depth=4, iterations=1000, early_stopping_rounds=100)

In [64]:
# Fit on the training pool; the validation pool is the eval set on which AUC is tracked.
cbc.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.047703
0:	test: 0.5000000	best: 0.5000000 (0)	total: 165ms	remaining: 2m 45s
1:	test: 0.5000000	best: 0.5000000 (0)	total: 170ms	remaining: 1m 24s
2:	test: 0.5755399	best: 0.5755399 (2)	total: 195ms	remaining: 1m 4s
3:	test: 0.5755399	best: 0.5755399 (2)	total: 203ms	remaining: 50.4s
4:	test: 0.5755399	best: 0.5755399 (2)	total: 210ms	remaining: 41.9s
5:	test: 0.5817002	best: 0.5817002 (5)	total: 240ms	remaining: 39.8s
6:	test: 0.5817002	best: 0.5817002 (5)	total: 262ms	remaining: 37.1s
7:	test: 0.5817002	best: 0.5817002 (5)	total: 275ms	remaining: 34.2s
8:	test: 0.5915683	best: 0.5915683 (8)	total: 304ms	remaining: 33.5s
9:	test: 0.5958289	best: 0.5958289 (9)	total: 333ms	remaining: 32.9s
10:	test: 0.6003783	best: 0.6003783 (10)	total: 363ms	remaining: 32.6s
11:	test: 0.6280246	best: 0.6280246 (11)	total: 383ms	remaining: 31.5s
12:	test: 0.6280246	best: 0.6280246 (11)	total: 411ms	remaining: 31.2s
13:	test: 0.6320463	best: 0.6320463 (13)	total: 441ms	remaining: 

<catboost.core.CatBoostClassifier at 0x25bb5cffa50>

In [65]:
def get_metric(model, pool, metric):
    """Score ``model`` on ``pool`` with ``metric``.

    Args:
        model: Object with ``predict(pool, prediction_type=...)``.
        pool: Object with ``get_label()``; also passed to ``model.predict``.
        metric: Callable invoked as ``metric(labels, raw_predictions)``.

    Returns:
        Whatever ``metric`` returns.
    """
    y_true = pool.get_label()
    # Raw (un-transformed) model outputs are used as the ranking score.
    raw_scores = model.predict(pool, prediction_type="RawFormulaVal")
    return metric(y_true, raw_scores)


def print_cb_metrics(model, train_pool, valid_pool=None, oos_pool=None, oot_pool=None, metric=None):
    """Compute, print and return one metric for each supplied data pool.

    Args:
        model: Fitted model, forwarded to ``get_metric``.
        train_pool: Training pool; always scored.
        valid_pool: Optional validation pool.
        oos_pool: Optional out-of-sample pool.
        oot_pool: Optional out-of-time pool.
        metric: Dict with keys ``'name'`` (label used in the printout) and
            ``'func'`` (callable ``func(labels, predictions)``). Defaults to
            ROC AUC.

    Returns:
        Dict with keys ``'train'``, ``'valid'``, ``'oos'``, ``'oot'``; a value is
        ``None`` when the corresponding pool was not supplied.
    """
    if metric is None:
        metric = {'name': 'AUC', 'func': roc_auc_score}

    metrics = dict.fromkeys(('train', 'valid', 'oos', 'oot'))

    # The training pool is mandatory, so it is scored unconditionally.
    train_score = get_metric(model, train_pool, metric.get('func'))
    print(f"TRAIN {metric.get('name')}: [{train_score}]")
    metrics['train'] = train_score

    # The remaining pools are scored only when provided, in a fixed order.
    optional_pools = (('valid', valid_pool), ('oos', oos_pool), ('oot', oot_pool))
    for split, split_pool in optional_pools:
        if split_pool is None:
            continue
        split_score = get_metric(model, split_pool, metric.get('func'))
        print(f"{split.upper()} {metric.get('name')}: [{split_score}]")
        metrics[split] = split_score

    return metrics

In [68]:
# Metric descriptor in the {'name', 'func'} form that print_cb_metrics reads.
auc = {'name': 'AUC', 'func': roc_auc_score}

In [69]:
# AUC of the single early-stopped model on train, validation and out-of-sample pools.
print_cb_metrics(model=cbc, train_pool=train_pool, valid_pool=valid_pool, oos_pool=oos_pool, oot_pool=None, metric=auc)

TRAIN AUC: [0.7410457232002083]
VALID AUC: [0.7046532684530951]
OOS AUC: [0.7133219531277785]


{'train': 0.7410457232002083,
 'valid': 0.7046532684530951,
 'oos': 0.7133219531277785,
 'oot': None}

In [70]:
# Training parameters shared by every cross-validation fold.
cv_params = {
    "iterations": 250,
    "learning_rate": 0.1,
    "depth": 4,
    "l2_leaf_reg": 1,
    "loss_function": 'Logloss',
    # "border_count": 256,
    "task_type": "CPU",
    # "gpu_ram_part": 1,
    "random_state": 42,
    "eval_metric": "F1",
    "silent": True,
}

# 3 stratified folds repeated twice = 6 fits. The fixed random_state makes the fold
# assignment identical on every run; without it the folds change on each execution.
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)

# return_models=True also returns the fitted fold models alongside the CV results.
cv_res, cv_models = cv(
    pool=train_pool,
    params=cv_params,
    partition_random_seed=0,
    seed=42,
    shuffle=True,
    logging_level=None,
    folds=rskf,
    return_models=True,
)

Training on fold [0/6]

bestTest = 0.937195122
bestIteration = 87

Training on fold [1/6]

bestTest = 0.937195122
bestIteration = 61

Training on fold [2/6]

bestTest = 0.9376727049
bestIteration = 94

Training on fold [3/6]

bestTest = 0.9363387146
bestIteration = 1

Training on fold [4/6]

bestTest = 0.9366240098
bestIteration = 237

Training on fold [5/6]

bestTest = 0.9369479135
bestIteration = 19



In [71]:
class CustomCBModel:
    """Soft-voting ensemble that averages the probabilities of several fitted models.

    Each wrapped model must expose ``predict(pool, prediction_type='Probability')``
    returning a 2-D array whose last column is the positive-class probability.
    """

    def __init__(self, cv_models):
        """Store the sequence of fitted models to average over."""
        self._models = cv_models

    def _predict(self, pool):
        """Return the mean positive-class probability over all stored models.

        Raises:
            ValueError: If the ensemble holds no models.
        """
        # Use the models stored on the instance, not a module-level variable.
        if len(self._models) == 0:
            raise ValueError("CustomCBModel needs at least one model to predict")
        per_model = [
            np.asarray(model.predict(pool, prediction_type='Probability'))[:, -1]
            for model in self._models
        ]
        # Shape (n_models, n_rows) -> average over the models axis.
        return np.asarray(per_model, dtype=float).mean(axis=0)

    def predict(self, pool, prediction_type='Probability', threshhold=0.5):
        """Predict averaged probabilities or thresholded class labels.

        Args:
            pool: Data passed unchanged to every wrapped model.
            prediction_type: ``'Probability'`` for averaged probabilities or
                ``'Classes'`` for 0/1 labels.
            threshhold: Probability at or above which class 1 is predicted
                (the spelling is kept for backward compatibility).

        Returns:
            1-D NumPy array of floats (``'Probability'``) or ints (``'Classes'``).

        Raises:
            NotImplementedError: For any other ``prediction_type``.
        """
        if prediction_type == 'Probability':
            return self._predict(pool)
        elif prediction_type == 'Classes':
            return (self._predict(pool) >= threshhold).astype(int)
        else:
            raise NotImplementedError(f"Passed prediction_type={prediction_type} is not allowed! Use some of ['Probability', 'Classes']")

In [72]:
# Average the fold models returned by cv() into a single predictor.
cbc_wrapper = CustomCBModel(cv_models)

In [75]:
# F1 of the averaged fold models on the training pool at the default 0.5 threshold.
# NOTE(review): the fold models were fitted on folds of this same pool, so this is an in-sample figure.
f1_score(train_pool.get_label(), cbc_wrapper.predict(train_pool, prediction_type='Classes'))

0.936731999593785

In [76]:
# F1 of the averaged fold models on the out-of-sample pool at the default 0.5 threshold.
# NOTE(review): the recorded value is almost identical to the train F1 while OOS AUC is far lower —
# check whether the ensemble predicts nearly every row as class 1.
f1_score(oos_pool.get_label(), cbc_wrapper.predict(oos_pool, prediction_type='Classes'))

0.9369720597790773

In [80]:
# ROC AUC of the averaged fold probabilities on the training pool.
roc_auc_score(train_pool.get_label(), cbc_wrapper.predict(train_pool, prediction_type='Probability'))

0.6674427594080174

In [78]:
# ROC AUC of the averaged fold probabilities on the out-of-sample pool.
roc_auc_score(oos_pool.get_label(), cbc_wrapper.predict(oos_pool, prediction_type='Probability'))

0.5854546747750631