# Imports

In [1]:
import os
import pickle

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import average_precision_score

# Switch the working directory two levels up so that the `solution` imports
# below and relative paths such as "models/..." resolve from there.
# Guarded so that re-running this cell does not climb two more directories.
# NOTE(review): assumes `solution` is a directory inside the target folder — confirm.
if not os.path.isdir("solution"):
    os.chdir(os.path.join("..", ".."))

from solution.constants import (
    REVIEWS_PATH,
    PRODUCTS_PATH,
    RETURN_REASONS_PATH,
    RETURNS_PATH,
    TEST_PATH,
    BAD_REVIEWS_GROUPED_BY_PRODUCT_ID,
)
from solution.fasttext import embed_sentences
from solution.train import train_valid_test_split

In [56]:
from catboost import CatBoostClassifier
from sklearn.metrics import average_precision_score


# Load embeddings

In [2]:
# Read the pre-computed embeddings mapping; later cells index it by product id.
# NOTE: pickle.load can execute arbitrary code — only load trusted artefacts.
with open(BAD_REVIEWS_GROUPED_BY_PRODUCT_ID, mode="rb") as embeddings_file:
    bad_reviews_embs_by_id = pickle.load(embeddings_file)

In [3]:
# Ids of every product that has an entry in the embeddings mapping.
products_with_reviews = {product_id for product_id in bad_reviews_embs_by_id}

# Load returns

In [4]:
# Load the returns table and preview five rows.
# `sample` is called without `random_state`, so the preview changes on every run.
returns_df = pd.read_parquet(RETURNS_PATH)
returns_df.sample(5)

Unnamed: 0_level_0,id,product_id,cause,comment,date_created,order_item_id,customer_id,purchase_price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
204889,2568378,9263ca5ce2a899e383b26ec065a67801fbdf37cb3b4bb6...,WRONG_SIZE,очень большой,2023-10-10 13:14:23.047,18803819,961bf42455279bf5478e4e45d44ff27710933c042beb44...,420000
262036,3506959,b09aa725541965bc862a5ccacf7b0699b3a0cf214717c1...,WRONG_SIZE,размер не подошел,2023-11-24 12:32:18.933,24306601,11272697e45ecc3bd93911b433256e492d63fb6da48b88...,570000
71357,803150,912b3eee79cdfcbac6c0eb38e43b7eff13afa6a5e9fa22...,WRONG_SIZE,мал,2023-06-07 12:46:53.889,6983799,338707705751a6d1177d3d608175ca876c285540ec3bc1...,175000
103887,1224841,8fd87a06a4e088880af8338506f838e2ffc00db068c840...,DEFECTED,НЕТ ПОДКЛЮЧЕНИЯ,2023-07-13 08:56:31.806,9818976,0837e3b680b9c33aca9460e6798982573c2b06449ced2b...,29000
132492,1596257,ab888907afbbe1cab42fe956971e519605ffc1fe534a6d...,WRONG_SIZE,размер не подашло,2023-08-10 12:40:40.239,12312889,c76394093cb1a69ae6e78bc9e478aa081aae55ee5441c0...,44000


In [5]:
returns_df.shape

(282141, 8)

In [6]:
# Keep only the returns whose product has an entry in the embeddings mapping.
has_reviews_mask = returns_df["product_id"].isin(products_with_reviews)
returns_with_reviews_df = returns_df.loc[has_reviews_mask]
returns_with_reviews_df.shape

(197301, 8)

# Prepare dataset

In [7]:
# Split the filtered returns into train / validation / test parts using the
# project helper; 0.15 is requested for validation and 0.15 for test, with
# "cause" passed as the stratification column.
# The returned frame carries boolean `train` / `valid` / `test` columns
# (see the preview in the next cell).
# NOTE(review): no seed is passed here — confirm the split is reproducible.
returns_with_reviews_df_splitted = train_valid_test_split(
    df=returns_with_reviews_df, val_size=0.15, test_size=0.15, stratify_col="cause"
)

In [9]:
returns_with_reviews_df_splitted.head()

Unnamed: 0_level_0,id,product_id,cause,comment,date_created,order_item_id,customer_id,purchase_price,train,valid,test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,69138,6a1a7601fac958ee967c73fe19315db8f6cdc3f1cd8370...,DEFECTED,брак,2023-01-02 05:37:33.846,588140,b4465ede5691891836ccc432bb8c49e1537b1d0a74f721...,106000,True,False,False
1,69148,7cab221310edf5f3c75fc38259bcb7640d080b4b05d5bb...,PHOTO_MISMATCH,думала больше,2023-01-02 05:44:34.432,773695,9bf74458174dd9c039ee6317fd48b356e8fc146f23c60b...,23000,False,True,False
2,69154,728611508a21a9214f2c8cc076d21e30046ec5c59bf359...,DEFECTED,брак,2023-01-02 05:45:31.277,695067,0a185871d03ee346b71b657d3fbaebfc35823fec2861f7...,390000,True,False,False
3,69161,f4f4031321f9b7cf1175fc6d363769297334ddd76aa2eb...,WRONG_ITEM,не тот товар,2023-01-02 05:57:35.652,635687,1123ce2b71eb64c572e6de0e14a723c17a55f67748327d...,71000,False,True,False
5,69178,5d4a48095318f551b67ef51c208282a339440aa92af602...,WRONG_SIZE,Не тот размер,2023-01-02 06:06:54.296,704331,ee8bd56b1a873f958b0e83fc68a72da22e431cfde8cefe...,35000,True,False,False


In [13]:
# Training rows only, reduced to the product id and the target label.
is_train_row = returns_with_reviews_df_splitted["train"]
train_products_ids = returns_with_reviews_df_splitted.loc[
    is_train_row, ["product_id", "cause"]
]

In [14]:
train_products_ids

Unnamed: 0_level_0,product_id,cause
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6a1a7601fac958ee967c73fe19315db8f6cdc3f1cd8370...,DEFECTED
2,728611508a21a9214f2c8cc076d21e30046ec5c59bf359...,DEFECTED
5,5d4a48095318f551b67ef51c208282a339440aa92af602...,WRONG_SIZE
8,48b99e8820665207789fcc460ad4306bfa65107752610a...,WRONG_SIZE
9,f3daad81298c1983cdc886ef042213345bfcb56ae2f2ee...,DEFECTED
...,...,...
282123,4c76c0a14fcfb8d66083a56bdbbc914d2407b2a30008be...,WRONG_SIZE
282126,e11c92888d1b1615e668d4470da189ababd82183b6384f...,DEFECTED
282132,1882807554859d663b1b1629546d9fd772f971df1188cb...,BAD_QUALITY
282134,4f3e95a420d43c99ffa1ce5ef76eb8f94310d5b5500dcf...,WRONG_SIZE


In [19]:
# Target: the return cause of every training row.
y_train = train_products_ids["cause"]

# Features: one embedding vector per training row, looked up by product id.
x_train = np.array(
    [bad_reviews_embs_by_id[pid] for pid in train_products_ids["product_id"]]
)
x_train.shape

(138109, 300)

In [20]:
# Validation rows only, reduced to the product id and the target label.
is_valid_row = returns_with_reviews_df_splitted["valid"]
valid_products_ids = returns_with_reviews_df_splitted.loc[
    is_valid_row, ["product_id", "cause"]
]

# Target: the return cause of every validation row.
y_valid = valid_products_ids["cause"]

# Features: one embedding vector per validation row, looked up by product id.
x_valid = np.array(
    [bad_reviews_embs_by_id[pid] for pid in valid_products_ids["product_id"]]
)
x_valid.shape

(29596, 300)

In [21]:
# Test rows only, reduced to the product id and the target label.
is_test_row = returns_with_reviews_df_splitted["test"]
test_products_ids = returns_with_reviews_df_splitted.loc[
    is_test_row, ["product_id", "cause"]
]

# Target: the return cause of every test row.
y_test = test_products_ids["cause"]

# Features: one embedding vector per test row, looked up by product id.
x_test = np.array(
    [bad_reviews_embs_by_id[pid] for pid in test_products_ids["product_id"]]
)
x_test.shape

(29596, 300)

# Training

In [29]:
# Baseline classifier with CatBoost's default hyper-parameters.
model = CatBoostClassifier()

# The validation split is used as the evaluation set: training stops early
# after 300 rounds without improvement and `use_best_model` keeps the best
# iteration; `plot=True` renders the interactive training chart.
fit_params = dict(
    eval_set=(x_valid, y_valid),
    use_best_model=True,
    plot=True,
    early_stopping_rounds=300,
)
best_model = model.fit(x_train, y_train, **fit_params)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.11916
0:	learn: 1.4938176	test: 1.4944459	best: 1.4944459 (0)	total: 126ms	remaining: 2m 5s
1:	learn: 1.4092809	test: 1.4102230	best: 1.4102230 (1)	total: 241ms	remaining: 2m
2:	learn: 1.3492526	test: 1.3496883	best: 1.3496883 (2)	total: 352ms	remaining: 1m 57s
3:	learn: 1.3034065	test: 1.3038961	best: 1.3038961 (3)	total: 466ms	remaining: 1m 56s
4:	learn: 1.2669343	test: 1.2674078	best: 1.2674078 (4)	total: 624ms	remaining: 2m 4s
5:	learn: 1.2361062	test: 1.2363850	best: 1.2363850 (5)	total: 751ms	remaining: 2m 4s
6:	learn: 1.2114395	test: 1.2116626	best: 1.2116626 (6)	total: 866ms	remaining: 2m 2s
7:	learn: 1.1907031	test: 1.1909273	best: 1.1909273 (7)	total: 991ms	remaining: 2m 2s
8:	learn: 1.1730959	test: 1.1733620	best: 1.1733620 (8)	total: 1.11s	remaining: 2m 2s
9:	learn: 1.1573254	test: 1.1575490	best: 1.1575490 (9)	total: 1.23s	remaining: 2m 1s
10:	learn: 1.1439900	test: 1.1445475	best: 1.1445475 (10)	total: 1.35s	remaining: 2m 1s
11:	learn: 1.1329611	tes

In [30]:
# Persist the trained model to disk.
# The target directory is created first so that a fresh checkout without a
# `models/` folder does not fail on save.
model_path = "models/best_baseline.cbm"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
best_model.save_model(fname=model_path)

In [42]:
# Class probabilities for the test rows: one row per sample, one column per class.
# NOTE(review): column order presumably follows `best_model.classes_` — confirm.
predictions = best_model.predict_proba(X=x_test)
predictions

array([[0.07567861, 0.01637239, 0.05065903, 0.04174136, 0.81554861],
       [0.10130524, 0.69749348, 0.02894222, 0.1078223 , 0.06443676],
       [0.25246805, 0.69622546, 0.0163203 , 0.02668537, 0.00830081],
       ...,
       [0.08126197, 0.06278647, 0.02125101, 0.0321158 , 0.80258475],
       [0.11196946, 0.7672015 , 0.03840227, 0.04560925, 0.03681751],
       [0.15443231, 0.08159821, 0.03657479, 0.11859244, 0.60880226]])

In [37]:
best_model.classes_

array(['BAD_QUALITY', 'DEFECTED', 'PHOTO_MISMATCH', 'WRONG_ITEM',
       'WRONG_SIZE'], dtype=object)

In [38]:
# Reference table of return reasons; the preview shows an `id` and a `reason` column.
return_reasons_df = pd.read_parquet(RETURN_REASONS_PATH)
return_reasons_df.head()

Unnamed: 0,id,reason
0,1,DEFECTED
1,2,WRONG_ITEM
2,3,BAD_QUALITY
3,4,PHOTO_MISMATCH
4,5,WRONG_SIZE


In [47]:
# Reorder the probability columns from the model's class order
# (`best_model.classes_`) to the row order of the return-reasons table.
# The permutation is derived by class name instead of being hard-coded, so it
# stays correct if the model's class order ever changes.
model_classes = list(best_model.classes_)
rearrange_idxs = [
    model_classes.index(reason)
    for reason in return_reasons_df["reason"]
    if reason in model_classes
]

# Every model class must appear exactly once; otherwise a probability column
# would be silently dropped or duplicated.
assert sorted(rearrange_idxs) == list(range(len(model_classes))), (
    "return_reasons_df does not cover every class known to the model exactly once"
)

# Recompute the probabilities instead of permuting `predictions` in place:
# re-running this cell would otherwise apply the permutation a second time.
predictions = best_model.predict_proba(X=x_test)[:, rearrange_idxs]

classes = best_model.classes_[rearrange_idxs]
classes

array(['DEFECTED', 'WRONG_ITEM', 'BAD_QUALITY', 'PHOTO_MISMATCH',
       'WRONG_SIZE'], dtype=object)

# Evaluation

In [48]:
# Map each return reason to its column position in the reordered predictions.
label_to_index_dict = {reason: position for position, reason in enumerate(classes)}

label_to_index_dict

{'DEFECTED': 0,
 'WRONG_ITEM': 1,
 'BAD_QUALITY': 2,
 'PHOTO_MISMATCH': 3,
 'WRONG_SIZE': 4}

In [52]:
# Translate every test label into its column index; an unknown label raises KeyError.
test_indeces = y_test.apply(label_to_index_dict.__getitem__)
test_indeces

index
36        4
69        0
122       0
123       0
124       0
         ..
282110    0
282117    4
282124    1
282130    0
282131    4
Name: cause, Length: 29596, dtype: int64

In [54]:
# One-hot encode the test labels: row i gets a 1 in the column of its true class.
n_samples, n_classes = len(test_indeces), len(label_to_index_dict)
y_true_array = np.zeros((n_samples, n_classes))
y_true_array[np.arange(n_samples), test_indeces.to_numpy()] = 1

y_true_array

array([[0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])

In [57]:
# Macro-averaged average precision of the model on the test split
# (per-class scores averaged with equal weight).
model_macro_ap = average_precision_score(
    y_true=y_true_array,
    y_score=predictions,
    average="macro",
)
model_macro_ap

0.322493003118864

In [73]:
from random import randint, seed

# Fixed seed so the random baseline is reproducible across runs and re-runs
# of this cell.
RANDOM_BASELINE_SEED = 42
seed(RANDOM_BASELINE_SEED)

# Baseline: every test row gets a single class picked uniformly at random,
# encoded one-hot so it can be scored the same way as the model output.
n_classes = len(label_to_index_dict)
random_predictions = np.zeros((len(test_indeces), n_classes))

for row in range(len(test_indeces)):
    # randint is inclusive on both ends, hence `n_classes - 1`; the bound is
    # tied to the number of classes rather than a literal.
    index = randint(0, n_classes - 1)
    random_predictions[row, index] = 1


average_precision_score(y_true=y_true_array, y_score=random_predictions, average="macro")

0.1992126590212673