In [2]:
import pandas as pd

# Loading Data

In [3]:
# Every input file is read with lines=True, i.e. one JSON record per line.
# The generator is consumed left to right, so the files are read in the same
# order as the names on the left-hand side.
train_ftrs, train_target, val_ftrs, val_target, test_ftrs = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        './input_data/attribute_train.data',
        './input_data/attribute_train.solution',
        './input_data/attribute_val.data',
        './input_data/attribute_val.solution',
        './input_data/attribute_test.data',
    )
)

In [31]:
# combine_infos() concatenates 'details_Manufacturer' as a string, so a missing
# value (NaN) would raise a TypeError there. Fill every split that is passed to
# it; the training split is included because it goes through the same function
# (this is a no-op if it has no missing manufacturers).
train_ftrs['details_Manufacturer'] = train_ftrs['details_Manufacturer'].fillna('unknown')
val_ftrs['details_Manufacturer'] = val_ftrs['details_Manufacturer'].fillna('unknown')
test_ftrs['details_Manufacturer'] = test_ftrs['details_Manufacturer'].fillna('unknown')

## Combining the title, store, and manufacturer into a single sentence

In [32]:
def combine_infos(row):
    """Build a short three-sentence description from one product record.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide the keys 'title', 'store' and 'details_Manufacturer'.

    Returns
    -------
    str
        Sentences stating the title, the store and the manufacturer.

    Notes
    -----
    Missing scalar values (None / NaN / pandas NA) are rendered as 'unknown',
    the same placeholder the notebook uses when filling the manufacturer
    column, and other non-string values are converted with str(). Plain
    string concatenation would raise a TypeError on such values and abort the
    whole row-wise apply. String inputs produce exactly the same text as
    plain concatenation.
    """
    def _as_text(value):
        # Fast path: strings are used verbatim.
        if isinstance(value, str):
            return value
        # is_scalar guards pd.isna against list/array values, for which it
        # would return an array instead of a single boolean.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 'unknown'
        return str(value)

    return (
        'The title of the product is ' + _as_text(row['title'])
        + '. This product was bought at the store ' + _as_text(row['store'])
        + '. The manufacturer of this product is ' + _as_text(row['details_Manufacturer'])
        + '.'
    )

In [33]:
# One-column frame ('Description') holding the combined sentence built by
# combine_infos for every row of the training features.
train_prprcsd = pd.DataFrame({'Description': train_ftrs.apply(combine_infos, axis=1)})

In [34]:
# Display the combined training descriptions as the cell output.
train_prprcsd

Unnamed: 0,Description
0,The title of the product is Enclume Angled Pot...
1,The title of the product is Schutt Vengeance D...
2,The title of the product is Easton 2014 MAKO S...
3,The title of the product is Bilstein B46-0929 ...
4,The title of the product is Apple Red Cardstoc...
...,...
443494,The title of the product is Sony DCR-HC32 Mini...
443495,The title of the product is Monster Truck Park...
443496,The title of the product is 3dRose Pyrenees Do...
443497,The title of the product is adidas F50 Lesto S...


### Validation set

In [35]:
# One-column frame ('Description') holding the combined sentence built by
# combine_infos for every row of the validation features.
val_prprcsd = pd.DataFrame({'Description': val_ftrs.apply(combine_infos, axis=1)})

In [36]:
# Display the combined validation descriptions as the cell output.
val_prprcsd

Unnamed: 0,Description
0,"The title of the product is Pendleton, Eco-Wis..."
1,The title of the product is JP London MD3A049 ...
2,The title of the product is Lawn Fawn LF2938 F...
3,The title of the product is ANCHEER Foldable E...
4,The title of the product is Schecter Jeff Loom...
...,...
95030,The title of the product is Dodge Ram Rebel Bl...
95031,The title of the product is AM Autoparts Door ...
95032,The title of the product is Manduka eQua Plus ...
95033,The title of the product is Elmer's Products E...


### Test set

In [37]:
# One-column frame ('Description') holding the combined sentence built by
# combine_infos for every row of the test features.
test_prprcsd = pd.DataFrame({'Description': test_ftrs.apply(combine_infos, axis=1)})

In [38]:
# Display the combined test descriptions as the cell output.
test_prprcsd

Unnamed: 0,Description
0,The title of the product is CURT 58180 Trailer...
1,The title of the product is CafePress Andrew J...
2,The title of the product is Garage-Pro Driver ...
3,The title of the product is Husky Liners Front...
4,The title of the product is Nearly Natural 130...
...,...
95031,The title of the product is Discraft Avenger S...
95032,The title of the product is ProLume Prolumeme ...
95033,The title of the product is Nearly Natural 484...
95034,The title of the product is Gorilla Automotive...


## Let's create embeddings for these texts

In [39]:
from transformers import AutoModel
from tqdm import tqdm
import torch

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained embedding model and move it to the selected device.
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True).to(device)

# Encode the training descriptions one at a time, in row order, with gradient
# tracking disabled; tqdm shows progress over the whole column.
with torch.no_grad():
    train_embeddings = [
        model.encode(description)
        for description in tqdm(train_prprcsd['Description'], desc="Encoding descriptions")
    ]

In [40]:
import pickle

# Persist the training embeddings to train_embeddings.pkl.
with open('train_embeddings.pkl', 'wb') as f:
    pickle.dump(train_embeddings, f)

Each output embedding is of dimension 768

In [41]:
# Load the training embeddings back from train_embeddings.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were written by this notebook.
with open('train_embeddings.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)

In [42]:
# Number of training embeddings currently held in memory.
len(train_embeddings)

443499

### For validation and test sets

In [43]:
# Encode the validation descriptions one at a time, in row order, with
# gradient tracking disabled; tqdm shows progress over the whole column.
with torch.no_grad():
    val_embeddings = [
        model.encode(description)
        for description in tqdm(val_prprcsd['Description'], desc="Encoding descriptions")
    ]

In [44]:
# Persist the validation embeddings to val_embeddings.pkl.
with open('val_embeddings.pkl', 'wb') as f:
    pickle.dump(val_embeddings, f)

In [45]:
# Encode the test descriptions one at a time, in row order, with gradient
# tracking disabled; tqdm shows progress over the whole column.
with torch.no_grad():
    test_embeddings = [
        model.encode(description)
        for description in tqdm(test_prprcsd['Description'], desc="Encoding descriptions")
    ]

In [46]:
# Persist the test embeddings to test_embeddings.pkl.
with open('test_embeddings.pkl', 'wb') as f:
    pickle.dump(test_embeddings, f)

In [47]:
# Load the validation embeddings back from val_embeddings.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were written by this notebook.
with open('val_embeddings.pkl', 'rb') as f:
    val_embeddings = pickle.load(f)

In [48]:
# Load the test embeddings back from test_embeddings.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were written by this notebook.
with open('test_embeddings.pkl', 'rb') as f:
    test_embeddings = pickle.load(f)

### Label Encoding

In [90]:
# Number of distinct non-null brand labels in the training targets.
train_target['details_Brand'].nunique()

5066

In [85]:
# Number of distinct non-null L0 categories in the training targets.
train_target['L0_category'].nunique()

27

In [86]:
# Number of distinct non-null L1 categories in the training targets.
train_target['L1_category'].nunique()

163

In [91]:
# Show how many training rows fall into each L1 category.
train_target['L1_category'].value_counts()

L1_category
Replacement Parts                            78780
Home Dcor Products                           23083
Computers & Accessories                      21964
Kitchen & Dining                             21700
Office & School Supplies                     20941
                                             ...  
Meat & Seafood                                  22
Test, Measure & Inspect                         22
eBook Readers & Accessories                     22
Mobility Aids & Equipment                       22
Headlight Assemblies, Parts & Accessories       22
Name: count, Length: 163, dtype: int64

In [94]:
# L0 categories (with row counts) of the training rows whose L1 category is
# 'Computers & Accessories'.
train_target.loc[
    train_target['L1_category'] == 'Computers & Accessories', 'L0_category'
].value_counts()

L0_category
Electronics    21964
Name: count, dtype: int64

In [87]:
# Number of distinct non-null L2 categories in the training targets.
train_target['L2_category'].nunique()

612

In [88]:
# Number of distinct non-null L3 categories in the training targets.
train_target['L3_category'].nunique()

1252

In [89]:
# Number of distinct non-null L4 categories in the training targets.
train_target['L4_category'].nunique()

962

In [49]:
from sklearn.preprocessing import LabelEncoder

# One independent encoder per target column: the brand plus the five
# category levels L0..L4.
le_db, le_L0, le_L1, le_L2, le_L3, le_L4 = (LabelEncoder() for _ in range(6))

In [50]:
# Fit each encoder on its own training target column and keep the resulting
# integer-encoded labels. Pairs are processed in the order listed.
trtgt_db, trtgt_L0, trtgt_L1, trtgt_L2, trtgt_L3, trtgt_L4 = (
    encoder.fit_transform(train_target[column])
    for encoder, column in (
        (le_db, 'details_Brand'),
        (le_L0, 'L0_category'),
        (le_L1, 'L1_category'),
        (le_L2, 'L2_category'),
        (le_L3, 'L3_category'),
        (le_L4, 'L4_category'),
    )
)

## Model Training

In [51]:
from xgboost import XGBClassifier

In [52]:
# Train an XGBoost classifier (histogram tree method, CUDA device) that maps
# the training embeddings to the encoded brand labels.
xgb_db = XGBClassifier(tree_method='hist', device='cuda')
xgb_db.fit(train_embeddings, trtgt_db)

In [53]:
# Save the trained brand model to disk.
xgb_db.save_model('./XGB_models/xgb_detailsBrand.bin')

In [54]:
# Replace xgb_db with a fresh classifier and load the saved brand model into it.
xgb_db = XGBClassifier()
xgb_db.load_model('./XGB_models/xgb_detailsBrand.bin')

In [55]:
# Train an XGBoost classifier (histogram tree method, CUDA device) that maps
# the training embeddings to the encoded L0 category labels.
xgb_L0 = XGBClassifier(tree_method='hist', device='cuda')
xgb_L0.fit(train_embeddings, trtgt_L0)

In [56]:
# Save the trained L0 category model to disk.
xgb_L0.save_model('./XGB_models/xgb_L0category.bin')

In [57]:
# Replace xgb_L0 with a fresh classifier and load the saved L0 model into it.
xgb_L0 = XGBClassifier()
xgb_L0.load_model('./XGB_models/xgb_L0category.bin')

In [58]:
# Train an XGBoost classifier (histogram tree method, CUDA device) that maps
# the training embeddings to the encoded L1 category labels.
xgb_L1 = XGBClassifier(tree_method='hist', device='cuda')
xgb_L1.fit(train_embeddings, trtgt_L1)

In [59]:
# Save the trained L1 category model to disk.
xgb_L1.save_model('./XGB_models/xgb_L1category.bin')

In [60]:
# Replace xgb_L1 with a fresh classifier and load the saved L1 model into it.
xgb_L1 = XGBClassifier()
xgb_L1.load_model('./XGB_models/xgb_L1category.bin')

In [61]:
# Train an XGBoost classifier (histogram tree method, CUDA device) that maps
# the training embeddings to the encoded L2 category labels.
xgb_L2 = XGBClassifier(tree_method='hist', device='cuda')
xgb_L2.fit(train_embeddings, trtgt_L2)

In [62]:
# Save the trained L2 category model to disk.
xgb_L2.save_model('./XGB_models/xgb_L2category.bin')

In [63]:
# Replace xgb_L2 with a fresh classifier and load the saved L2 model into it.
xgb_L2 = XGBClassifier()
xgb_L2.load_model('./XGB_models/xgb_L2category.bin')

In [64]:
# Train an XGBoost classifier (histogram tree method, CUDA device) that maps
# the training embeddings to the encoded L3 category labels.
xgb_L3 = XGBClassifier(tree_method='hist', device='cuda')
xgb_L3.fit(train_embeddings, trtgt_L3)

In [65]:
# Save the trained L3 category model to disk.
xgb_L3.save_model('./XGB_models/xgb_L3category.bin')

In [66]:
# Replace xgb_L3 with a fresh classifier and load the saved L3 model into it.
xgb_L3 = XGBClassifier()
xgb_L3.load_model('./XGB_models/xgb_L3category.bin')

In [67]:
# Train an XGBoost classifier (histogram tree method, CUDA device) that maps
# the training embeddings to the encoded L4 category labels.
xgb_L4 = XGBClassifier(tree_method='hist', device='cuda')
xgb_L4.fit(train_embeddings, trtgt_L4)

In [68]:
# Save the trained L4 category model to disk.
xgb_L4.save_model('./XGB_models/xgb_L4category.bin')

In [69]:
# Replace xgb_L4 with a fresh classifier and load the saved L4 model into it.
xgb_L4 = XGBClassifier()
xgb_L4.load_model('./XGB_models/xgb_L4category.bin')

## Prediction

In [70]:
# Encoded test-set predictions from each of the six classifiers, computed in
# the order the classifiers are listed.
enc_pred_db, enc_pred_L0, enc_pred_L1, enc_pred_L2, enc_pred_L3, enc_pred_L4 = (
    classifier.predict(test_embeddings)
    for classifier in (xgb_db, xgb_L0, xgb_L1, xgb_L2, xgb_L3, xgb_L4)
)

In [71]:
# Decode the predicted brand ids back to their original labels and display them.
le_db.inverse_transform(enc_pred_db)

array(['Covercraft', 'CafePress', 'AUTOANDART', ..., 'PHILIPS',
       'Callahan BRAKE PARTS', 'Garage-Pro'], dtype=object)

In [72]:
# Decode the predicted L0 ids back to their original labels and display them.
le_L0.inverse_transform(enc_pred_L0)

array(['Automotive', 'Automotive', 'Automotive', ..., 'Home & Kitchen',
       'Automotive', 'Office Products'], dtype=object)

In [73]:
# Decode the predicted L1 ids back to their original labels and display them.
le_L1.inverse_transform(enc_pred_L1)

array(['Exterior Accessories', 'Exterior Accessories',
       'Lights & Lighting Accessories', ..., 'Home Dcor Products',
       'Tires & Wheels', 'Office & School Supplies'], dtype=object)

In [74]:
# Decode the predicted L2 ids back to their original labels and display them.
le_L2.inverse_transform(enc_pred_L2)

array(['Towing Products & Winches', 'Bumper Stickers, Decals & Magnets',
       'Body & Trim', ..., 'Artificial Plants & Flowers',
       'Accessories & Parts', 'Envelopes, Mailers & Shipping Supplies'],
      dtype=object)

In [75]:
# Decode the predicted L3 ids back to their original labels and display them.
le_L3.inverse_transform(enc_pred_L3)

array(['Decorative Accessories', 'Bumper Stickers', 'Body', ...,
       'Dinnerware & Serveware', 'Air Filters & Accessories',
       'Golf Clubs'], dtype=object)

In [76]:
# Decode the predicted L4 ids back to their original labels and display them.
le_L4.inverse_transform(enc_pred_L4)

array(['Batteries', 'Carburetors', 'Cookie Cutters', ..., 'Basic Collars',
       'Cookie Cutters', 'Tail Light Assemblies'], dtype=object)

## Preparing for submission

In [77]:
# Empty frame that the submission columns are added to.
submission_v1 = pd.DataFrame()

In [78]:
# Start from the test ids, then add one decoded prediction column per target,
# in the order listed below.
submission_v1['indoml_id'] = test_ftrs['indoml_id']
for target_column, encoder, encoded_pred in (
    ('details_Brand', le_db, enc_pred_db),
    ('L0_category', le_L0, enc_pred_L0),
    ('L1_category', le_L1, enc_pred_L1),
    ('L2_category', le_L2, enc_pred_L2),
    ('L3_category', le_L3, enc_pred_L3),
    ('L4_category', le_L4, enc_pred_L4),
):
    submission_v1[target_column] = encoder.inverse_transform(encoded_pred)

In [79]:
# Overwrite the 'details_Brand' column with the raw 'store' field of the test features.
# NOTE(review): this replaces any brand values already present in submission_v1
# (e.g. decoded model predictions) — presumably a deliberate "store ≈ brand"
# heuristic; confirm it is intended.
submission_v1['details_Brand'] = test_ftrs['store']

In [80]:
# Write the submission table to CSV.
# NOTE(review): the frame is named submission_v1 but the file is submission_v2.csv,
# and no index= argument is passed — confirm the file name and whether the row
# index should be written as a column.
submission_v1.to_csv('./submissions/submission_v2.csv')

In [81]:
# Convert the submission table into 'records' form for line-by-line JSON export.
submission_v1_dict = submission_v1.to_dict('records')

In [82]:
import json

# Write the predictions as JSON Lines: one serialized record per line.
with open('./submissions/attribute_test_XGBv1.predict', 'w') as predict_file:
    predict_file.writelines(json.dumps(record) + '\n' for record in submission_v1_dict)

In [83]:
import zipfile

# Path of the prediction file to package.
file = './submissions/attribute_test_XGBv1.predict'

# Create the submission archive containing the prediction file.
# NOTE(review): arcname is the same relative path as the source file, so the
# entry presumably keeps its 'submissions/' directory component inside the
# archive — confirm the submission platform accepts a nested path.
with zipfile.ZipFile('./submissions/submission_XGBv1.zip', 'w') as f:
    f.write(file, arcname=file)