In [1]:
import datetime
import pandas as pd
import torch

In [2]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)
else:
    print("MPS device not found.")

device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"
device

tensor([1.], device='mps:0')


device(type='mps')

In [16]:
all_products_df = pd.read_json('../../data/products.json')
all_products_df.fillna("",inplace=True)

products_df = all_products_df[all_products_df['item_name'] != '']
products_df['all_keywords'] = products_df['item_name']

products_df = products_df[:500]

In [4]:
category_small_codes = [category_small_code[0] for category_small_code in products_df['category_small_code'].tolist()]

500

In [5]:
category_names = products_df['category_small_name'].tolist()

all_categories = []
for category_name in category_names:
    categories = []
    for name in category_name:
        if name is not None:
            categories.append(name)

    all_categories.append(categories)

category_small_names = [category_small_name for category_small_name in all_categories]
len(category_small_names)

500

In [22]:
labels = list(dict.fromkeys(category_small_codes))
label_dict = {code: {'label': labels.index(code), 'category_name': name} for code, name in zip(category_small_codes, category_small_names)}

In [7]:
train_texts = products_df['all_keywords'].to_list()
train_labels = [label_dict[category_small_code]['label'] for category_small_code in category_small_codes]

500

In [8]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [9]:
from transformers import AutoTokenizer, GPT2ForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialogRPT-updown")
model = GPT2ForSequenceClassification.from_pretrained("microsoft/DialogRPT-updown", num_labels=len(train_labels), ignore_mismatched_sizes=True).to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at microsoft/DialogRPT-updown and are newly initialized because the shapes did not match:
- score.weight: found shape torch.Size([1, 1024]) in the checkpoint and torch.Size([500, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from torch.utils.data import DataLoader

train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [11]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [12]:
from tqdm import tqdm

epochs = 25

for epoch in range(epochs):
    model.train()
    average_loss = 0
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Average Loss: {average_loss}")

Epoch 1: 100%|██████████| 32/32 [01:44<00:00,  3.27s/it]


Average Loss: 6.077704086899757


Epoch 2: 100%|██████████| 32/32 [01:45<00:00,  3.28s/it]


Average Loss: 5.315264269709587


Epoch 3: 100%|██████████| 32/32 [01:37<00:00,  3.03s/it]


Average Loss: 4.279220201075077


Epoch 4: 100%|██████████| 32/32 [01:38<00:00,  3.09s/it]


Average Loss: 3.6631169989705086


Epoch 5: 100%|██████████| 32/32 [01:37<00:00,  3.05s/it]


Average Loss: 3.2592661306262016


Epoch 6: 100%|██████████| 32/32 [01:40<00:00,  3.15s/it]


Average Loss: 2.8987045735120773


Epoch 7: 100%|██████████| 32/32 [01:49<00:00,  3.44s/it]


Average Loss: 2.5548619963228703


Epoch 8: 100%|██████████| 32/32 [01:49<00:00,  3.42s/it]


Average Loss: 2.230925478041172


Epoch 9: 100%|██████████| 32/32 [01:45<00:00,  3.30s/it]


Average Loss: 1.8484684564173222


Epoch 10: 100%|██████████| 32/32 [01:44<00:00,  3.28s/it]


Average Loss: 1.3650821465998888


Epoch 11: 100%|██████████| 32/32 [01:42<00:00,  3.21s/it]


Average Loss: 1.0240700729191303


Epoch 12: 100%|██████████| 32/32 [01:36<00:00,  3.03s/it]


Average Loss: 0.761782986111939


Epoch 13: 100%|██████████| 32/32 [01:35<00:00,  2.99s/it]


Average Loss: 0.5757962833158672


Epoch 14: 100%|██████████| 32/32 [01:38<00:00,  3.09s/it]


Average Loss: 0.4340930241160095


Epoch 15: 100%|██████████| 32/32 [01:49<00:00,  3.43s/it]


Average Loss: 0.34720771899446845


Epoch 16: 100%|██████████| 32/32 [01:48<00:00,  3.41s/it]


Average Loss: 0.2735220920294523


Epoch 17: 100%|██████████| 32/32 [01:55<00:00,  3.60s/it]


Average Loss: 0.21409555291756988


Epoch 18: 100%|██████████| 32/32 [01:55<00:00,  3.62s/it]


Average Loss: 0.20047709555365145


Epoch 19: 100%|██████████| 32/32 [01:53<00:00,  3.53s/it]


Average Loss: 0.15321434324141592


Epoch 20: 100%|██████████| 32/32 [02:03<00:00,  3.86s/it]


Average Loss: 0.14670632465276867


Epoch 21: 100%|██████████| 32/32 [02:00<00:00,  3.77s/it]


Average Loss: 0.13331855786964297


Epoch 22: 100%|██████████| 32/32 [02:06<00:00,  3.95s/it]


Average Loss: 0.11739905725698918


Epoch 23: 100%|██████████| 32/32 [02:03<00:00,  3.87s/it]


Average Loss: 0.08476958971004933


Epoch 24: 100%|██████████| 32/32 [02:02<00:00,  3.82s/it]


Average Loss: 0.08015372406225652


Epoch 25: 100%|██████████| 32/32 [02:06<00:00,  3.94s/it]

Average Loss: 0.0653611786547117





In [20]:
def predict(sentence_dict):
    for sentence in sentence_dict.keys():
        inputs = tokenizer(sentence, return_tensors="pt").to(device)
        ground_truth = sentence_dict[sentence]

        with torch.no_grad():
            logits = model(**inputs).logits

        predicted_class_id = logits.argmax().item()

        for key in label_dict.keys():
            if label_dict[key].get('label') == predicted_class_id:
                print(f"sentence: {sentence}\npredict: {label_dict[key].get('category_name')[0]}\nground_truth: {ground_truth}")
        print()

In [21]:
sentence_dict = {
    "amour velvet double ribbon hair claw middle ver": '헤어핀',
    "classic mink cubic pin beige": '헤어핀',
    'cubic metal ball mix velvet headband': '머리띠',
    'Hand Stitch Beanie Black': '비니'
}

predict(sentence_dict)

sentence: amour velvet double ribbon hair claw middle ver
predict: 헤어핀
ground_truth: 헤어핀

sentence: classic mink cubic pin beige
predict: 헤어핀
ground_truth: 헤어핀

sentence: cubic metal ball mix velvet headband
predict: 머리띠,밴드
ground_truth: 머리띠

sentence: Hand Stitch Beanie Black
predict: 비니
ground_truth: 비니

