### Category, Brand Text classification

In [25]:
import fasttext
import pandas as pd
import torch
from konlpy.tag import Okt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset
from torchtext.vocab import build_vocab_from_iterator
from nltk import ngrams

In [26]:
keyword_types_df = pd.read_csv('../data/text_classification/keyword_types.csv')
similar_keyword_df = pd.read_csv('../data/text_classification/similar_keyword.csv')

### Merge Data

In [27]:
print(len(keyword_types_df[keyword_types_df['label'] == 'brand']))
print(len(keyword_types_df[keyword_types_df['label'] == 'category']))

35681
4705


In [28]:
category_keywords = [keyword for keywords in similar_keyword_df['similar_keyword'].to_list() for keyword in keywords.split(',') if keyword != '']
category_labels = ['category' for _ in category_keywords]

category_df = pd.DataFrame({'keyword': category_keywords, 'label': category_labels})

keyword_df = pd.concat([keyword_types_df, category_df])


# keyword_df.drop_duplicates(inplace=True)
keyword_df['keyword'] = keyword_df['keyword'].astype('str')
keyword_df = keyword_df.sample(frac = 1)

print(len(keyword_df[keyword_df['label'] == 'brand']))
print(len(keyword_df[keyword_df['label'] == 'category']))


35681
40039


### Train, Test split

In [29]:
corpus = keyword_df['keyword'].to_list()
labels = keyword_df['label'].to_list()

with open("../data/text_classification/category_brand_keyword.txt", "w", encoding="utf-8") as file:
    for text, label in zip(corpus, labels):
        file.write(f"__label__{label} {text}\n")

X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2, random_state=42)

In [33]:
model = fasttext.train_supervised("category_brand_keyword.txt", lr=0.5, epoch=1000, wordNgrams=1, ws=1)

Read 0M words
Number of words:  40634
Number of labels: 2
Progress:  99.9% words/sec/thread: 1324050 lr:  0.000544 avg.loss:  0.008020 ETA:   0h 0m 0s

Accuracy: 99.97%


Progress: 100.0% words/sec/thread: 1318389 lr:  0.000000 avg.loss:  0.008012 ETA:   0h 0m 0s


In [34]:
# 모델 평가 및 정확도 계산
y_pred = [model.predict(text)[0][0].split("__label__")[1] for text in X_test]
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 99.97%


In [42]:
# 새로운 텍스트에 대한 예측
new_texts = ["이십삼점오", "니트", "바람막이", "후드티", "블랭크룸", "가디건", "키링", "모자", "후드집업"]

for new_text in new_texts:
    predicted_label = model.predict(new_text)[0][0].split("__label__")[1]
    print(f"{new_text}: {predicted_label}")

이십삼점오: brand
니트: category
바람막이: category
후드티: brand
블랭크룸: brand
가디건: category
키링: category
모자: category
후드집업: category
