### Category, Brand Text classification

In [76]:
import fasttext
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [77]:
keyword_df = pd.read_csv('../data/text_classification/brand_category_classification/keyword.csv')

#### Train, Test data split

In [78]:
corpus = keyword_df['keyword'].to_list()
labels = keyword_df['label'].to_list()

with open("../data/text_classification/brand_category_classification/brand_category_classification_label.txt", "w", encoding="utf-8") as file:
    for text, label in zip(corpus, labels):
        file.write(f"__label__{label} {text}\n")

X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.3, random_state=42)

#### Train

In [79]:
model = fasttext.train_supervised(
    "../data/text_classification/brand_category_classification/brand_category_classification_label.txt",
    lr=0.5,
    epoch=7000,
    wordNgrams=1,
    ws=1,
    minCountLabel=1
)

Read 0M words
Number of words:  86924
Number of labels: 2
Progress: 100.0% words/sec/thread: 1395638 lr:  0.000000 avg.loss:  0.049501 ETA:   0h 0m 0s1395546 lr:  0.183454 avg.loss:  0.064809 ETA:   0h 2m33s


In [80]:
model.save_model("../model/text_classification/brand_category_classification/fasttext_model.bin")

#### Test

In [None]:
# 모델 평가 및 정확도 계산
y_pred = [model.predict(text)[0][0].split("__label__")[1] for text in X_test]
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

#### Predict

In [83]:
def predict(keywords):
    result = {'keyword': [], 'predicted_label': []}

    for keyword in keywords:
        predicted_label = model.predict(keyword)[0][0].split("__label__")[1]

        result['keyword'].append(keyword)
        result['predicted_label'].append(predicted_label)

    return result

In [84]:
top_query_df = pd.read_csv('../data/text_classification/brand_category_classification/top_10000_query.csv')

In [85]:
import datetime

current_time = datetime.datetime.now()

test_keywords = top_query_df['keyword'].to_list()
predict_df = pd.DataFrame.from_dict(predict(test_keywords))

predict_df.to_csv('../model/text_classification/brand_category_classification/predicts.csv')

print(f"created predicts.csv, {current_time}")

created predicts.csv, 2023-09-12 00:02:47.666222


In [86]:
predict_df

Unnamed: 0,keyword,predicted_label
0,이미스,brand
1,마뗑킴,brand
2,던스트,brand
3,키링,category
4,백팩,category
...,...,...
9995,발란,category
9996,worthwhile movement,brand
9997,18,category
9998,card wallet,category


In [None]:
# keyword_types_df = pd.read_csv('../data/text_classification/keyword_types.csv')
# # similar_keyword_df = pd.read_csv('../data/text_classification/similar_keyword.csv')
# category_df = pd.read_csv('../data/text_classification/category.csv')
# app_search_keyword_df = pd.read_csv('../data/text_classification/09_03_app_search_keyword.csv')
# brand_df = pd.read_csv('../data/text_classification/brands.csv')

In [None]:
# len(brand_df)

In [None]:
# keyword_types_df

In [None]:
# keyword_types_df[keyword_types_df['keyword'] == '스텐드오일']

### Merge Data

In [None]:
# brands = brand_df['front_brand_name_kor'].to_list()[:4130]
# brand_labels = ['brand' for _ in brands]
#
# brand_keyword_df = pd.DataFrame({'keyword': brands, 'label': brand_labels})
#
# category_keywords = [keyword for keywords in similar_keyword_df['similar_keyword'].to_list() for keyword in keywords.split(',') if keyword != '']
# category_labels = ['category' for _ in category_keywords]
#
# similar_keyword_category_df = pd.DataFrame({'keyword': category_keywords, 'label': category_labels})
#
# app_search_keyword_df = app_search_keyword_df[['keyword', 'brand_name']]
#
# app_search_keyword_df[~app_search_keyword_df['keyword'].isin(brands)]
# app_search_keyword_df = app_search_keyword_df[~app_search_keyword_df.apply(lambda row: row['brand_name'] in row['keyword'], axis=1)]
# app_search_keyword_df = app_search_keyword_df[~app_search_keyword_df.apply(lambda row: row['keyword'] in row['brand_name'], axis=1)]
# app_search_keyword_df = app_search_keyword_df[~app_search_keyword_df['keyword'].astype(str).str.isdigit()]
# app_search_keyword_df = app_search_keyword_df[~app_search_keyword_df['keyword'].str.contains(r'^[0-9!@#$%^&*()+]+$')]
# app_search_keyword_df = app_search_keyword_df[~app_search_keyword_df['keyword'].astype(str).str.match(r'^\d+\.\d+$')]
# app_search_keyword_df = app_search_keyword_df[~(app_search_keyword_df['keyword'].str.replace(r'\s+', '', regex=True) == app_search_keyword_df['brand_name'].str.replace(r'\s+', '', regex=True))]
# app_search_keyword_df = app_search_keyword_df[~app_search_keyword_df.apply(lambda row: row['brand_name'].replace(" ", "") in row['keyword'], axis=1)]
# app_search_keyword_df = app_search_keyword_df[~app_search_keyword_df.apply(lambda row: row['brand_name'].replace(" ", "") in row['keyword'].replace(" ", ""), axis=1)]
# app_search_keyword_df = app_search_keyword_df[~app_search_keyword_df['brand_name'].isin(['게스언더웨어', '24/7 시리즈', '24/7 시리즈 포 우먼', '호텔파리칠', '홈그로운 서플라이', '호와스', '헤지스골프', '헤이', '하킷', '헤라', '헤레우'])]
# app_search_keyword_df = app_search_keyword_df[~app_search_keyword_df['keyword'].isin(['호텔파리칠'])]
# app_search_keyword_df['keyword'] = app_search_keyword_df['keyword'].str.replace(r'\s+', '', regex=True)
# app_search_keyword_df = app_search_keyword_df.drop_duplicates(subset='keyword', keep='first')
#
# app_search_keywords = app_search_keyword_df['keyword'].to_list()
# app_search_keywords_labels = ['category' for _ in app_search_keywords]
#
# app_search_category_df = pd.DataFrame({'keyword': app_search_keywords, 'label': app_search_keywords_labels})
#
# keyword_df = pd.concat([keyword_types_df, similar_keyword_category_df, category_df, app_search_category_df, brand_keyword_df])
#
# keyword_df['keyword'] = keyword_df['keyword'].astype('str')
# keyword_df = keyword_df.sample(frac = 1)

### Train, Test split

In [None]:
# print(len(keyword_df[keyword_df['label'] == 'brand']))
# print(len(keyword_df[keyword_df['label'] == 'category']))

In [None]:
# # 추가된 내용
# # brand_df = keyword_df[keyword_df['label'] == 'brand'][:39811]
# # category_df = keyword_df[keyword_df['label'] == 'category'][:39811]
#
# # keyword_df = pd.concat([brand_df, category_df])
# keyword_df = keyword_df.sample(frac = 1)

In [None]:
# keyword_df[keyword_df.label.isna()]

In [None]:
# corpus = keyword_df['keyword'].to_list()
# labels = keyword_df['label'].to_list()
#
# # corpus = keyword_df['keyword'].to_list()[:30000]
# # labels = keyword_df['label'].to_list()[:30000]
#
# with open("../data/text_classification/category_brand_keyword.txt", "w", encoding="utf-8") as file:
#     for text, label in zip(corpus, labels):
#         file.write(f"__label__{label} {text}\n")
#
# X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2, random_state=42)

### Train

In [None]:
# model = fasttext.train_supervised(
#     "../data/text_classification/category_brand_keyword.txt",
#     lr=0.5,
#     epoch=7000,
#     wordNgrams=1,
#     ws=1,
#     minCountLabel=1
# )

In [None]:
# # 모델 평가 및 정확도 계산
# y_pred = [model.predict(text)[0][0].split("__label__")[1] for text in X_test]
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
# def predict(keywords):
#     result = {'keyword': [], 'predicted_label': []}
#
#     for keyword in keywords:
#         predicted_label = model.predict(keyword)[0][0].split("__label__")[1]
#
#         result['keyword'].append(keyword)
#         result['predicted_label'].append(predicted_label)
#
#         print(f"{keyword}: {predicted_label}")