### Category, Brand Text classification

In [1]:
import datetime
import fasttext
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
keyword_df = pd.read_csv('../data/text_classification/brand_category_classification/keyword.csv')
keyword_df['keyword'] = keyword_df['keyword'].replace(['NaN', 'nan'], pd.NA)
keyword_df['keyword'] = keyword_df['keyword'].astype('str')
keyword_df['keyword'] = keyword_df['keyword'].str.strip()

keyword_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228541 entries, 0 to 228540
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   keyword  228541 non-null  object
 1   label    228541 non-null  object
dtypes: object(2)
memory usage: 3.5+ MB


#### Train, Test data split

In [3]:
corpus = keyword_df['keyword'].to_list()
labels = keyword_df['label'].to_list()

with open("../data/text_classification/brand_category_classification/brand_category_classification_label.txt", "w", encoding="utf-8") as file:
    for text, label in zip(corpus, labels):
        file.write(f"__label__{label} {text}\n")

X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.3, random_state=42)

#### Train

In [4]:
model = fasttext.train_supervised(
    "../data/text_classification/brand_category_classification/brand_category_classification_label.txt",
    lr=0.5,
    epoch=7000,
    wordNgrams=1,
    ws=1,
    minCountLabel=1
)

Read 0M words
Number of words:  87816
Number of labels: 2
Progress: 100.0% words/sec/thread: 1363352 lr:  0.000000 avg.loss:  0.046427 ETA:   0h 0m 0s


In [5]:
model.save_model("../model/text_classification/brand_category_classification/fasttext_model.bin")

#### Test

In [6]:
def evaluate_model(keywords, labels):
    predictions = []
    for keyword in keywords:
        predicted_label = model.predict(keyword)[0][0].split("__label__")[1]
        predictions.append(0 if predicted_label == 'brand' else 1)

    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)

    return accuracy, precision, recall, f1 

In [7]:
from sklearn.preprocessing import LabelEncoder

current_time = datetime.datetime.now()

label_encoder = LabelEncoder()

y_labels = label_encoder.fit_transform(y_test)

accuracy, precision, recall, f1 = evaluate_model(X_test, y_labels)
print(f'{current_time} - 정확도: {accuracy * 100:.2f}, 정밀도: {precision * 100:.2f}, 재현율: {recall * 100:.2f}, F1 점수: {f1 * 100:.2f}')


2023-09-18 19:43:34.105794 - 정확도: 99.87, 정밀도: 99.85, 재현율: 99.89, F1 점수: 99.87


In [8]:
# 모델 평가 및 정확도 계산
y_pred = [model.predict(text)[0][0].split("__label__")[1] for text in X_test]
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 99.87%


#### Predict

In [9]:
def predict(keywords):
    result = {'keyword': [], 'predicted_label': []}

    for keyword in keywords:
        predicted_label = model.predict(keyword)[0][0].split("__label__")[1]

        result['keyword'].append(keyword)
        result['predicted_label'].append(predicted_label)

    return result

In [10]:
predict(['z플립'])

{'keyword': ['z플립'], 'predicted_label': ['category']}

In [11]:
top_query_df = pd.read_csv('../data/text_classification/brand_category_classification/top_10000_query.csv')

In [12]:
import datetime

current_time = datetime.datetime.now()

test_keywords = top_query_df['keyword'].to_list()
predict_df = pd.DataFrame.from_dict(predict(test_keywords))

predict_df.to_csv('../model/text_classification/brand_category_classification/fasttext-predicts.csv', index=False)

print(f"created fasttext-predicts.csv, {current_time}")

created fasttext-predicts.csv, 2023-09-18 19:43:34.607488


In [13]:
predict_df

Unnamed: 0,keyword,predicted_label
0,이미스,brand
1,마뗑킴,brand
2,던스트,brand
3,키링,category
4,백팩,category
...,...,...
9995,발란,category
9996,worthwhile movement,brand
9997,18,category
9998,card wallet,category


### 테스트 데이터로 지표 측정

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [15]:
test_brand_df = pd.read_csv('../data/text_classification/brand_category_classification/test/test_brand.csv')
test_brand_category_df = pd.read_csv('../data/text_classification/brand_category_classification/test/test_brand_category.csv')
test_category_df = pd.read_csv('../data/text_classification/brand_category_classification/test/test_category.csv')

test_brand_df['keyword'] = test_brand_df['keyword'].astype('str')
test_brand_category_df['keyword'] = test_brand_category_df['keyword'].astype('str')
test_category_df['keyword'] = test_category_df['keyword'].astype('str')

test_df = pd.concat([test_brand_df, test_category_df])

In [16]:
corpus = test_df['keyword'].to_list()
labels = test_df['label'].to_list()

In [17]:
def evaluate_model(keywords, labels):
    predictions = []
    for keyword in keywords:
        predicted_label = model.predict(keyword)[0][0].split("__label__")[1]
        predictions.append(0 if predicted_label == 'brand' else 1)

    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)

    return accuracy, precision, recall, f1 

In [18]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)


In [19]:
accuracy, precision, recall, f1 = evaluate_model(corpus, labels)
print(f'{current_time} - 정확도: {accuracy * 100:.2f}, 정밀도: {precision * 100:.2f}, 재현율: {recall * 100:.2f}, F1 점수: {f1 * 100:.2f}')

2023-09-18 19:43:34.607488 - 정확도: 97.84, 정밀도: 93.75, 재현율: 98.25, F1 점수: 95.95


In [20]:
model.get_input_matrix().shape

(87816, 100)