In [1]:
import datetime
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset

In [2]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [3]:
keyword_df = pd.read_csv('../data/text_classification/brand_category_classification/keyword.csv')
keyword_df['keyword'] = keyword_df['keyword'].replace(['NaN', 'nan'], pd.NA)
keyword_df['keyword'] = keyword_df['keyword'].astype('str')
keyword_df['keyword'] = keyword_df['keyword'].str.strip()

keyword_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228541 entries, 0 to 228540
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   keyword  228541 non-null  object
 1   label    228541 non-null  object
dtypes: object(2)
memory usage: 3.5+ MB


In [4]:
max_length = keyword_df['keyword'].apply(len).max()

print(f'max length: {max_length}')

max length: 59


In [5]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, char_to_idx, max_seq_length):
        self.texts = texts
        self.labels = labels
        self.char_to_idx = char_to_idx
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        try:
            char_indices = [self.char_to_idx[c] for c in text if c in self.char_to_idx]
            char_indices += [0] * (self.max_seq_length - len(char_indices))
        except:
            print(f"idx: {idx}")
            print(f"text: {text}")

        return torch.tensor(char_indices), torch.tensor(label)

class CharCNN(nn.Module):
    def __init__(self, num_features, embed_dim, num_classes):
        super(CharCNN, self).__init__()
        self.embedding = nn.Embedding(num_features, embed_dim)
        
        self.conv1 = nn.Conv1d(embed_dim, 128, kernel_size=7, padding=0)
        self.bn1 = nn.BatchNorm1d(128)
        self.conv2 = nn.Conv1d(embed_dim, 128, kernel_size=7, padding=0)
        self.bn2 = nn.BatchNorm1d(128)
        self.conv3 = nn.Conv1d(embed_dim, 128, kernel_size=3, padding=0)
        self.bn3 = nn.BatchNorm1d(128)
        self.conv4 = nn.Conv1d(embed_dim, 128, kernel_size=3, padding=0)
        self.bn4 = nn.BatchNorm1d(128)
        
        self.fc1 = nn.Linear(512, 1024)
        self.fc2 = nn.Linear(1024, num_classes)
        
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x1 = F.relu(self.bn1(self.conv1(x)))
        x2 = F.relu(self.bn2(self.conv2(x)))
        x3 = F.relu(self.bn3(self.conv3(x)))
        x4 = F.relu(self.bn4(self.conv4(x)))
        x1 = F.max_pool1d(x1, x1.size(2)).squeeze(2)
        x2 = F.max_pool1d(x2, x2.size(2)).squeeze(2)
        x3 = F.max_pool1d(x3, x3.size(2)).squeeze(2)
        x4 = F.max_pool1d(x4, x4.size(2)).squeeze(2)
        x = torch.cat((x1, x2, x3, x4), 1)
        x = F.relu(self.fc1(x))
        
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x

In [6]:
batch_size = 32

In [7]:
label_encoder = LabelEncoder()
label_encoder.fit(keyword_df['label'].to_list())

texts = keyword_df['keyword'].to_list()
labels = label_encoder.transform(keyword_df['label'].to_list())

charset = set("".join(texts))
num_features = len(charset)

num_features

1903

In [8]:
char_to_idx = {char: idx for idx, char in enumerate(charset)}

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

train_dataset = CustomDataset(train_texts, train_labels, char_to_idx, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = CustomDataset(val_texts, val_labels, char_to_idx, max_length)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

test_dataset = CustomDataset(test_texts, test_labels, char_to_idx, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

best_val_loss = float('inf')
patience = 10
counter = 0

In [9]:
embed_dim = 128
num_classes = 2
learning_rate = 0.01
num_epochs = 30

device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"
# device="cpu"

In [10]:
start_time = datetime.datetime.now()

print(f'start: {start_time}')

model = CharCNN(num_features, embed_dim, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), '../model/text_classification/brand_category_classification/char-cnn2_emb_dim_128.model')
        counter = 0
    else:
        counter += 1

    accuracy = 100 * correct / total
    current_time = datetime.datetime.now()
    print(f'{current_time} - epoch [{epoch + 1}/{num_epochs}] loss: {loss.item():.4f} val acc: {accuracy:.2f}%')

    if counter >= patience:
        print("early stop")
        break

end_time = datetime.datetime.now()
print(f'end: {end_time}')

elapsed_time = start_time - start_time
print(f'elapsed: {elapsed_time}')

model.load_state_dict(torch.load('../model/text_classification/brand_category_classification/char-cnn2_emb_dim_128.model'))
# torch.save(model.state_dict(), '../model/text_classification/brand_category_classification/char-cnn2.model')

print(f'model saved')

start: 2023-09-27 12:33:36.554880
2023-09-27 12:36:08.256139 - epoch [1/30] loss: 0.0294 val acc: 96.51%
2023-09-27 12:38:38.899432 - epoch [2/30] loss: 0.0167 val acc: 96.93%
2023-09-27 12:41:10.864629 - epoch [3/30] loss: 0.0045 val acc: 96.95%
2023-09-27 12:43:38.315852 - epoch [4/30] loss: 0.0267 val acc: 97.13%
2023-09-27 12:46:06.181445 - epoch [5/30] loss: 0.0135 val acc: 97.29%
2023-09-27 12:48:31.989002 - epoch [6/30] loss: 0.0199 val acc: 97.20%
2023-09-27 12:50:56.623061 - epoch [7/30] loss: 0.0247 val acc: 97.56%
2023-09-27 12:53:24.849731 - epoch [8/30] loss: 0.0253 val acc: 97.55%
2023-09-27 12:55:46.389853 - epoch [9/30] loss: 0.0160 val acc: 97.49%
2023-09-27 12:58:00.652078 - epoch [10/30] loss: 0.0084 val acc: 97.60%
2023-09-27 13:00:14.891079 - epoch [11/30] loss: 0.0100 val acc: 97.67%
2023-09-27 13:02:29.043726 - epoch [12/30] loss: 0.0091 val acc: 97.70%
2023-09-27 13:04:45.627926 - epoch [13/30] loss: 0.0258 val acc: 97.41%
2023-09-27 13:07:04.328352 - epoch [14/

In [11]:
def evaluate_model(model, val_loader):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            all_predictions.extend(predicted.tolist())
            all_labels.extend(labels.tolist())

    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)

    return accuracy, precision, recall, f1

In [12]:
current_time = datetime.datetime.now()

accuracy, precision, recall, f1 = evaluate_model(model, val_loader)
print(f'{current_time} - 정확도: {accuracy * 100:.2f}, 정밀도: {precision * 100:.2f}, 재현율: {recall * 100:.2f}, F1 점수: {f1 * 100:.2f}')


2023-09-27 13:13:48.932611 - 정확도: 97.56, 정밀도: 99.08, 재현율: 95.98, F1 점수: 97.51


In [13]:
def predict_text(text, model, char_to_idx, max_seq_length):
    char_indices = [char_to_idx[c] for c in text if c in char_to_idx]
    char_indices += [0] * (max_seq_length - len(char_indices))
    
    model.eval()
    
    inputs = torch.tensor(char_indices).unsqueeze(0)
    inputs = inputs.to(device)
    
    with torch.no_grad():
        outputs = model(inputs)

    _, predicted = torch.max(outputs.data, 1)
    
    return predicted.item()

current_time = datetime.datetime.now()

texts = ["이십삼점오", "니트", "바람막이", "후드티", "블랭크룸", "가디건", "키링", "모자", "후드집업", "오패"]

for text in texts:
    predicted_class = predict_text(text, model, char_to_idx, max_length)
    print(f"{current_time} - {text}: {'category' if predicted_class == 1 else 'brand'}")

2023-09-27 13:14:00.357044 - 이십삼점오: brand
2023-09-27 13:14:00.357044 - 니트: category
2023-09-27 13:14:00.357044 - 바람막이: category
2023-09-27 13:14:00.357044 - 후드티: category
2023-09-27 13:14:00.357044 - 블랭크룸: brand
2023-09-27 13:14:00.357044 - 가디건: category
2023-09-27 13:14:00.357044 - 키링: category
2023-09-27 13:14:00.357044 - 모자: category
2023-09-27 13:14:00.357044 - 후드집업: category
2023-09-27 13:14:00.357044 - 오패: brand


In [15]:
top_query_df = pd.read_csv('../data/text_classification/brand_category_classification/top_10000_query.csv')

test_keywords = top_query_df['검색어'].to_list()
result = {'keyword': [], 'predicted_label': []}
all_predictions = []

for text in test_keywords:
    predicted_class = predict_text(text, model, char_to_idx, max_length)
    all_predictions.append(predicted_class)

    result['keyword'].append(text)
    result['predicted_label'].append('category' if predicted_class == 1 else 'brand')

current_time = datetime.datetime.now()
    
predict_df = pd.DataFrame.from_dict(result)
predict_df.to_csv('../model/text_classification/brand_category_classification/char-cnn2_emb_dim_128_predicts.csv', index=False)

print(f"created predicts.csv, {current_time}")

created predicts.csv, 2023-09-27 13:24:14.348224


### 테스트 데이터

In [17]:
label_encoder = LabelEncoder()
label_encoder.fit(top_query_df['Type'].to_list())

test_labels = label_encoder.fit_transform(top_query_df['Type'].to_list())

accuracy = accuracy_score(test_labels, all_predictions)
precision = precision_score(test_labels, all_predictions)
recall = recall_score(test_labels, all_predictions)
f1 = f1_score(test_labels, all_predictions)

print(f'Top 10,000 query에 대한 정확도: {accuracy:.2f}, 정밀도: {precision:.2f}, 재현율: {recall:.2f}')

Top 10,000 query에 대한 정확도: 0.93, 정밀도: 0.93, 재현율: 0.78
