In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset

In [2]:
# Load the five raw CSV sources in one pass. The order of the path tuple must
# match the order of the names on the left-hand side of the unpacking.
(
    keyword_types_df,
    similar_keyword_df,
    category_df,
    app_search_keyword_df,
    brand_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/text_classification/keyword_types.csv',
        '../data/text_classification/similar_keyword.csv',
        '../data/text_classification/category.csv',
        '../data/text_classification/09_03_app_search_keyword.csv',
        '../data/text_classification/brands.csv',
    )
)

In [3]:
# Build the labelled keyword corpus (`keyword_df`) from the raw frames loaded above.
#
# The raw frames are only read here, never rebound, so this cell can be re-run
# and produces the same result each time (the shuffle is seeded).

# Brands whose app-search rows are dropped wholesale, and individual keywords to drop.
EXCLUDED_BRAND_NAMES = ['게스언더웨어', '24/7 시리즈', '24/7 시리즈 포 우먼', '호텔파리칠', '홈그로운 서플라이', '호와스', '헤지스골프', '헤이', '하킷', '헤라', '헤레우']
EXCLUDED_KEYWORDS = ['호텔파리칠']
# Number of leading rows of the brand table to use.
# NOTE(review): the rationale for 4130 is not visible in this notebook - confirm.
NUM_BRANDS = 4130
SHUFFLE_SEED = 42


def _mentions_brand(keyword, brand_name):
    """Return True when a search keyword and its brand name overlap as substrings.

    Covers all substring checks in one pass: brand inside keyword, keyword
    inside brand, and the brand with spaces removed inside the keyword (with
    and without the keyword's own spaces).
    """
    brand_compact = brand_name.replace(" ", "")
    return (
        brand_name in keyword
        or keyword in brand_name
        or brand_compact in keyword
        or brand_compact in keyword.replace(" ", "")
    )


# --- 1. Category keywords from the comma-separated similar-keyword lists ---
# dropna() guards against missing cells, which would otherwise fail on .split().
category_keywords = [
    keyword
    for keywords in similar_keyword_df['similar_keyword'].dropna().astype(str)
    for keyword in keywords.split(',')
    if keyword != ''
]
category_labels = ['category'] * len(category_keywords)

similar_keyword_category_df = pd.DataFrame({'keyword': category_keywords, 'label': category_labels})

# --- 2. App-search keywords that are NOT brand-like are treated as categories ---
# Rows with a missing keyword/brand cannot be substring-tested, so they are dropped.
search_df = app_search_keyword_df[['keyword', 'brand_name']].dropna().astype(str)
keyword_col = search_df['keyword']
brand_col = search_df['brand_name']

# zip() instead of DataFrame.apply(axis=1): one pass, and well-defined on an empty frame.
brand_overlap = pd.Series(
    [_mentions_brand(keyword, brand_name) for keyword, brand_name in zip(keyword_col, brand_col)],
    index=search_df.index,
    dtype=bool,
)
# Keywords made only of digits/symbols, or decimal numbers such as "23.5".
numeric_or_symbolic = (
    keyword_col.str.isdigit()
    | keyword_col.str.contains(r'^[0-9!@#$%^&*()+]+$')
    | keyword_col.str.match(r'^\d+\.\d+$')
)
same_ignoring_whitespace = (
    keyword_col.str.replace(r'\s+', '', regex=True) == brand_col.str.replace(r'\s+', '', regex=True)
)
blocklisted = brand_col.isin(EXCLUDED_BRAND_NAMES) | keyword_col.isin(EXCLUDED_KEYWORDS)

drop_mask = brand_overlap | numeric_or_symbolic | same_ignoring_whitespace | blocklisted
# .copy() so the column assignment below writes to an independent frame.
app_search_filtered_df = search_df[~drop_mask].copy()
app_search_filtered_df['keyword'] = app_search_filtered_df['keyword'].str.replace(r'\s+', '', regex=True)
app_search_filtered_df = app_search_filtered_df.drop_duplicates(subset='keyword', keep='first')

app_search_keywords = app_search_filtered_df['keyword'].to_list()
app_search_keywords_labels = ['category'] * len(app_search_keywords)

app_search_category_df = pd.DataFrame({'keyword': app_search_keywords, 'label': app_search_keywords_labels})

# --- 3. Brand names ---
brands = brand_df['front_brand_name_kor'].to_list()[:NUM_BRANDS]
brand_labels = ['brand'] * len(brands)

brand_keyword_df = pd.DataFrame({'keyword': brands, 'label': brand_labels})

# --- 4. Combine, clean and shuffle ---
# NOTE(review): keyword_types_df and category_df are assumed to carry
# 'keyword' and 'label' columns like the frames built above - confirm.
keyword_df = pd.concat([keyword_types_df, similar_keyword_category_df, category_df, app_search_category_df, brand_keyword_df])

# Duplicates across sources are kept (no drop_duplicates step is applied here).
# Missing keywords are removed first; otherwise astype('str') would turn them
# into the literal token 'nan'.
keyword_df = (
    keyword_df
    .dropna(subset=['keyword'])
    .assign(keyword=lambda frame: frame['keyword'].astype('str'))
    .sample(frac=1, random_state=SHUFFLE_SEED)
)

print(len(brands))
print(len(keyword_df[keyword_df['label'] == 'brand']))
print(len(keyword_df[keyword_df['label'] == 'category']))

In [4]:
# Cap each class at MAX_PER_CLASS rows and reshuffle.
#
# The per-class subsets get their own names instead of rebinding `brand_df` /
# `category_df`: those names hold the raw CSV frames that the corpus-building
# cell reads, and overwriting them makes that cell fail or change on re-run.
MAX_PER_CLASS = 30000

brand_subset_df = keyword_df[keyword_df['label'] == 'brand'].head(MAX_PER_CLASS)
category_subset_df = keyword_df[keyword_df['label'] == 'category'].head(MAX_PER_CLASS)

# Seeded shuffle so the row order (and anything derived from it) is reproducible.
keyword_df = pd.concat([brand_subset_df, category_subset_df]).sample(frac=1, random_state=42)

print(len(keyword_df))

60000


In [5]:
class CustomDataset(Dataset):
    """Map-style dataset that turns raw strings into fixed-length index tensors.

    Args:
        texts: indexable collection of strings.
        labels: indexable collection of integer class ids, aligned with ``texts``.
        char_to_idx: mapping from a single character to its integer index.
            Characters missing from the mapping are skipped.
        max_seq_length: exact length of every returned index sequence.
    """

    def __init__(self, texts, labels, char_to_idx, max_seq_length):
        self.texts = texts
        self.labels = labels
        self.char_to_idx = char_to_idx
        self.max_seq_length = max_seq_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(char_index_tensor, label_tensor)`` for sample ``idx``.

        The index tensor always has exactly ``max_seq_length`` elements:
        longer texts are truncated, shorter ones are right-padded with 0.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        char_indices = [self.char_to_idx[c] for c in text if c in self.char_to_idx]
        # Truncate first: without this, an over-long text yields a longer
        # tensor than its neighbours and the default collate cannot stack them.
        char_indices = char_indices[:self.max_seq_length]
        char_indices += [0] * (self.max_seq_length - len(char_indices))

        # Explicit dtype keeps the tensor integral even when the list is empty.
        return torch.tensor(char_indices, dtype=torch.long), torch.tensor(label)

class CharCNN(nn.Module):
    """Character-level CNN classifier with four parallel convolution branches.

    Each branch convolves the embedded sequence independently (kernel sizes
    7, 7, 3, 3), is ReLU-activated and max-pooled over the whole time axis;
    the four pooled vectors are concatenated and fed to a two-layer head.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding size; also the input channel count of every branch.
        num_classes: number of output logits.
        num_filters: output channels per convolution branch (default 512).
        hidden_dim: width of the hidden fully-connected layer (default 1024).

    Note:
        No padding is applied in the convolutions, so the input sequence must
        be at least 7 steps long.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, num_filters=512, hidden_dim=1024):
        super().__init__()
        # Original author's note: the paper uses 256-dimensional embeddings.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # In-channels follow embed_dim so the model works for any embedding size.
        self.conv1 = nn.Conv1d(embed_dim, num_filters, kernel_size=7, padding=0)  # kernel size 7
        self.conv2 = nn.Conv1d(embed_dim, num_filters, kernel_size=7, padding=0)
        self.conv3 = nn.Conv1d(embed_dim, num_filters, kernel_size=3, padding=0)  # kernel size 3
        self.conv4 = nn.Conv1d(embed_dim, num_filters, kernel_size=3, padding=0)
        # forward() concatenates four branches, so the head sees 4 * num_filters features.
        self.fc1 = nn.Linear(4 * num_filters, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: integer tensor of shape ``(batch, seq_len)`` holding character indices.

        Returns:
            Float tensor of shape ``(batch, num_classes)`` with unnormalised logits.
        """
        x = self.embedding(x)       # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)      # Conv1d expects (batch, channels, seq_len)
        pooled = []
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            features = F.relu(conv(x))
            # Global max pooling over the time axis -> (batch, num_filters).
            pooled.append(F.max_pool1d(features, features.size(2)).squeeze(2))
        x = torch.cat(pooled, 1)    # (batch, 4 * num_filters)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return x

In [6]:
# --- Model / training hyperparameters ---
vocab_size = 128        # rows in the embedding table; must exceed the largest index in char_to_idx
embed_dim = 512         # character embedding size
num_classes = 2         # number of target classes (the labels used in this notebook are 'brand' and 'category')
max_seq_length = 100    # every input is padded to this many character indices
learning_rate = 0.01
batch_size = 4096
num_epochs = 100

# Pick the best available accelerator. `device` is always a torch.device
# (never a bare string), so downstream code can rely on its type.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [7]:
# Encode labels, split the data, build the character vocabulary and the loaders.
label_encoder = LabelEncoder()
label_names = keyword_df['label'].to_list()
label_encoder.fit(label_names)

texts = [keyword.replace(" ", "") for keyword in keyword_df['keyword'].to_list()]
labels = label_encoder.transform(label_names)

# Seeded, stratified split: reproducible and keeps the class ratio in both parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# Map characters to indices, using the characters that actually occur in the
# training texts. An ASCII-only table (chr(0)..chr(127)) would silently drop
# every Hangul character, leaving Korean keywords as all-padding inputs.
# Index 0 is reserved for padding, so real characters start at 1; characters
# unseen in training are skipped by the dataset / predict code.
char_to_idx = {char: idx for idx, char in enumerate(sorted(set("".join(train_texts))), start=1)}
# The embedding table must cover every index above plus the padding slot, so
# the value from the hyperparameter cell is overridden here before the model
# is constructed.
vocab_size = len(char_to_idx) + 1

train_dataset = CustomDataset(train_texts, train_labels, char_to_idx, max_seq_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = CustomDataset(val_texts, val_labels, char_to_idx, max_seq_length)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Early-stopping bookkeeping.
best_val_loss = float('inf')
patience = 10
counter = 0

In [None]:
# Train the classifier and report per-epoch training loss, validation loss
# and validation accuracy.
model = CharCNN(vocab_size, embed_dim, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss_sum = 0.0
    train_seen = 0
    # Batch variables carry a `batch_` prefix so they do not overwrite the
    # notebook-level `labels` array created when the data was encoded.
    for batch_inputs, batch_labels in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        batch_logits = model(batch_inputs)
        batch_loss = criterion(batch_logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch mean is exact even when the last
        # batch is smaller than the others.
        train_loss_sum += batch_loss.item() * batch_labels.size(0)
        train_seen += batch_labels.size(0)

    # ---- validation pass ----
    model.eval()
    val_loss_sum = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_inputs, batch_labels in val_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            batch_logits = model(batch_inputs)
            val_loss_sum += criterion(batch_logits, batch_labels).item() * batch_labels.size(0)
            predicted = batch_logits.argmax(dim=1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()

    # max(..., 1) keeps the report well-defined if a loader is empty.
    train_loss = train_loss_sum / max(train_seen, 1)
    val_loss = val_loss_sum / max(total, 1)
    accuracy = 100 * correct / max(total, 1)

    # Report the epoch means; previously the value labelled "loss" was the loss
    # of the last validation batch only.
    print(
        f'epoch [{epoch + 1}/{num_epochs}] train loss: {train_loss:.4f} '
        f'val loss: {val_loss:.4f} val acc: {accuracy:.2f}%'
    )

    # NOTE: early stopping / best-checkpoint saving is intentionally left
    # disabled, so `best_val_loss`, `patience` and `counter` are unused here.
    # To enable it: save model.state_dict() to 'best_model.pth' and reset
    # `counter` whenever `val_loss` improves on `best_val_loss`, otherwise
    # increment `counter` and break once it reaches `patience`; reload the
    # saved weights after the loop.

In [None]:
def evaluate_model(model, val_loader):
    """Score ``model`` on every batch of ``val_loader``.

    The model is switched to eval mode, predictions are taken as the arg-max
    logit per sample, and the collected predictions are compared with the
    true labels using scikit-learn's default (binary) metric settings.

    Args:
        model: module returning a ``(batch, num_classes)`` tensor of logits.
        val_loader: iterable yielding ``(inputs, labels)`` batches.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    model.eval()
    predicted_ids, target_ids = [], []

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            logits = model(inputs)
            # torch.max over dim 1 returns (values, indices); keep the indices.
            batch_predictions = torch.max(logits.data, 1)[1]
            predicted_ids += batch_predictions.tolist()
            target_ids += labels.tolist()

    return (
        accuracy_score(target_ids, predicted_ids),
        precision_score(target_ids, predicted_ids),
        recall_score(target_ids, predicted_ids),
        f1_score(target_ids, predicted_ids),
    )

# Evaluate on the validation loader and print the four metrics
# (accuracy, precision, recall, F1) rounded to two decimals.
validation_metrics = evaluate_model(model, val_loader)
accuracy, precision, recall, f1 = validation_metrics
print(f'정확도: {accuracy:.2f}, 정밀도: {precision:.2f}, 재현율: {recall:.2f}, F1 점수: {f1:.2f}')


In [None]:
def predict_text(text, model, char_to_idx, max_seq_length):
    """Predict the class id of a single string.

    Args:
        text: the string to classify.
        model: module mapping a ``(1, max_seq_length)`` integer tensor to
            ``(1, num_classes)`` logits.
        char_to_idx: mapping from character to integer index; characters not
            in the mapping are skipped.
        max_seq_length: exact length of the index sequence given to the model;
            longer inputs are truncated, shorter ones right-padded with 0.

    Returns:
        The integer index of the highest-scoring class.
    """
    char_indices = [char_to_idx[c] for c in text if c in char_to_idx]
    # Truncate so the input length is always exactly max_seq_length.
    char_indices = char_indices[:max_seq_length]
    char_indices += [0] * (max_seq_length - len(char_indices))

    model.eval()

    # Run on whatever device the model lives on instead of relying on a
    # notebook-level global; fall back to CPU for parameter-free modules.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    inputs = torch.tensor(char_indices, dtype=torch.long).unsqueeze(0)
    inputs = inputs.to(target_device)

    with torch.no_grad():
        outputs = model(inputs)

    return outputs.argmax(dim=1).item()


# Spot-check the trained model on a few hand-picked keywords.
# A dedicated name is used so the notebook-level `texts` list (the full
# corpus built when the data was encoded) is not overwritten.
sample_texts = ["이십삼점오", "니트", "바람막이", "후드티", "블랭크룸", "가디건", "키링", "모자", "후드집업", "오패"]

for sample_text in sample_texts:
    predicted_class = predict_text(sample_text, model, char_to_idx, max_seq_length)
    # Decode through the fitted encoder rather than hard-coding which integer
    # means 'category', so the printed name always matches the training labels.
    predicted_label = label_encoder.inverse_transform([predicted_class])[0]
    print(f"{sample_text}: {predicted_label}")