<a href="https://colab.research.google.com/github/IRevan/AI/blob/master/UPAA/UPAA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

# 데이터 제너레이터 정의
class CustomDataset(Dataset):
    def __init__(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            data = [line.strip().split('\t') for line in file]
        remove_set = [['']]
        data = [i for i in data if i not in remove_set]
        self.questions = [pair[0] for pair in data]
        self.answers = [pair[0] for pair in data]

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        return self.questions[idx], self.answers[idx]

file_path = '/content/drive/MyDrive/Colab Notebooks/cleaned_output.txt'

# 전체 데이터에 대한 단어장 생성
all_tokens = [word for sentence in CustomDataset(file_path).questions + CustomDataset(file_path).answers for word in sentence.split()]
vocab = list(set(all_tokens))
word_to_index = {word: idx for idx, word in enumerate(vocab)}
index_to_word = {idx: word for idx, word in enumerate(vocab)}

# 모델 정의
class SimpleChatbot(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleChatbot, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input, lengths):
        embedded = self.embedding(input)
        packed = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        output, _ = self.rnn(packed)
        padded, _ = pad_packed_sequence(output, batch_first=True)
        output = self.fc(padded)
        return output

    def chat(self, initial_input, max_len=20):
        # 초기 입력에 대한 처리
        initial_input_tensor = torch.tensor([word_to_index[word] for word in initial_input.split()], dtype=torch.long).unsqueeze(0).cuda()
        initial_embedded = self.embedding(initial_input_tensor)

        # 초기 은닉 상태를 0으로 초기화
        hidden = torch.zeros(1, 1, hidden_size).cuda()

        # 디코더 실행
        decoder_output, _ = self.rnn(initial_embedded, hidden)

        # 예측 결과 초기화
        predicted_sequence = [word_to_index['<sos>']]

        for _ in range(max_len):
            output = self.fc(decoder_output.data)
            _, top_index = output.topk(1)
            predicted_sequence.append(top_index.item())

            # 디코더 입력 업데이트
            decoder_input = torch.tensor([top_index.item()], dtype=torch.long).unsqueeze(0).cuda()
            decoder_embedded = self.embedding(decoder_input)

            # 다음 스텝 예측을 위해 다음 히든 상태 계산
            decoder_output, _ = self.rnn(decoder_embedded, _)  # 이 부분을 수정

            # <eos>를 만나면 종료
            if top_index.item() == word_to_index['<eos>']:
                break

        # 예측된 시퀀스를 단어로 변환
        predicted_words = [index_to_word[idx] for idx in predicted_sequence]

        return predicted_words

# 하이퍼파라미터 설정
input_size = len(vocab)  # 어휘 사전 크기
hidden_size = 32
output_size = len(vocab)  # 어휘 사전 크기

# 데이터 로드 및 전처리
dataset = CustomDataset(file_path)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# 모델, 손실 함수, 최적화 함수 초기화
model = SimpleChatbot(input_size, hidden_size, output_size).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# 학습
epochs = 1000
for epoch in range(epochs):
    total_loss = 0
    for batch in dataloader:
        questions, answers = batch

        # 텍스트를 토큰 인덱스로 변환
        questions_tensor = [torch.tensor([word_to_index[word] for word in sentence.split()], dtype=torch.long).cuda() for sentence in questions]
        answers_tensor = [torch.tensor([word_to_index[word] for word in sentence.split()], dtype=torch.long).cuda() for sentence in answers]

        # 패딩을 처리하기 위해 각 시퀀스의 길이를 구합니다.
        questions_lengths = [len(seq) for seq in questions_tensor]
        answers_lengths = [len(seq) for seq in answers_tensor]

        # 패딩된 텐서 생성
        questions_padded = nn.utils.rnn.pad_sequence(questions_tensor, batch_first=True)
        answers_padded = nn.utils.rnn.pad_sequence(answers_tensor, batch_first=True)

        optimizer.zero_grad()
        output = model(questions_padded, questions_lengths)
        loss = criterion(output.view(-1, output_size), answers_padded.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    if epoch % 100 == 0:
        print(f'Epoch {epoch}/{epochs}, Loss: {total_loss}')

Epoch 0/1000, Loss: 3540.778185427189
Epoch 100/1000, Loss: 0.00012093914685351592
Epoch 200/1000, Loss: 4.793883514508934e-05
Epoch 300/1000, Loss: 4.450732864924589e-05
Epoch 400/1000, Loss: 4.203426154170131e-05
Epoch 500/1000, Loss: 4.113318623666373e-05
Epoch 600/1000, Loss: 4.104423642381505e-05
Epoch 700/1000, Loss: 4.0920939154531766e-05
Epoch 800/1000, Loss: 4.081914100240169e-05
Epoch 900/1000, Loss: 4.0481658010627086e-05


In [33]:
# 챗봇 기능 테스트
user_input = "3D프린팅"
predicted_response = model.chat(user_input)
print(f"사용자: {user_input}")
print(f"챗봇: {' '.join(predicted_response)}")

KeyError: ignored