데이터 불러오기

In [28]:
!unzip '/content/drive/MyDrive/new_york_times.zip' -d new_york_times


  inflating: new_york_times/CommentsApril2018.csv  
  inflating: new_york_times/CommentsFeb2017.csv  
  inflating: new_york_times/CommentsFeb2018.csv  
  inflating: new_york_times/CommentsJan2017.csv  
  inflating: new_york_times/CommentsJan2018.csv  
  inflating: new_york_times/CommentsMarch2017.csv  
  inflating: new_york_times/CommentsMarch2018.csv  
  inflating: new_york_times/CommentsMay2017.csv  


In [29]:
import glob

In [30]:
files = glob.glob('./new_york_times/*.csv')
files[0]

'./new_york_times/ArticlesFeb2017.csv'

In [31]:
import pandas as pd
import string

In [32]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [33]:
df = pd.read_csv(files[0])
df.headline[:6]

0    N.F.L. vs. Politics Has Been Battle All Season...
1                               Voice. Vice. Veracity.
2                          A Stand-Up’s Downward Slide
3              New York Today: A Groundhog Has Her Day
4                 A Swimmer’s Communion With the Ocean
5                                       Trail Activity
Name: headline, dtype: object

In [34]:
# 특수문자 제거(string.punctuation) -> 각 단어마다 고유수자를 numbering(dictionary)

In [35]:
df.headline[11]

'Questions for: ‘On Alaska’s Coldest Days, a Village Draws Close for Warmth’'

In [36]:
temp = "".join(chr for chr in df.headline[11] if chr not in string.punctuation  )

In [37]:
temp_dic = {}
for v in temp.split():
  if v not in temp_dic.keys():
    temp_dic[v] = len(temp_dic.keys())

In [38]:
temp_dic

{'Questions': 0,
 'for': 1,
 '‘On': 2,
 'Alaska’s': 3,
 'Coldest': 4,
 'Days': 5,
 'a': 6,
 'Village': 7,
 'Draws': 8,
 'Close': 9,
 'Warmth’': 10}

학습용 데이터셋

In [39]:
import pandas as pd
import string
import glob
from torch.utils.data.dataset import Dataset

In [41]:
import numpy as np

class TextGeneration(Dataset):
  def __init__(self):
    # 모든 파일의 헤더라인의 텍스를 불러옴
    all_headlines = []
    for file in glob.glob('./new_york_times/*.csv'):
      if 'Articles' in file:
        df = pd.read_csv(file)
        all_headlines.extend( list(df.headline.values) )
        break
    # 'Unknown' 제거
    all_headlines = [h for h in all_headlines if h != 'Unknown']
    self.corpus =  [self.clean_text(x) for x in all_headlines ]  # 말뭉치
    self.BOW = {}

    for line in self.corpus:
      for word in line.split():
        if word not in self.BOW.keys():
          self.BOW[word] = len(self.BOW.keys())

    # 모델의 입력으로 사용할 데이터
    self.data = self.generate_sequence(self.corpus)

  def clean_text(self, txt): # 특수문자 제거
    txt = ''.join( v for v in txt if v not in string.punctuation).lower()
    return txt

  def generate_sequence(self, txt):
    seq = []
    for line in txt:
      line = line.split()
      line_bow = [self.BOW[word] for word in line]
      # 단어2개를 입력으로, 그 다음 단어를 정답
      data = [ ( [line_bow[i],line_bow[i+1] ] ,line_bow[i+2] )  for i in range(len(line_bow)-2)  ]
      seq.extend(data)
    return seq
  def __len__(self):
    return len(self.data)
  def __getitem__(self, index):
    data = np.array(self.data[index][0])
    label = np.array(self.data[index][1]).astype(np.float32)
    return data, label

In [48]:
from torch.utils.data.dataloader import DataLoader
dataset = TextGeneration()
loader = DataLoader(dataset, batch_size=64)
data, label = next(iter(loader))
data.shape, label.shape

(torch.Size([64, 2]), torch.Size([64]))

In [62]:
import torch.nn as nn
import torch

class LSTM(nn.Module):
  def __init__(self,num_embeddings):
    super(LSTM, self).__init__()
    # 희소벡터 - 밀집벡터 : 임베딩층
    self.embed = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=16)
    # LSTM층을 5개 쌓음
    self.lstm = nn.LSTM(
        input_size = 16, hidden_size = 64, num_layers=5,batch_first=True
    )
    # 분류 MLP  (2,64)
    self.fc1 = nn.Linear(in_features=2*64, out_features=num_embeddings)
    self.fc2 = nn.Linear(in_features=num_embeddings, out_features=num_embeddings)
    self.relu = nn.ReLU()
  def forward(self, x):
    x = self.embed(x)
    x, _ = self.lstm(x)  # RNN처럼 전체출력,마지막 은닉층의 상태 반환 # 64, 2, 64
    x = torch.reshape(x, (x.shape[0],-1))  # 64, 128
    x = self.fc1(x)
    x = self.relu(x)
    x = self.fc2(x)
    return x

학습루프

In [63]:
import tqdm
from torch.utils.data.dataloader import DataLoader
from torch.optim.adam import Adam

device = "cuda" if torch.cuda.is_available() else "cpu"

# 데이터셋
# 로더
# 모델
# 옵티마이져
dataset = TextGeneration()
loader = DataLoader(dataset,batch_size=64)
model = LSTM(num_embeddings=len(dataset.BOW) ).to(device)
optim = Adam(model.parameters(),lr=0.001)

In [65]:
for epoch in range(10):
  iterator = tqdm.tqdm(loader)
  for data, label in iterator:
    # 기울기 초기화
    optim.zero_grad()
    # 모델의 예측값
    pred = model(torch.tensor(data,dtype=torch.long).to(device) )
    # 정답도 long tensor
    loss = nn.CrossEntropyLoss()(pred, torch.tensor(label,dtype=torch.long).to(device) )
    # 오차 역전파
    loss.backward()
    optim.step()

    iterator.set_description(f"epoch:{epoch+1} loss:{loss.item()}")
# 모델저장
torch.save(model.state_dict(),'lstm.pth')

  pred = model(torch.tensor(data,dtype=torch.long).to(device) )
  loss = nn.CrossEntropyLoss()(pred, torch.tensor(label,dtype=torch.long).to(device) )
epoch:1 loss:6.523890018463135: 100%|██████████| 59/59 [00:12<00:00,  4.88it/s]
epoch:2 loss:6.4290313720703125: 100%|██████████| 59/59 [00:10<00:00,  5.51it/s]
epoch:3 loss:6.367801189422607: 100%|██████████| 59/59 [00:13<00:00,  4.50it/s]
epoch:4 loss:6.260339260101318: 100%|██████████| 59/59 [00:09<00:00,  6.45it/s]
epoch:5 loss:6.176182746887207: 100%|██████████| 59/59 [00:10<00:00,  5.54it/s]
epoch:6 loss:6.140148639678955: 100%|██████████| 59/59 [00:12<00:00,  4.90it/s]
epoch:7 loss:6.076382160186768: 100%|██████████| 59/59 [00:08<00:00,  6.89it/s]
epoch:8 loss:6.0234293937683105: 100%|██████████| 59/59 [00:10<00:00,  5.60it/s]
epoch:9 loss:5.920074462890625: 100%|██████████| 59/59 [00:11<00:00,  5.02it/s]
epoch:10 loss:5.629584789276123: 100%|██████████| 59/59 [00:08<00:00,  6.85it/s]


모델성능평가 : 문장생성

In [117]:
def generator(model, BOW, str_data = "finding an ",strlen=10):
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  with torch.no_grad():
    for p in range(strlen):
      # 입력을 텐서로 변환

      words = torch.tensor([  BOW[w] for w in str_data.split() ], dtype=torch.long).to(device)
      # 입력으로 사용가능하게  배치차원 추가  문장의 마지막 두 단어를 선택
      input_tensor = torch.unsqueeze(words[-2:],dim=0)
      # (1, 2)
      output = model(input_tensor)
      output_word = (torch.argmax(output).cpu().numpy()) # 최대 값이 들어 있는 번호를 반환
      str_data += list(BOW.keys())[output_word]
      str_data += " "

  print(f"predict sentence : {str_data}")

In [118]:
model.load_state_dict(torch.load('lstm.pth', map_location=device))
pred = generator(model, dataset.BOW)

predict sentence : finding an trump the trump of the of of of of of 
