In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("aashita/nyt-comments")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/aashita/nyt-comments?dataset_version_number=13...


100%|██████████| 480M/480M [00:23<00:00, 21.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/aashita/nyt-comments/versions/13


In [10]:
import pandas as pd
import os
import string
import numpy as np
import glob
import torch.nn as nn
import torch
from torch.utils.data.dataset import Dataset
import tqdm
from torch.utils.data.dataloader import DataLoader
from torch.optim.adam import Adam

In [6]:
df=pd.read_csv('/root/.cache/kagglehub/datasets/aashita/nyt-comments/versions/13/ArticlesApril2017.csv')
df.columns

Index(['abstract', 'articleID', 'articleWordCount', 'byline', 'documentType',
       'headline', 'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')

In [16]:
class TextGeneration(Dataset):
  '''
  self.BOW : 문서 전체에 대한 사전, {'ABC' : 123, 'EEG' :556}
  self.corpus : text lines, ['ghsldhsidg','sdfnsldfnunvls']
  self.data : 텍스트 한줄 을 모델에 input할 때 format,corpus 한줄에 해당하는 BOW의 value값 [([0,1],2),([1,2],3),([2,3],4),([3,4],5),([4,5],6)]
  '''
  def clean_text(self,txt):
    txt= "".join(v for v in txt if v not in string.punctuation).lower()
    return txt

  def __init__(self):
    all_headlines=[]

    for filename in glob.glob('/root/.cache/kagglehub/datasets/aashita/nyt-comments/versions/13/*.csv'):
        if 'Articles' in filename:
          article_df=pd.read_csv(filename)
          all_headlines.extend(list(article_df.headline.values))
          break

    all_headlines=[h for h in all_headlines if h!='Unknown']

    self.corpus=[self.clean_text(x) for x in all_headlines]
    self.BOW={}

    for line in self.corpus:
      for word in line.split():
        if word not in self.BOW.keys():
          self.BOW[word]=len(self.BOW.keys())

    self.data=self.generate_sequence(self.corpus)

  def generate_sequence(self,txt):
    seq=[]

    for line in txt:
      line=line.split()
      line_bow=[self.BOW[word] for word in line]

      data=[([line_bow[i], line_bow[i+1]],line_bow[i+2])
            for i in range(len(line_bow)-2)]

      seq.extend(data)
      return seq

  def __len__(self):
    return len(self.data)

  def __getitem__(self,i):
    data=np.array(self.data[i][0])
    label=np.array(self.data[i][1]).astype(np.float32)
    return data,label


In [28]:
class LSTM(nn.Module):
  def __init__(self, num_embeddings):
    super(LSTM,self).__init__()

    self.embed=nn.Embedding(num_embeddings=num_embeddings,embedding_dim=16)

    self.lstm=nn.LSTM(
        input_size=16,
        hidden_size=64,
        num_layers=5,
        batch_first=True)


    self.fc1=nn.Linear(128,num_embeddings)
    self.fc2=nn.Linear(num_embeddings,num_embeddings)

    self.relu=nn.ReLU()

  def forward(self,x):
    x=self.embed(x)

    x,_=self.lstm(x)
    x=torch.reshape(x,(x.shape[0],-1))
    x=self.fc1(x)
    x=self.relu(x)
    x=self.fc2(x)

    return x

In [30]:
device='cuda' if torch.cuda.is_available() else "cpu"

dataset=TextGeneration()
model=LSTM(num_embeddings=len(dataset.BOW)).to(device) #사전(BOW)의 길이만큼 embedding, BOW가 one-hot encoding 처럼 0이 많은 희소 행렬이기 때문에 임베딩 적용
loader=DataLoader(dataset,batch_size=64)
optim=Adam(model.parameters(),lr=0.001)

for epoch in range(200):
  iterator=tqdm.tqdm(loader)
  for data, label in iterator:
    optim.zero_grad()

    pred=model(torch.tensor(data,dtype=torch.long).to(device))

    loss=nn.CrossEntropyLoss()(pred,torch.tensor(label,dtype=torch.long).to(device))

    loss.backward()
    optim.step()

    iterator.set_description(f'epoch{epoch} loss:{loss.item()}')

torch.save(model.state_dict(),'lstm.pth')

  pred=model(torch.tensor(data,dtype=torch.long).to(device))
  loss=nn.CrossEntropyLoss()(pred,torch.tensor(label,dtype=torch.long).to(device))
epoch0 loss:7.7467942237854: 100%|██████████| 1/1 [00:00<00:00,  2.16it/s]
epoch1 loss:7.619811058044434: 100%|██████████| 1/1 [00:00<00:00, 120.58it/s]
epoch2 loss:7.490128517150879: 100%|██████████| 1/1 [00:00<00:00, 96.31it/s]
epoch3 loss:7.352680206298828: 100%|██████████| 1/1 [00:00<00:00, 105.46it/s]
epoch4 loss:7.202883243560791: 100%|██████████| 1/1 [00:00<00:00, 101.32it/s]
epoch5 loss:7.036068916320801: 100%|██████████| 1/1 [00:00<00:00, 101.09it/s]
epoch6 loss:6.847692012786865: 100%|██████████| 1/1 [00:00<00:00, 93.90it/s]
epoch7 loss:6.633042335510254: 100%|██████████| 1/1 [00:00<00:00, 106.18it/s]
epoch8 loss:6.387450218200684: 100%|██████████| 1/1 [00:00<00:00, 102.55it/s]
epoch9 loss:6.105827331542969: 100%|██████████| 1/1 [00:00<00:00, 98.80it/s]
epoch10 loss:5.782652854919434: 100%|██████████| 1/1 [00:00<00:00, 100.51it/s]
epo

In [38]:
def generate(model, BOW, string='finding an ', strlen=10):
  device = 'cuda' if torch.cuda.is_available() else 'cpu'

  print(f'input word : {string}')

  with torch.no_grad():
    for p in range(strlen):
      words=torch.tensor([BOW[w] for w in string.split()],dtype=torch.long).to(device)

      input_tensor=torch.unsqueeze(words[-2:],dim=0)
      output=model(input_tensor)
      output_word=(torch.argmax(output).cpu().numpy())
      string += list(BOW.keys())[output_word]
      string += " "

  print(f'predicted sentence: {string}')

model.load_state_dict(torch.load('lstm.pth',map_location=device))
pred=generate(model,dataset.BOW)


input word : finding an 
predicted sentence: finding an topple topple to pillars obama’s topple obama’s pillars pillars topple 


  model.load_state_dict(torch.load('lstm.pth',map_location=device))
