<a href="https://colab.research.google.com/github/KimDukJung/bbc/blob/main/bbc_classic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1번 블록

# 기본 패키지 임포트
import csv
import numpy as np
from time import time

# 데이터 처리 관련 패키지 임포트
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# 인공신경망 관련 패키지 임포트
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout


In [None]:
# Cell 2
# Hyperparameters and shared data containers

# Model / text-shape settings
MY_VOCAB = 5000   # number of words to use
MY_EMBED = 64     # embedding dimension
MY_HIDDEN = 100   # size of the LSTM output cell
MY_LEN = 200      # maximum article length

# Experiment settings
MY_SPLIT = 0.8    # fraction of the data used for training
MY_SAMPLE = 123   # index of the article used as the sample
MY_EPOCH = 10     # number of epochs

# Storage filled in by later cells, three independent lists:
#   original  - articles as read from the file
#   processed - articles after preprocessing
#   labels    - category label (the answer) of each article
original, processed, labels = [], [], []

In [None]:
# Cell 3

# Download the NLTK stopword corpus and collect the English stopwords
# into a set.
nltk.download('stopwords')
MY_STOP = {*nltk.corpus.stopwords.words('english')}

# Report, one line each: the stopwords themselves, how many there are,
# the type of the container, and whether 'the' is among them.
for report in (
    ('영어 제외어', MY_STOP),
    ('제외어 개수 : ', len(MY_STOP)),
    (type(MY_STOP),),
    ('the' in MY_STOP,),
):
  print(*report)

영어 제외어 {'weren', 'been', "mustn't", "didn't", "shan't", "haven't", 'll', 'm', 'up', 'own', 'should', 'myself', 'their', 'are', 'me', 'has', 'hasn', 'than', 'doesn', "should've", "you're", 'be', 'theirs', "it's", 'them', 'have', 'over', 'doing', 'because', 'a', 'that', 'then', 'both', 'i', 'can', "hadn't", 'won', 'an', 'we', "that'll", 't', "isn't", 'against', 'couldn', 'while', 'all', "don't", 'herself', 'most', 'at', 'not', 'these', 'further', 'between', "weren't", "she's", 'too', 'he', 'yours', 'him', 'it', 'haven', 'being', 'isn', 'ourselves', "needn't", 'o', 'ours', 'in', 'of', 'having', 'ain', 's', 'needn', 'his', 'again', 'through', 'itself', 'few', 'himself', 'am', 'had', 'which', 'or', "you've", 'yourself', 'd', 'do', 'and', "couldn't", "won't", 'after', 'those', 'to', 'there', 'above', 'yourselves', 'same', "you'll", 'each', 'hadn', "shouldn't", 'here', 'her', 'were', 'mightn', 're', 'you', 'into', 'as', 'until', 'where', 'with', 'during', 'below', 'hers', "hasn't", 'was', 'su

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Cell 4

# Read the CSV file, keep every article and its label, and store a copy of
# each article with the stopwords removed.
path = '/content/drive/MyDrive/dataset/bbc-text.csv'

# Start from empty lists so that re-running this cell does not append the
# whole dataset a second time.
labels.clear()
original.clear()
processed.clear()

# newline='' is what the csv module asks for: it then handles line endings
# itself, including line breaks inside quoted fields.
with open(path, 'r', encoding='utf-8', newline='') as file:
  reader = csv.reader(file)

  # The first row is the header
  header = next(reader)
  print(header)

  # Every following row: column 0 is the label, column 1 the article text
  for row in reader:
    labels.append(row[0])
    original.append(row[1])

    # Remove stopwords token by token instead of doing one
    # str.replace(' word ', ' ') pass per stopword. That approach missed
    # stopwords at the very start/end of the text and the second of two
    # identical adjacent stopwords, and its result could depend on the
    # iteration order of the set. Splitting on a single space (rather than
    # on any whitespace) keeps the spacing between the remaining words
    # unchanged.
    news = ' '.join(
        word for word in row[1].split(' ') if word not in MY_STOP
    )
    processed.append(news)

print('처리한 전체 기사 개수 : ',len(processed))

['category', 'text']
처리한 전체 기사 개수 :  2225


In [None]:
# Cell 5

# Show the sample article before and after stopword removal: raw text,
# category, Python type and word count, then the cleaned text and its
# word count. Word counts split on any whitespace.
for caption, value in (
    ('샘플 기사 원본 : ', original[MY_SAMPLE]),
    ('샘플 기사 카테고리 : ', labels[MY_SAMPLE]),
    ('샘플 기사 타입 : ', type(original[MY_SAMPLE])),
    ('샘플의 총 단어 수 : ', len(original[MY_SAMPLE].split())),
    ('제외어 제거된 샘플 : ', processed[MY_SAMPLE]),
    ('제외어 제외된 샘플의 총 단어 수 : ', len(processed[MY_SAMPLE].split())),
):
  print(caption, value)

샘플 기사 원본 :  screensaver tackles spam websites net users are getting the chance to fight back against spam websites  internet portal lycos has made a screensaver that endlessly requests data from sites that sell the goods and services mentioned in spam e-mail. lycos hopes it will make the monthly bandwidth bills of spammers soar by keeping their servers running flat out. the net firm estimates that if enough people sign up and download the tool  spammers could end up paying to send out terabytes of data.   we ve never really solved the big problem of spam which is that its so damn cheap and easy to do   said malte pollmann  spokesman for lycos europe.  in the past we have built up the spam filtering systems for our users   he said   but now we are going to go one step further.    we ve found a way to make it much higher cost for spammers by putting a load on their servers.  by getting thousands of people to download and use the screensaver  lycos hopes to get spamming websites constantl