<a href="https://colab.research.google.com/github/KimDukJung/bbc/blob/main/bbc_classic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1번 블록

# 기본 패키지 임포트
import csv
import numpy as np
from time import time

# 데이터 처리 관련 패키지 임포트
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# 인공신경망 관련 패키지 임포트
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout


In [None]:
# Block 2
# Hyperparameters

MY_VOCAB = 5000   # vocabulary size limit given to the word tokenizer and embedding layer
MY_EMBED = 64     # embedding dimension
MY_HIDDEN = 100   # number of units in the LSTM layer
MY_LEN = 200      # article length (in tokens) after padding / truncation

MY_SPLIT = 0.8    # fraction of the data used for training
MY_SAMPLE = 123   # index of the article used as the running sample
MY_EPOCH = 10     # number of training epochs

# Containers filled while reading the dataset:
# raw articles, stopword-filtered articles, and category labels.
original, processed, labels = [], [], []

In [None]:
# Block 3

# Download the NLTK stopword corpus and build the English stopword set
nltk.download('stopwords')
english_stopwords = nltk.corpus.stopwords.words('english')
MY_STOP = set(english_stopwords)

# Show the stopwords, how many there are, the container type,
# and a quick membership check
print('영어 제외어', MY_STOP)
print('제외어 개수 : ', len(MY_STOP))
print(type(MY_STOP))
print('the' in MY_STOP)

영어 제외어 {'those', 'does', 'doing', 'with', 'off', 'such', 'wouldn', 'this', 'nor', 'until', 'couldn', 'all', 'himself', "mightn't", 'his', "won't", "you're", 'be', 'have', 'an', 'yourselves', "you'd", "she's", 'the', 'having', 'mustn', 'before', 'further', 'haven', 'what', 'you', 'he', 'her', 'aren', 'am', 'over', 'him', 'at', 'shan', 'than', 'your', 'then', 'myself', "aren't", 'my', 're', "didn't", "mustn't", 'which', 'on', 'don', 'through', 'between', 'ma', "doesn't", 'hadn', 'once', "hadn't", 'against', 'me', 'as', 'above', 'where', 'it', "hasn't", "that'll", 'by', 'm', "wouldn't", 'who', 'here', 'not', 'why', 'more', 'hers', "weren't", 'and', 'themselves', 'because', 'no', 'ours', 'into', 'ain', 'during', 'its', 'to', 'a', 'but', 'our', 'while', 's', "couldn't", 'under', 'didn', "needn't", "wasn't", 'there', 'has', 'had', 'any', 'won', 'whom', 'after', "isn't", 'these', 'herself', 'just', 'about', 'mightn', "don't", 'll', "haven't", 'd', 'yours', 'of', 'for', 'if', 'both', 'will', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Block 4

# Read the dataset and strip stopwords from every article
path = '/content/drive/MyDrive/dataset/bbc-text.csv'

# Start from empty containers so that re-running this cell does not append
# the same articles a second time.
original.clear()
processed.clear()
labels.clear()

# newline='' is what the csv module expects for file objects; it lets the
# reader handle line endings (including ones inside quoted fields) itself.
with open(path, 'r', encoding='utf-8', newline='') as file:
  reader = csv.reader(file)

  # The first row is the header
  header = next(reader)
  print(header)

  # Each remaining row is (category, article text)
  for row in reader:
    if not row:
      # csv.reader yields an empty list for a blank line; nothing to store
      continue

    labels.append(row[0])
    original.append(row[1])

    # Drop stopwords token by token. Unlike replacing ' word ' substrings,
    # this also removes stopwords at the very start/end of the text and
    # back-to-back repeats, and it needs a single pass over the article.
    # Runs of whitespace collapse to one space, which does not matter to the
    # whitespace-splitting word counts and tokenizer used afterwards.
    news = ' '.join(word for word in row[1].split() if word not in MY_STOP)
    processed.append(news)

print('처리한 전체 기사 개수 : ',len(processed))

['category', 'text']
처리한 전체 기사 개수 :  2225


In [None]:
# Block 5

# Raw sample article: text, category, Python type and word count
sample_raw = original[MY_SAMPLE]
print('샘플 기사 원본 : ', sample_raw)
print('샘플 기사 카테고리 : ', labels[MY_SAMPLE])
print('샘플 기사 타입 : ', type(sample_raw))
print('샘플의 총 단어 수 : ', len(sample_raw.split()))

# Same article after stopword removal
sample_clean = processed[MY_SAMPLE]
print('제외어 제거된 샘플 : ', sample_clean)
print('제외어 제외된 샘플의 총 단어 수 : ', len(sample_clean.split()))

샘플 기사 원본 :  screensaver tackles spam websites net users are getting the chance to fight back against spam websites  internet portal lycos has made a screensaver that endlessly requests data from sites that sell the goods and services mentioned in spam e-mail. lycos hopes it will make the monthly bandwidth bills of spammers soar by keeping their servers running flat out. the net firm estimates that if enough people sign up and download the tool  spammers could end up paying to send out terabytes of data.   we ve never really solved the big problem of spam which is that its so damn cheap and easy to do   said malte pollmann  spokesman for lycos europe.  in the past we have built up the spam filtering systems for our users   he said   but now we are going to go one step further.    we ve found a way to make it much higher cost for spammers by putting a load on their servers.  by getting thousands of people to download and use the screensaver  lycos hopes to get spamming websites constantl

In [None]:
# Block 6
# Map words to integers.
# oov = "out of vocabulary": the special token given below stands in for
# words that fall outside the vocabulary.

A_token = Tokenizer(num_words=MY_VOCAB, oov_token='!')
A_token.fit_on_texts(processed)

# Word -> integer mapping learned from the corpus
# (A_token.document_count and A_token.word_counts can be inspected the same way)
print('단어 집합 : ', A_token.word_index)

# Convert every article to a list of integers
A_tokenized = A_token.texts_to_sequences(processed)
print(A_tokenized[MY_SAMPLE])
print(A_tokenized[112])

# Longest and shortest article, measured in tokens
article_lengths = [len(sequence) for sequence in A_tokenized]
longest = max(article_lengths)
print('기사의 최대 길이 : ', longest)
shortest = min(article_lengths)
print('기사의 최소 길이 : ', shortest)

[3170, 1, 816, 877, 115, 136, 382, 347, 716, 28, 816, 877, 228, 1, 3171, 27, 3170, 1, 4869, 203, 569, 734, 1770, 126, 4026, 816, 260, 395, 3171, 700, 21, 1649, 3630, 2848, 2606, 1, 2324, 2550, 453, 2918, 570, 115, 63, 2290, 381, 7, 1160, 780, 1859, 2606, 11, 92, 1571, 1051, 1, 203, 281, 154, 1, 138, 364, 816, 1, 2224, 847, 2, 1, 1, 178, 3171, 139, 255, 1109, 816, 1, 727, 136, 2, 52, 60, 10, 818, 3923, 195, 41, 21, 56, 494, 245, 2606, 1362, 1, 2550, 382, 1021, 7, 780, 70, 3170, 3171, 700, 23, 1, 877, 3994, 453, 343, 322, 1393, 3, 1, 2, 3427, 582, 816, 877, 297, 1, 56, 203, 2295, 2403, 2, 3170, 2708, 1069, 660, 812, 1287, 3885, 1538, 1, 466, 224, 503, 1538, 1, 31, 96, 1, 681, 111, 2, 10, 1898, 912, 2, 381, 7, 1160, 1, 877, 11, 722, 256, 1, 1287, 224, 503, 111, 3171, 79, 70, 260, 395, 716, 28, 2, 3, 1, 4, 1604, 10, 823, 455, 158, 823, 455, 2, 569, 2178, 4026, 816, 260, 395, 891, 734, 1770, 126, 220, 3678, 569, 316, 86, 1051, 816, 260, 395, 3678, 23, 1, 1452, 681, 111, 415, 569, 3170, 760,

In [None]:
# Bring every article to exactly MY_LEN tokens; both padding and truncation
# are applied in 'pre' mode
A_tokenized = pad_sequences(A_tokenized, maxlen=MY_LEN, padding='pre', truncating='pre')

# After padding the longest and shortest lengths should be identical
longest = max(map(len, A_tokenized))
print('기사의 최대 길이 : ', longest)
shortest = min(map(len, A_tokenized))
print('기사의 최소 길이 : ', shortest)

기사의 최대 길이 :  200
기사의 최소 길이 :  200


In [None]:
# Padded sequence of the running sample article
padded_sample = A_tokenized[MY_SAMPLE]
print('샘플 기사 처리본', padded_sample)

# Padded sequence of article 112 (labelled as the short sample in the output)
padded_short = A_tokenized[112]
print('짧은 샘플 기사 처리본', padded_short)

샘플 기사 처리본 [   2 3427  582  816  877  297    1   56  203 2295 2403    2 3170 2708
 1069  660  812 1287 3885 1538    1  466  224  503 1538    1   31   96
    1  681  111    2   10 1898  912    2  381    7 1160    1  877   11
  722  256    1 1287  224  503  111 3171   79   70  260  395  716   28
    2    3    1    4 1604   10  823  455  158  823  455    2  569 2178
 4026  816  260  395  891  734 1770  126  220 3678  569  316   86 1051
  816  260  395 3678   23    1 1452  681  111  415  569 3170  760  367
  189   14    1 3885 1595    1 1375  347 3755   27 3171  200    7  660
  569  848  816 1770  569 3258   70 2066 4064 4055  416 3792   77 3630
 2848   11   21  816 1730    2    3    1  569    6 1429 4186  203 4869
  251  664   65  910  231  569    1 3261  136 2819  136 3171  780   70
 3170  297 3170  571  877    1 4869  203 3170  269  633  383  139   35
  233    1 2643  193 4459  611    3    1    2 3170 1965  264   32  231
   12   96  276  430 1538  379    1  816  111 3075  111  323    2  

In [None]:
# Block 7

# Tokenize the category labels with their own tokenizer
C_token = Tokenizer()
C_token.fit_on_texts(labels)

# Category name -> integer mapping
# (C_token.document_count and C_token.word_counts can be inspected the same way)
print('단어를 정수로', C_token.word_index)

# Convert every label to its integer form
C_tokenized = C_token.texts_to_sequences(labels)
label_count = len(C_tokenized)
print('C_tokenized 개수 : ',label_count)
print(type(C_tokenized))
sample_label = C_tokenized[MY_SAMPLE]
print(sample_label)

# Turn the list of labels into a NumPy array for the split and training steps
C_tokenized = np.array(C_tokenized)


단어를 정수로 {'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}
C_tokenized 개수 :  2225
<class 'list'>
[4]


In [None]:
# Split inputs and labels into train / test parts.
# shuffle=False is passed, so no shuffling is requested before the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    A_tokenized, C_tokenized, train_size=MY_SPLIT, shuffle=False)

# Check the shape of each of the four parts
for part in (X_train, X_test, Y_train, Y_test):
  print(part.shape)

(1780, 200)
(445, 200)
(1780, 1)
(445, 1)


In [None]:
# Block 8

# Build the RNN

# The label tokenizer numbers categories starting at 1 (index 0 is never
# assigned), and the sparse categorical loss used for training treats each
# label as an output index. The output layer therefore needs one more unit
# than there are categories; deriving it here avoids a hard-coded 6.
MY_CLASSES = len(C_token.word_index) + 1

model = Sequential()

# Word index -> dense vector of size MY_EMBED
model.add(Embedding(input_dim = MY_VOCAB, output_dim = MY_EMBED))

# Dropout layer with rate 0.5, placed between the embedding and the LSTM
model.add(Dropout(rate=0.5))

# LSTM layer
model.add(LSTM(units = MY_HIDDEN))

# Softmax over the category indices
model.add(Dense(units = MY_CLASSES, activation = 'softmax'))

model.summary()

In [None]:
# Block 9

# Configure training: Adam optimizer, sparse categorical cross-entropy
# (labels are plain integers, not one-hot vectors), accuracy as the metric
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

# Train and measure the wall-clock time
begin = time()

model.fit(X_train,Y_train, epochs=MY_EPOCH, verbose=1)

end = time()

# '{:.2f}' prints seconds with two decimals. The previous '{:.2}' (no 'f')
# means "two significant digits" and falls back to exponent notation for
# anything >= 100 s (e.g. 1.7e+02).
print('학습 시간 : {:.2f}초'.format( end - begin))

Epoch 1/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 189ms/step - accuracy: 0.2302 - loss: 1.7113
Epoch 2/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 280ms/step - accuracy: 0.3211 - loss: 1.5639
Epoch 3/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 208ms/step - accuracy: 0.5270 - loss: 1.1687
Epoch 4/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 256ms/step - accuracy: 0.8081 - loss: 0.6324
Epoch 5/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 200ms/step - accuracy: 0.9656 - loss: 0.2044
Epoch 6/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 144ms/step - accuracy: 0.9593 - loss: 0.2108
Epoch 7/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 188ms/step - accuracy: 0.9790 - loss: 0.0805
Epoch 8/10
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 158ms/step - accuracy: 0.9835 - loss: 0.0744
Epoch 9/10
[1m56/56[0m [32m━━

In [None]:
# Evaluate on the held-out data; with metrics=['accuracy'] the result is
# indexed as [loss, accuracy]
score = model.evaluate(X_test, Y_test, verbose=1)

print(f'정확도 : {score[1]:.2f}')

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - accuracy: 0.9311 - loss: 0.2652
정확도 : 0.93


In [None]:
# Predict on the test set: first the raw per-class scores ...
class_scores = model.predict(X_test)
print('추측값 ',class_scores)

# ... then the index of the highest score in each row
pred = class_scores.argmax(axis=1)
print('추측값 ',pred)

# Ground-truth labels, flattened to one dimension for side-by-side comparison
print('실제값 ',Y_test.flatten())

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step
추측값  [[2.42269860e-04 1.09133008e-03 1.14341569e-03 1.60607466e-04
  9.54949646e-04 9.96407509e-01]
 [3.79381562e-03 2.03959667e-03 3.31386924e-03 1.43468855e-02
  9.71368253e-01 5.13750827e-03]
 [2.22511444e-04 7.01254932e-04 4.44449484e-04 9.97377932e-01
  1.03570684e-03 2.18000932e-04]
 ...
 [1.03606586e-03 3.46430019e-03 6.71436358e-03 8.93463264e-04
  3.19218123e-03 9.84699667e-01]
 [1.04765072e-02 3.00620552e-02 1.10948198e-02 8.63556802e-01
  6.66966140e-02 1.81132313e-02]
 [1.33349129e-03 9.81366754e-01 5.20077301e-03 5.47486078e-03
  3.71736358e-03 2.90672993e-03]]
추측값  [5 4 3 1 1 4 2 5 5 3 3 3 2 5 1 5 5 2 1 3 4 2 1 2 4 3 3 1 1 3 2 2 2 2 5 2 3
 3 4 4 5 1 5 2 3 1 1 4 4 2 4 1 2 2 3 1 1 3 3 5 5 3 2 3 3 2 4 3 3 3 3 3 5 5
 4 3 1 3 1 4 1 1 1 5 4 5 4 1 4 1 1 5 5 2 5 5 3 2 1 4 4 3 2 1 2 5 1 3 5 1 1
 2 3 4 4 2 2 1 3 5 1 1 3 5 4 1 5 3 3 1 3 4 5 1 3 2 5 3 5 3 1 3 2 2 3 2 4 1
 2 5 2 1 1 3 4 3 4 3 3 1 1 1 2 4 5 2 1 2

In [None]:
# Predict the category of an excerpt from a real article
news = ["India's airlines and airports received 999 hoax bomb threats this year as of 14 November, the country's deputy civil aviation minister told its parliament. This was nearly 10 times more than the threats received in 2023, Mr Murlidhar Mohol said. More than 500 of the year's threats were received just in the last two weeks of October. The dramatic surge in hoax threats had wreaked havoc on flight schedules, causing widespread disruption in services."]

# Apply the same stopword removal the training articles went through, so the
# model sees input prepared the same way at inference time. The text is
# lower-cased first because the entries in MY_STOP are lower-case.
news = [' '.join(word for word in article.lower().split() if word not in MY_STOP)
        for article in news]

# Tokenize with the tokenizer fitted on the training corpus
news= A_token.texts_to_sequences(news)
print(news)
print('총 단어 수: ',len(news[0]))

# Pad / truncate exactly like the training data
news = pad_sequences(news, maxlen = MY_LEN, padding='pre', truncating='pre')
print('총 단어 수: ',len(news[0]))

# Predict: per-class scores first, then the index of the best class
pred = model.predict(news)
print('추측값 ',pred)
pred = pred.argmax(axis=1)

print('추측값 ',pred)

[[1, 1439, 1799, 4934, 578, 1, 1, 3195, 2576, 2013, 5, 1, 1569, 691, 418, 1259, 1, 1540, 849, 3376, 76, 22, 1, 610, 2013, 4519, 978, 85, 231, 2282, 4306, 1259, 2576, 578, 602, 1, 3, 1, 1, 2, 2282, 4306, 1240, 1569, 1259, 1, 2576, 1, 578, 1, 602, 1259, 12, 15, 428, 1569, 585, 1259, 2829, 3404, 602, 1, 2576, 3843, 1, 1, 586, 2061, 1, 4704, 3149, 1, 602, 126]]
총 단어 수:  74
총 단어 수:  200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
추측값  [[0.00554171 0.03546233 0.0417811  0.00709947 0.02099266 0.8891227 ]]
추측값  [5]
