In [1]:
# 라이브러리 불러오기

import pandas as pd
import numpy as np
import seaborn as sns
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# train, test , submission로드하기

train = pd.read_csv('C:/Users/parkk/ais7/235670/train.csv')
test = pd.read_csv('C:/Users/parkk/ais7/235670/test_x.csv')
submission = pd.read_csv('C:/Users/parkk/ais7/235670/sample_submission.csv')

train.shape, test.shape, submission.shape

((54879, 3), (19617, 2), (19617, 6))

In [3]:
# train 내용 확인하기

train.head()

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [4]:
# test 내용 확인하기

test.head()

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...


In [5]:
# 부호 제거하는 함수 만들고 적용하기

def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

train['text']=train['text'].apply(alpha_num)
test['text']=test['text'].apply(alpha_num)

In [6]:
# train, test 모두 소문자로 변환하기

train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()

In [7]:
# 불용어 제거하고 할당하기

def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)

stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

train['text'] = train['text'].apply(remove_stopwords)
test['text'] = test['text'].apply(remove_stopwords)

In [8]:
# X_train, X_test, y_train 할당하기

X_train = train['text']
X_test = test['text']
y_train = train['author']

In [9]:
# tokenizer num_words 지정해주기

tokenizer = Tokenizer(num_words = 20000)

In [10]:
# X_train에 tokenizer fit 해주기

tokenizer.fit_on_texts(X_train)
tokenizer = Tokenizer(num_words = 20000)

In [11]:
# text_to_sequences로 텍스트 안의 단어들을 숫자의 시퀀스 형태로 만들어 준다.
# pad_sequences로 같은 길이의 시퀀스로 변환해준다. padding='post'는 뒤를 0으로 채워준다.

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=500)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=500)

In [19]:
# NLP모델 만들어주기

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(20000, 16, input_length=500),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [20]:
# model compile 설정하기

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [21]:
# model summary 확인하기

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 16)           320000    
                                                                 
 global_average_pooling1d_1   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_2 (Dense)             (None, 24)                408       
                                                                 
 dense_3 (Dense)             (None, 24)                600       
                                                                 
 dense_4 (Dense)             (None, 5)                 125       
                                                                 
Total params: 321,133
Trainable params: 321,133
Non-trainable params: 0
________________________________________________

In [22]:
# model fit 해주기

history = model.fit(train_padded, y_train, epochs=20, verbose=2, validation_split=0.2)
history

Epoch 1/20
1372/1372 - 5s - loss: 1.5715 - accuracy: 0.2752 - val_loss: 1.5715 - val_accuracy: 0.2680 - 5s/epoch - 4ms/step
Epoch 2/20
1372/1372 - 4s - loss: 1.5703 - accuracy: 0.2761 - val_loss: 1.5722 - val_accuracy: 0.2680 - 4s/epoch - 3ms/step
Epoch 3/20
1372/1372 - 4s - loss: 1.5705 - accuracy: 0.2761 - val_loss: 1.5713 - val_accuracy: 0.2680 - 4s/epoch - 3ms/step
Epoch 4/20
1372/1372 - 4s - loss: 1.5703 - accuracy: 0.2761 - val_loss: 1.5715 - val_accuracy: 0.2680 - 4s/epoch - 3ms/step
Epoch 5/20
1372/1372 - 4s - loss: 1.5703 - accuracy: 0.2761 - val_loss: 1.5714 - val_accuracy: 0.2680 - 4s/epoch - 3ms/step
Epoch 6/20
1372/1372 - 4s - loss: 1.5703 - accuracy: 0.2761 - val_loss: 1.5716 - val_accuracy: 0.2680 - 4s/epoch - 3ms/step
Epoch 7/20
1372/1372 - 4s - loss: 1.5704 - accuracy: 0.2761 - val_loss: 1.5712 - val_accuracy: 0.2680 - 4s/epoch - 3ms/step
Epoch 8/20
1372/1372 - 4s - loss: 1.5703 - accuracy: 0.2761 - val_loss: 1.5712 - val_accuracy: 0.2680 - 4s/epoch - 3ms/step
Epoch 9/

<keras.callbacks.History at 0x24c4f98c940>

In [23]:
# test 데이터 predict 해주기

y_pred = model.predict(test_padded)
y_pred



array([[0.24385561, 0.12774248, 0.20836984, 0.27960014, 0.14043194],
       [0.24385561, 0.12774248, 0.20836984, 0.27960014, 0.14043194],
       [0.24385561, 0.12774248, 0.20836984, 0.27960014, 0.14043194],
       ...,
       [0.24385561, 0.12774248, 0.20836984, 0.27960014, 0.14043194],
       [0.24385561, 0.12774248, 0.20836984, 0.27960014, 0.14043194],
       [0.24385561, 0.12774248, 0.20836984, 0.27960014, 0.14043194]],
      dtype=float32)

In [24]:
# predict 값 submission에 할당해주기

submission[['0','1','2','3','4']] = y_pred
submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.243856,0.127742,0.20837,0.2796,0.140432
1,1,0.243856,0.127742,0.20837,0.2796,0.140432
2,2,0.243856,0.127742,0.20837,0.2796,0.140432
3,3,0.243856,0.127742,0.20837,0.2796,0.140432
4,4,0.243856,0.127742,0.20837,0.2796,0.140432
...,...,...,...,...,...,...
19612,19612,0.243856,0.127742,0.20837,0.2796,0.140432
19613,19613,0.243856,0.127742,0.20837,0.2796,0.140432
19614,19614,0.243856,0.127742,0.20837,0.2796,0.140432
19615,19615,0.243856,0.127742,0.20837,0.2796,0.140432
