In [23]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from matplotlib import rcParams, pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import re
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import BatchNormalization,Dense, SpatialDropout1D, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
import warnings 
# Silence library warnings for the whole session so they do not bury cell output.
warnings.filterwarnings(action='ignore')

# Plot defaults: large figures with the "fivethirtyeight" look.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')

# pandas display options.
# Always use the fully qualified option key: the short pattern 'max_columns'
# is resolved by pattern matching and raises OptionError on pandas versions
# where more than one option matches it.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

# Installs the same blanket "ignore" filter again, after the setup calls above.
warnings.simplefilter('ignore')

In [24]:
# ---------------------------------------------------------------------------
# Run configuration: experiment identity, constants and file locations.
# NOTE(review): every directory below is an absolute path on one specific
# Windows machine; adjust them before running anywhere else.
# ---------------------------------------------------------------------------

# Experiment constants
target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

# Model / feature identifiers; model_name becomes the stem of the output files
algo_name = 'lstm'
feature_name = 'emb'
model_name = '_'.join((algo_name, feature_name))

# Directories (tst_dir is the same folder as data_dir)
data_dir = Path('C:\\Users\\USER\\Desktop\\open\\')
tst_dir = Path('C:\\Users\\USER\\Desktop\\open\\')
feature_dir = Path('C:\\Users\\USER\\Desktop\\open\\feature\\')
sub_dir = Path('C:\\Users\\USER\\Desktop\\open\\sub\\')
val_dir = Path('C:\\Users\\USER\\Desktop\\open\\val\\')

# Input files
trn_file = data_dir.joinpath('train.csv')
tst_file = data_dir.joinpath('test_x.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# Output / intermediate files
feature_file = feature_dir.joinpath(f'{feature_name}.csv')
p_val_file = val_dir.joinpath(f'{model_name}.val.csv')
p_tst_file = tst_dir.joinpath(f'{model_name}.tst.csv')
sub_file = sub_dir.joinpath(f'{model_name}.csv')

In [25]:
# Read the train and test tables; the first CSV column is used as the index.
train, test = (pd.read_csv(csv_path, index_col=0) for csv_path in (trn_file, tst_file))


In [26]:
def alpha_num(text):
    """Drop every character that is not an ASCII letter, a digit or a space.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with all other characters (punctuation, apostrophes,
        newlines, non-ASCII letters, ...) removed. Nothing is substituted in
        their place, so neighbouring characters are joined together.
    """
    disallowed = re.compile(r'[^A-Za-z0-9 ]')
    return disallowed.sub('', text)


def remove_stopwords(text):
    """Remove stopwords from a whitespace-separated string.

    The text is split on whitespace, each token is compared
    case-insensitively against the module-level ``stopwords`` list, and the
    surviving tokens (original casing kept) are joined with single spaces.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The filtered string; empty when every token is a stopword or the
        input has no tokens.
    """
    kept_tokens = [token for token in text.split() if token.lower() not in stopwords]
    return " ".join(kept_tokens)


# English stopwords (all lower-case) consulted by remove_stopwords().
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
    "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down",
    "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
    "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him",
    "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if",
    "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more",
    "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other",
    "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
    "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the",
    "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
    "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
    "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
    "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
    "yours", "yourself", "yourselves",
]

In [27]:
# Apply the text preprocessing to both frames, overwriting the 'text' column:
# lower-case -> keep only letters/digits/spaces -> drop stopwords.
# NOTE(review): alpha_num removes apostrophes before remove_stopwords runs, so
# the contraction entries in the stopword list (e.g. "he'd") can never match.
for frame in (train, test):
    frame['text'] = (
        frame['text']
        .str.lower()
        .apply(alpha_num)
        .apply(remove_stopwords)
    )

In [28]:
# Pull the model inputs out of the frames as NumPy arrays:
# cleaned text for train and test, plus the raw author label for train.
X_train, X_test = train['text'].values, test['text'].values
y = train['author'].values
print(X_train.shape, X_test.shape, y.shape)

(54879,) (19617,) (54879,)


In [29]:
# Tokenisation / padding parameters.
# NOTE(review): embedding_dim is reassigned to 200 in the GloVe cell before the
# model is built, so the value 64 set here is not the one the model uses.
vocab_size, embedding_dim, max_length = 15000, 64, 400
padding_type = 'post'

# Fit the tokenizer on the training texts only.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Convert both corpora to integer sequences, then pad them to max_length
# (padding side is padding_type; the truncation side is left at the
# pad_sequences default).
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
trn, tst = (
    pad_sequences(seqs, maxlen=max_length, padding=padding_type)
    for seqs in (train_sequences, test_sequences)
)
print(trn.shape, tst.shape)

(54879, 400) (19617, 400)


In [30]:
from sklearn.model_selection import train_test_split

# One-hot encode the label column. The column name comes from the config cell
# (target_col) instead of being hard-coded a second time.
y = pd.get_dummies(train[target_col]).values

# Hold out 20% of the padded training sequences for evaluation, using the
# notebook-wide seed from the config cell.
# NOTE: this rebinds X_train / X_test. From here on they are padded integer
# sequences from `trn`, not the raw text arrays of the earlier cell, and
# X_test is a slice of the training data. The competition test set is `tst`.
X_train, X_test, Y_train, Y_test = train_test_split(
    trn, y, test_size=0.2, random_state=seed
)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(43903, 400) (43903, 5)
(10976, 400) (10976, 5)


In [31]:
import os
glove_file = 'C:\\Users\\user\\Desktop\\open\\glove.6B.200d.txt'

# Parse the GloVe text file into {word: vector}. Each line is
# "<word> <v1> <v2> ...". The context manager guarantees the handle is closed
# even if a line fails to parse.
embeddings_index = {}
with open(glove_file, encoding='UTF8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# Build the (vocab_size, embedding_dim) matrix of initial weights for the
# Embedding layer. Row i holds the GloVe vector of the word with tokenizer
# index i; words without a GloVe vector keep an all-zero row.
embedding_dim = 200  # matches the 200-d GloVe file; replaces the earlier value
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if i < vocab_size:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Clear Keras global state before the model is built in the next cell.
tf.keras.backend.clear_session()

Found 400000 word vectors.


In [32]:
# Seed the RNGs before any layer is created. The layer weights are created
# while the model is assembled in this cell (the summary below reports their
# parameter counts), so seeding only in the training cell leaves the weight
# initialisation unseeded. Same seed values as the training cell.
tf.random.set_seed(123)
np.random.seed(123)

# Stacked bidirectional LSTM classifier on top of an Embedding layer that is
# initialised from the GloVe matrix.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, weights=[embedding_matrix]))
model.add(SpatialDropout1D(0.4))

# Three BiLSTM layers; the first two return the full sequence so the next
# recurrent layer can consume it, the last one returns a single vector.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(64)))
model.add(BatchNormalization())

# Dense classification head.
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
# One softmax output per class; the width comes from the config cell (n_class).
model.add(Dense(n_class, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 200)          3000000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 400, 200)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 400, 128)          135680    
_________________________________________________________________
batch_normalization (BatchNo (None, 400, 128)          512       
_________________________________________________________________
bidirectional_1 (Bidirection (None, 400, 128)          98816     
_________________________________________________________________
batch_normalization_1 (Batch (None, 400, 128)          512       
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               9

In [33]:
# Seed NumPy and TensorFlow right before training.
np.random.seed(123)
tf.random.set_seed(123)
batch_size = 256

# Train for 5 epochs; Keras sets aside 20% of X_train / Y_train for per-epoch
# validation (separate from the X_test / Y_test hold-out made earlier).
model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=5,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x245cc42cb50>

In [34]:
# Turn predicted probabilities and one-hot labels of the hold-out split into
# class indices (position of the row maximum).
pred = model.predict(X_test).argmax(axis=1)
actual = Y_test.argmax(axis=1)

In [35]:
from sklearn.metrics import accuracy_score

# Accuracy on the hold-out split.
# (original note: 0.6348396501457726 - presumably the score of an earlier run)
acc = accuracy_score(y_true=actual, y_pred=pred)
print(f"Accuracy of LSTM  is {acc}")

Accuracy of LSTM  is 0.6325619533527697


In [41]:
# Predict on the padded competition test sequences.
# NOTE: this rebinds `pred`, which held hold-out class indices until now.
pred = model.predict(tst)
np.shape(pred)

(19617, 5)

In [43]:
# Fill the sample-submission template with the predictions and write it out.
sub = pd.read_csv(sample_file, index_col=0)

# Guard against a stale `pred`: the name is also used for 1-D hold-out class
# indices earlier in the notebook, so running cells out of order would
# otherwise fail with a less helpful message here.
if np.shape(pred) != sub.shape:
    raise ValueError(
        f'pred has shape {np.shape(pred)} but the submission template expects {sub.shape}; '
        'run the test-set prediction cell first'
    )
sub[sub.columns] = pred

# Create the output directory if needed so the write cannot fail at the very
# end of the notebook.
sub_file.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(sub_file)
# (original note: 0.829608 dacon - presumably the leaderboard score of this submission)