# Imports and libraries

In [None]:
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re
import os

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional, Input, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras import Sequential
from keras.utils import to_categorical

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Mount Google Drive first; every CSV below is read from the mounted drive.
from google.colab import drive
drive.mount('/content/drive')
# train: texts with an `author` label; test: texts only; sub: submission
# template with one column per author class ('0'..'4').
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/소설/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/소설/test_x.csv")
sub = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/소설/sample_submission.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
train.head()

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [None]:
test.head()

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...


In [None]:
sub.head()

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,0,0,0,0,0
4,4,0,0,0,0,0


# Preprocessing

Removing punctuation

In [None]:
import string
# Characters to strip: ASCII punctuation plus the curly double quotes that
# appear in the texts.
punct = "".join([string.punctuation, "“", "”"])


def remove_punctuation(text_sentence):
    """Return ``text_sentence`` with every item found in ``punct`` removed.

    The input is walked item by item, so for a string this filters out
    individual punctuation characters and keeps everything else in order.
    """
    kept = filter(lambda ch: ch not in punct, text_sentence)
    return "".join(kept)
# Add a punctuation-free copy of the text to both frames, then preview train.
for frame in (train, test):
    frame['text_nopunct'] = frame['text'].apply(remove_punctuation)
train.head()

Unnamed: 0,index,text,author,text_nopunct
0,0,"He was almost choking. There was so much, so m...",3,He was almost choking There was so much so muc...
1,1,"“Your sister asked for it, I suppose?”",2,Your sister asked for it I suppose
2,2,"She was engaged one day as she walked, in per...",1,She was engaged one day as she walked in peru...
3,3,"The captain was in the porch, keeping himself ...",4,The captain was in the porch keeping himself c...
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3,Have mercy gentlemen odin flung up his hands D...


Tokenizer

In [None]:
import re
def tokenize(text_sentence):
    """Split ``text_sentence`` into word tokens.

    A token is a maximal run of regex word characters; every other character
    (spaces, punctuation, quotes) acts as a separator. Unlike splitting on the
    separators, this never yields empty-string tokens when the text starts or
    ends with a separator, and an empty text yields an empty list.

    Args:
        text_sentence: The text to tokenize.

    Returns:
        List of token strings in their original order and casing.
    """
    # Raw string: '\w' is not a valid escape in a normal string literal.
    return re.findall(r'\w+', text_sentence)
# Tokenize both frames, then preview train.
# NOTE(review): this reads the raw `text` column, not `text_nopunct`, so the
# punctuation-removal column is not used by the tokenizer — confirm intended.
for frame in (train, test):
    frame['text_tokenize'] = frame['text'].apply(tokenize)
train.head()

Unnamed: 0,index,text,author,text_nopunct,text_tokenize
0,0,"He was almost choking. There was so much, so m...",3,He was almost choking There was so much so muc...,"[He, was, almost, choking, There, was, so, muc..."
1,1,"“Your sister asked for it, I suppose?”",2,Your sister asked for it I suppose,"[, Your, sister, asked, for, it, I, suppose, ]"
2,2,"She was engaged one day as she walked, in per...",1,She was engaged one day as she walked in peru...,"[, She, was, engaged, one, day, as, she, walke..."
3,3,"The captain was in the porch, keeping himself ...",4,The captain was in the porch keeping himself c...,"[The, captain, was, in, the, porch, keeping, h..."
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3,Have mercy gentlemen odin flung up his hands D...,"[, Have, mercy, gentlemen, odin, flung, up, hi..."


Stopwords

In [None]:
# Run this code only when stopwords should be removed. It is disabled here:
# the whole snippet is wrapped in a string literal, so none of it executes.
# NOTE(review): if enabled, `stopwords = ...` rebinds the name `stopwords`
# that was imported from nltk.corpus in the import cell, and the resulting
# `text_nostopword` column is not read by the lemmatisation cell (which uses
# `text_tokenize`) — confirm the intended wiring before enabling.
'''
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopword(text_sentence): 
    text = [word for word in text_sentence if word not in stopwords]
    return text
train['text_nostopword'] = train['text_tokenize'].apply(lambda x: remove_stopword(x))
test['text_nostopword'] = test['text_tokenize'].apply(lambda x: remove_stopword(x))
train.head()
'''

"\nimport nltk\nnltk.download('stopwords')\nstopwords = nltk.corpus.stopwords.words('english')\ndef remove_stopword(text_sentence): \n    text = [word for word in text_sentence if word not in stopwords]\n    return text\ntrain['text_nostopword'] = train['text_tokenize'].apply(lambda x: remove_stopword(x))\ntest['text_nostopword'] = test['text_tokenize'].apply(lambda x: remove_stopword(x))\ntrain.head()\n"

Lemmatisation

In [None]:
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

def word_lemmatizer(text):
  """Lemmatize every token in ``text``, treating each token as a verb.

  Args:
    text: Iterable of token strings.

  Returns:
    List with one lemma per input token, in the same order.
  """
  # Build the lemmatizer once per call rather than once per token.
  lemmatize = WordNetLemmatizer().lemmatize
  return [lemmatize(token, pos='v') for token in text]

# Lemmatize the token lists of both frames, then preview train.
for frame in (train, test):
    frame["lemmatized"] = frame["text_tokenize"].apply(word_lemmatizer)
train.head()

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,index,text,author,text_nopunct,text_tokenize,lemmatized
0,0,"He was almost choking. There was so much, so m...",3,He was almost choking There was so much so muc...,"[He, was, almost, choking, There, was, so, muc...","[He, be, almost, choke, There, be, so, much, s..."
1,1,"“Your sister asked for it, I suppose?”",2,Your sister asked for it I suppose,"[, Your, sister, asked, for, it, I, suppose, ]","[, Your, sister, ask, for, it, I, suppose, ]"
2,2,"She was engaged one day as she walked, in per...",1,She was engaged one day as she walked in peru...,"[, She, was, engaged, one, day, as, she, walke...","[, She, be, engage, one, day, as, she, walk, i..."
3,3,"The captain was in the porch, keeping himself ...",4,The captain was in the porch keeping himself c...,"[The, captain, was, in, the, porch, keeping, h...","[The, captain, be, in, the, porch, keep, himse..."
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3,Have mercy gentlemen odin flung up his hands D...,"[, Have, mercy, gentlemen, odin, flung, up, hi...","[, Have, mercy, gentlemen, odin, fling, up, hi..."


In [None]:
# Re-join each lemma list into one space-separated string per row.
for frame in (train, test):
    frame["lemmatize_joined"] = frame["lemmatized"].apply(' '.join)
train.head()

Unnamed: 0,index,text,author,text_nopunct,text_tokenize,lemmatized,lemmatize_joined
0,0,"He was almost choking. There was so much, so m...",3,He was almost choking There was so much so muc...,"[He, was, almost, choking, There, was, so, muc...","[He, be, almost, choke, There, be, so, much, s...",He be almost choke There be so much so much he...
1,1,"“Your sister asked for it, I suppose?”",2,Your sister asked for it I suppose,"[, Your, sister, asked, for, it, I, suppose, ]","[, Your, sister, ask, for, it, I, suppose, ]",Your sister ask for it I suppose
2,2,"She was engaged one day as she walked, in per...",1,She was engaged one day as she walked in peru...,"[, She, was, engaged, one, day, as, she, walke...","[, She, be, engage, one, day, as, she, walk, i...",She be engage one day as she walk in peruse J...
3,3,"The captain was in the porch, keeping himself ...",4,The captain was in the porch keeping himself c...,"[The, captain, was, in, the, porch, keeping, h...","[The, captain, be, in, the, porch, keep, himse...",The captain be in the porch keep himself caref...
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3,Have mercy gentlemen odin flung up his hands D...,"[, Have, mercy, gentlemen, odin, flung, up, hi...","[, Have, mercy, gentlemen, odin, fling, up, hi...",Have mercy gentlemen odin fling up his hand D...


In [None]:
# Separate the model inputs (joined lemma strings) for train and test, and
# the training target (author label).
text_column = 'lemmatize_joined'
X_train, X_test = train[text_column].values, test[text_column].values
y = train['author'].values

In [None]:
X_train

array(['He be almost choke There be so much so much he want to say but strange exclamations be all that come from his lips The Pole gaze fixedly at him at the bundle of note in his hand look at odin and be in evident perplexity ',
       ' Your sister ask for it I suppose ',
       ' She be engage one day as she walk in peruse Jane s last letter and dwell on some passages which prove that Jane have not write in spirit when instead of be again surprise by Mr odin she saw on look up that odin be meet her Putting away the letter immediately and force a smile she say ',
       ..., ' Your sincere well wisher friend and sister LUCY odin ',
       ' Then you want me to lend you money ',
       'It certainly have not occur to me before but I say Yes I should like that '],
      dtype=object)

# Tokenization, Padding

In [None]:
# Fit the Keras tokenizer on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# word -> integer index mapping learned from the training texts.
word_index = tokenizer.word_index
# One extra slot on top of the vocabulary: index 0 is the padding value seen
# in the padded arrays, so it needs its own row in the embedding.
vocab_size = 1 + len(word_index)

In [None]:
print(vocab_size)

27188


In [None]:
# Encode train and test texts as lists of vocabulary indices with the fitted tokenizer.
train_sequences, test_sequences = map(tokenizer.texts_to_sequences, (X_train, X_test))

In [None]:
# Longest training sequence; used as the common padded length.
max_length = max(map(len, train_sequences))
print(max_length)

476


In [None]:
# Pad every sequence at the end so all rows have exactly max_length entries.
# NOTE(review): max_length comes from the training set only; longer test
# sequences are cut to max_length by pad_sequences — confirm this is acceptable.
padding_type = 'post'
pad_options = {'padding': padding_type, 'maxlen': max_length}
train_padded = pad_sequences(train_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

In [None]:
train_padded

array([[  11,    2,  242, ...,    0,    0,    0],
       [  48,  294,   98, ...,    0,    0,    0],
       [  26,    2,  684, ...,    0,    0,    0],
       ...,
       [  48, 2382,   74, ...,    0,    0,    0],
       [  70,   12,  160, ...,    0,    0,    0],
       [  13,  317,    9, ...,    0,    0,    0]], dtype=int32)

# Embedding

In [None]:
# Read the GloVe file into a word -> vector dictionary. Each line holds a word
# followed by its vector components, separated by whitespace.
embedding_dict = dict()
# The context manager closes the file even if parsing fails part-way through.
with open('/content/drive/MyDrive/Colab Notebooks/소설/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        word_vector = line.split()
        word = word_vector[0]
        word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
        embedding_dict[word] = word_vector_arr

# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Rows for words missing from GloVe (and the unused row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    temp = embedding_dict.get(word)
    if temp is not None:
        embedding_matrix[i] = temp

In [None]:
embedding_matrix.shape

(27188, 100)

In [None]:
print(tokenizer.word_index.items())



# Modeling

Bi-LSTM with 5-fold stratified cross-validation (n_fold = 5)

In [None]:
# Cross-validation setup: 5 stratified folds over the 5 author classes.
n_fold = 5
n_class = 5
# Fixed random_state so the shuffled fold assignment (and therefore the
# out-of-fold predictions and CV metrics) is the same on every run.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

In [None]:
def get_model():
    """Build and compile a fresh stacked bidirectional-LSTM classifier.

    Layers, in order: an embedding initialised from ``embedding_matrix``, a
    bidirectional LSTM(64) that returns the full sequence, a bidirectional
    LSTM(64) that returns only its last output, and a softmax layer with
    ``n_class`` units. The model is compiled with categorical cross-entropy
    and Adam at a learning rate of 0.01.

    Returns:
        The compiled, untrained Keras model.
    """
    stack = (
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(64)),
        Dense(n_class, activation='softmax'),
    )
    model = Sequential()
    for layer in stack:
        model.add(layer)

    optimizer = Adam(learning_rate=.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model

In [None]:
get_model().summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 476, 100)          2718800   
                                                                 
 bidirectional (Bidirectiona  (None, 476, 128)         84480     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 5)                 645       
                                                                 
Total params: 2,902,741
Trainable params: 2,902,741
Non-trainable params: 0
_________________________________________________________________


In [None]:
# val: out-of-fold class probabilities for every training row.
# test: class probabilities for the test rows, averaged over the folds.
val = np.zeros((train_padded.shape[0], n_class))
# NOTE(review): this rebinds `test`, which held the test DataFrame until now;
# the preprocessing cells cannot be re-run after this point without reloading
# the data. Later cells read this array under the name `test`.
test = np.zeros((test_padded.shape[0], n_class))

for i, (i_train, i_val) in enumerate(cv.split(train_padded, y), 1):
    print(f'training model for CV #{i}')
    # Fresh model per fold so no weights carry over between folds.
    clf = get_model()

    # Stop once val_loss has not improved by at least 0.001 for 3 epochs and
    # roll back to the best weights seen.
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    # num_classes is passed explicitly so the one-hot targets always have
    # n_class columns, matching the softmax output, instead of being inferred
    # from the largest label present in the fold.
    clf.fit(train_padded[i_train],
            to_categorical(y[i_train], num_classes=n_class),
            validation_data=(train_padded[i_val],
                             to_categorical(y[i_val], num_classes=n_class)),
            epochs=10,
            batch_size=512,
            callbacks=[es])
    val[i_val, :] = clf.predict(train_padded[i_val])
    # Each fold contributes 1/n_fold of the final test prediction.
    test += clf.predict(test_padded) / n_fold

training model for CV #1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
training model for CV #2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
training model for CV #3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 6: early stopping
training model for CV #4
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
training model for CV #5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 6: early stopping


In [None]:
# Score the out-of-fold predictions against the true author labels.
cv_accuracy = accuracy_score(y, val.argmax(axis=1)) * 100
cv_log_loss = log_loss(pd.get_dummies(y), val)
print(f'Accuracy (CV): {cv_accuracy:8.4f}%')
print(f'Log Loss (CV): {cv_log_loss:8.4f}')

Accuracy (CV):  75.7466%
Log Loss (CV):   0.6654


In [None]:
test

array([[2.66509065e-02, 4.94228153e-01, 2.94169819e-01, 1.69565224e-01,
        1.53859145e-02],
       [8.11351906e-02, 2.34870344e-01, 3.47555821e-02, 2.46502722e-01,
        4.02736172e-01],
       [9.67791855e-01, 1.07340139e-02, 3.94947255e-03, 1.11718151e-03,
        1.64074956e-02],
       ...,
       [1.66850217e-03, 9.96896073e-01, 3.93960072e-04, 9.11371630e-04,
        1.30092412e-04],
       [3.80059323e-03, 9.94576752e-01, 5.96445941e-04, 7.23903144e-04,
        3.02255747e-04],
       [4.72982059e-01, 6.33203270e-03, 1.13867248e-02, 9.93538619e-03,
        4.99363754e-01]])

In [None]:
# Copy the fold-averaged class probabilities into the submission columns.
prob_columns = [str(label) for label in range(5)]
sub[prob_columns] = test
sub.head()

Unnamed: 0,index,0,1,2,3,4
0,0,0.026651,0.494228,0.29417,0.169565,0.015386
1,1,0.081135,0.23487,0.034756,0.246503,0.402736
2,2,0.967792,0.010734,0.003949,0.001117,0.016407
3,3,0.003035,0.007763,0.967378,0.005473,0.016352
4,4,0.958567,0.004431,0.004888,0.003272,0.028842


Bi-LSTM with 10-fold stratified cross-validation (n_fold = 10)

In [None]:
# Cross-validation setup: 10 stratified folds over the 5 author classes.
n_fold = 10
n_class = 5
# Fixed random_state so the shuffled fold assignment (and therefore the
# out-of-fold predictions and CV metrics) is the same on every run.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

In [None]:
# NOTE(review): this redefines get_model with the same architecture as the
# definition in the 5-fold section; it is only needed if that cell was not run.
def get_model():
    """Build and compile a fresh stacked bidirectional-LSTM classifier.

    Layers, in order: an embedding initialised from ``embedding_matrix``, a
    bidirectional LSTM(64) that returns the full sequence, a bidirectional
    LSTM(64) that returns only its last output, and a softmax layer with
    ``n_class`` units. The model is compiled with categorical cross-entropy
    and Adam at a learning rate of 0.01.

    Returns:
        The compiled, untrained Keras model.
    """
    stack = (
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(64)),
        Dense(n_class, activation='softmax'),
    )
    model = Sequential()
    for layer in stack:
        model.add(layer)

    optimizer = Adam(learning_rate=.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model

In [None]:
# val: out-of-fold class probabilities for every training row.
# test: class probabilities for the test rows, averaged over the folds.
val = np.zeros((train_padded.shape[0], n_class))
# NOTE(review): `test` is bound to a prediction array here, not to the test
# DataFrame loaded at the top of the notebook; the preprocessing cells cannot
# be re-run after this point without reloading the data.
test = np.zeros((test_padded.shape[0], n_class))

for i, (i_train, i_val) in enumerate(cv.split(train_padded, y), 1):
    print(f'training model for CV #{i}')
    # Fresh model per fold so no weights carry over between folds.
    clf = get_model()

    # Stop once val_loss has not improved by at least 0.001 for 3 epochs and
    # roll back to the best weights seen.
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    # num_classes is passed explicitly so the one-hot targets always have
    # n_class columns, matching the softmax output, instead of being inferred
    # from the largest label present in the fold.
    clf.fit(train_padded[i_train],
            to_categorical(y[i_train], num_classes=n_class),
            validation_data=(train_padded[i_val],
                             to_categorical(y[i_val], num_classes=n_class)),
            epochs=10,
            batch_size=512,
            callbacks=[es])
    val[i_val, :] = clf.predict(train_padded[i_val])
    # Each fold contributes 1/n_fold of the final test prediction.
    test += clf.predict(test_padded) / n_fold

training model for CV #1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
training model for CV #2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
training model for CV #3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
training model for CV #4
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
training model for CV #5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
training model for CV #6
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
training model for CV #7
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
training model for CV #8
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
training model for CV #9
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
training model for CV #10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch

In [None]:
# Score the out-of-fold predictions against the true author labels.
cv_accuracy = accuracy_score(y, val.argmax(axis=1)) * 100
cv_log_loss = log_loss(pd.get_dummies(y), val)
print(f'Accuracy (CV): {cv_accuracy:8.4f}%')
print(f'Log Loss (CV): {cv_log_loss:8.4f}')

Accuracy (CV):  75.9689%
Log Loss (CV):   0.6474


In [None]:
test

array([[2.55353956e-02, 3.68113535e-01, 4.94604130e-01, 8.80179126e-02,
        2.37290271e-02],
       [2.19663911e-01, 2.43203456e-01, 2.75160638e-02, 1.49696124e-01,
        3.59920444e-01],
       [9.74712759e-01, 1.15908732e-02, 3.32887074e-03, 2.66769125e-03,
        7.69977899e-03],
       ...,
       [2.23116553e-03, 9.95749682e-01, 3.32524729e-04, 1.48731340e-03,
        1.99288956e-04],
       [4.10498829e-03, 9.94176351e-01, 5.23774414e-04, 8.54361879e-04,
        3.40532125e-04],
       [5.99746912e-01, 3.78990710e-03, 1.28036388e-02, 3.94847174e-02,
        3.44174812e-01]])

In [None]:
# Copy the fold-averaged class probabilities into the submission columns.
prob_columns = [str(label) for label in range(5)]
sub[prob_columns] = test
sub.head()

Unnamed: 0,index,0,1,2,3,4
0,0,0.025535,0.368114,0.494604,0.088018,0.023729
1,1,0.219664,0.243203,0.027516,0.149696,0.35992
2,2,0.974713,0.011591,0.003329,0.002668,0.0077
3,3,0.00228,0.006155,0.976947,0.003542,0.011076
4,4,0.873679,0.008809,0.013819,0.032337,0.071357


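Additional experiment (with stopword removal): enable the stopword cell above, then repeat the same steps from lemmatisation through modelling. For the stopword removal to take effect, the lemmatisation cell must read `text_nostopword` instead of `text_tokenize`. Restart the kernel and reload the data first, because `test` is rebound to a prediction array in the modelling cells.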