In [1]:
import numpy as np
import pandas as pd
import re
from tqdm.notebook import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = '/content/drive/MyDrive/Colab Notebooks/nlp 스터디/'
train = pd.read_csv(path+"train_eng.csv")
test = pd.read_csv(path+"test_eng2.csv")

In [None]:
train

Unnamed: 0,index,eng_title,topic_idx
0,0,Incheon → Finland Flight Cancellation...Travel...,4
1,1,We're going to go beyond Silicon Valley.Google...,4
2,2,The solution to easing tensions in Iran's fore...,4
3,3,Lighting special relationships between Korean ...,4
4,4,Hope to reach out to Xi Jinping Trump as soon ...,4
...,...,...,...
43470,45649,KB Financial Group has partnered with IB Stife...,1
43471,45650,Reviewing postponement and closure of school o...,2
43472,45651,Bulletin Board Kiwoom Securities 2020 Kiwoom H...,1
43473,45652,"The answer is Bae Ki-dong, director of the Nat...",2


In [None]:
train['eng_title'][177]

'Ministry of National Defense 하라 Stop frivolous behavior at the threat of South Korea-U.S. training…Comprehensive punishment for provocation.'

In [None]:
re.findall('[가-힣]+', train['eng_title'][177])

[]

In [None]:
test.isnull().sum()

index         0
eng_title    43
dtype: int64

# 전처리

In [4]:
def cleaner(title):
    # remove none alphanumeric and aposthrope
    title = "".join(re.sub("([^0-9A-Za-z \t'])"," ", title))
    
    # remove extra whitespace
    title = " ".join(title.split())
    
    # remove leading and trailing space
    title = title.strip()
    return title

In [5]:
train['eng_title'] = train['eng_title'].apply(lambda x: cleaner(x))
test['eng_title'] = test['eng_title'].apply(lambda x: cleaner(x))

길이분포 확인하고

In [None]:
train = train[train['eng_title'].apply(lambda x: len(x.split()) > 3)]

In [None]:
train['topic_idx'].value_counts()

2    7193
4    7049
5    6729
6    6255
1    5852
3    5698
0    4614
Name: topic_idx, dtype: int64

## k-fold

In [6]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 12.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.6 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 43.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 35.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 48.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fou

In [7]:
import transformers
from transformers import BertTokenizer,AdamWeightDecay,TFRobertaModel,TFBertModel

import tensorflow as tf
import keras
from keras.callbacks import EarlyStopping,ModelCheckpoint

import sklearn
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import StratifiedKFold
import random
import os

In [8]:
def reset_seeds(seed, reset_graph_with_backend=None):
    if reset_graph_with_backend is not None:
        K = reset_graph_with_backend
        K.clear_session()
        tf.compat.v1.reset_default_graph()
        print("KERAS AND TENSORFLOW GRAPHS RESET")  

    np.random.seed(seed)
    random.seed(seed+100)
    tf.compat.v1.set_random_seed(seed+200)
    os.environ['CUDA_VISIBLE_DEVICES'] = ''  
    print("RANDOM SEEDS RESET {}".format(seed))  

SEED = 1514
reset_seeds(SEED)

RANDOM SEEDS RESET 1514


In [9]:
# Cross validation, StratifiedKfold
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
folds=[]

# Unusual sentences must be included in the training data
# Seperate train set and validation set in each folds
for train_idx, valid_idx in skf.split(train, train['topic_idx']):
    train_idx = np.array(list(set(list(train_idx))))
    valid_idx = np.array(list(set(set(valid_idx))))
    folds.append((train_idx, valid_idx))

In [13]:
def convert_data(data_df,case,mask_token):
    global tokenizer
    
    tokens, masks, segments, targets = [], [], [], []
    
    for i in tqdm(range(len(data_df))):
        # tokenize
        token = tokenizer.encode(data_df[DATA_COLUMN][i], max_length=SEQ_LEN, padding='max_length',truncation=True)
       
        # making input mask
        num_zeros = token.count(mask_token)
        mask = [1]*(SEQ_LEN-num_zeros) + [0]*num_zeros
        
        # making segment
        segment = [0]*SEQ_LEN
 
        # token, mask, segment
        tokens.append(token)
        masks.append(mask)
        segments.append(segment)
        
        if case=='train':
          # label values
          targets.append(data_df[LABEL_COLUMN][i])
 
    # convert to array format    
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    if case=='train':
      targets = np.array(targets)

    if case=='train':
       return [tokens, masks, segments], targets
    if case=='test':
       return [tokens, masks, segments]

In [14]:
# Load data and convert to bert input format
def load_data(pandas_dataframe,case,mask_token):
    data_df = pandas_dataframe
    data_df[DATA_COLUMN] = data_df[DATA_COLUMN].astype(str)
    if case=='train':
      data_df[LABEL_COLUMN] = data_df[LABEL_COLUMN].astype(int)
      data_x, data_y = convert_data(data_df,'train',mask_token)
      return data_x, data_y
    if case=='test':
      data_x = convert_data(data_df,'test',mask_token)
      return data_x

In [15]:
# Main Tokenizer used in RobertaModel
tokenizer = BertTokenizer.from_pretrained('klue/roberta-small')

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

In [None]:
# Define max_len
SEQ_LEN = 20
DATA_COLUMN = "eng_title"
LABEL_COLUMN = "topic_idx"
 
# train
train_x0, train_y0 = load_data(train.iloc[folds[0][0]].reset_index(drop=True),'train',1)
train_x1, train_y1 = load_data(train.iloc[folds[1][0]].reset_index(drop=True),'train',1)
train_x2, train_y2 = load_data(train.iloc[folds[2][0]].reset_index(drop=True),'train',1)
train_x3, train_y3 = load_data(train.iloc[folds[3][0]].reset_index(drop=True),'train',1)


valid_x0, valid_y0 = load_data(train.iloc[folds[0][1]].reset_index(drop=True),'train',1)
valid_x1, valid_y1 = load_data(train.iloc[folds[1][1]].reset_index(drop=True),'train',1)
valid_x2, valid_y2 = load_data(train.iloc[folds[2][1]].reset_index(drop=True),'train',1)
valid_x3, valid_y3 = load_data(train.iloc[folds[3][1]].reset_index(drop=True),'train',1)

  0%|          | 0/32542 [00:00<?, ?it/s]

  0%|          | 0/32542 [00:00<?, ?it/s]

  0%|          | 0/32543 [00:00<?, ?it/s]

  0%|          | 0/32543 [00:00<?, ?it/s]

  0%|          | 0/10848 [00:00<?, ?it/s]

  0%|          | 0/10848 [00:00<?, ?it/s]

  0%|          | 0/10847 [00:00<?, ?it/s]

  0%|          | 0/10847 [00:00<?, ?it/s]

  0%|          | 0/9131 [00:00<?, ?it/s]

In [16]:
# Define max_len
SEQ_LEN = 20
DATA_COLUMN = "eng_title"
LABEL_COLUMN = "topic_idx"

# test
test_x = load_data(test,'test',1)

  0%|          | 0/9131 [00:00<?, ?it/s]

# Albert

In [None]:
albert_url='https://tfhub.dev/tensorflow/albert_en_base/2'
encoder = hub.KerasLayer(albert_url)
preprocessor_url="https://tfhub.dev/tensorflow/albert_en_preprocess/3"
preprocessor = hub.KerasLayer(preprocessor_url) 

In [None]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
encoder_inputs = preprocessor(text_input)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]     
embedding_model = tf.keras.Model(text_input, pooled_output) 

In [None]:
model = tf.keras.Sequential()
model.add(embedding_model)
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(30, activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(1))
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])
history = model.fit(train_data.shuffle(10000).batch(128), epochs=10,validation_data=validation_data.batch(128),verbose=1) 

# bert

In [None]:
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers

from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizerFast
from transformers import TFBertModel

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased",do_lower_case=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
MAX_LEN=128

def bert_tokenize(data,max_len=MAX_LEN) :
    input_ids = []
    attention_masks = []
    for title in data:
        encoded = tokenizer.encode_plus(
            title,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)

In [None]:
train_input_ids, train_attention_masks = bert_tokenize(x_train, MAX_LEN)
val_input_ids, val_attention_masks = bert_tokenize(x_valid, MAX_LEN)
test_input_ids, test_attention_masks = bert_tokenize(x_test, MAX_LEN)

In [None]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

In [None]:
def create_model(bert_model, max_len=MAX_LEN):
    
    # parameter
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    loss = tf.keras.losses.CategoricalCrossentropy()
    accuracy = tf.keras.metrics.CategoricalAccuracy()

    input_ids = tf.keras.Input(shape=(max_len,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,),dtype='int32')
    
    bert = bert_model([input_ids,attention_masks])[1]
    dropout = tf.keras.layers.Dropout(0.2)(bert)
    output = tf.keras.layers.Dense(5, activation="softmax")(dropout)
    
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks], outputs = output)
    model.compile(optimizer, loss=loss, metrics=accuracy)
    
    return model

In [None]:
model = create_model(bert_model, MAX_LEN)
model.summary()

In [None]:
EPOCHS = 10
BATCH_SIZE = 32
checkpoint_filepath = '/BERT'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_categorical_accuracy',
    mode='max',
    save_best_only=True)

train_history = model.fit([train_input_ids,train_attention_masks], 
                          y_train, 
                          validation_data=([val_input_ids,val_attention_masks], y_valid),
                          epochs=EPOCHS, 
                          batch_size=BATCH_SIZE,
                          callbacks=[model_checkpoint_callback])

In [None]:
plt.plot(train_history.history['categorical_accuracy'])
plt.plot(train_history.history['val_categorical_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
plt.plot(train_history.history['loss'])
plt.plot(train_history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# model.load_weights(checkpoint_filepath)

y_pred = model.predict([test_input_ids,test_attention_masks])

y_pred = np.argmax(y_pred, axis=1)
y_test = np.argmax(y_test, axis=1)

In [None]:
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred) # Also gives the accuracy for the two lists actual and pred
print("Accuracy: %.2lf " % (accuracy*100))

# Robert

In [10]:
# Define RobertaModel using pretrained model
class Klue_RobertaClassifier(tf.keras.Model):
    def __init__(self, num_class):
        super(Klue_RobertaClassifier, self).__init__()

        self.bert = TFRobertaModel.from_pretrained("klue/roberta-small", from_pt=True)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_class, kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range,seed=42), 
                                                name="classifier")
        
    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        
        # outputs value : sequence_output, pooled_output, (hidden_states), (attentions)
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)

        return logits

klue_roberta_model = Klue_RobertaClassifier(num_class=7)

Downloading:   0%|          | 0.00/545 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/260M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.embeddings.position_ids', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream 

In [None]:
# Learn about each fold around the for gate and store the best weights.
for i in range(4):
  print('########## Fold {} : \n'.format(i))

  klue_roberta_model = Klue_RobertaClassifier(num_class=7)

  # Defining loss function, optimizer and metric
  optimizer = AdamWeightDecay(1e-5,weight_decay_rate=1e-4)
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
  klue_roberta_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

  # Adding an ealrystop to prevent overfitting
  earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)

  checkpoint_path = os.path.join(path,'weight_klue_roberta_back_skf_fold_v0{}.h4'.format(i))
  cp_callback = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

  # Training
  history = klue_roberta_model.fit(globals()['train_x{}'.format(i)],globals()['train_y{}'.format(i)], epochs=3, batch_size=32,
                             validation_data=(globals()['valid_x{}'.format(i)],globals()['valid_y{}'.format(i)]), callbacks=[earlystop_callback, cp_callback])

  klue_roberta_model.load_weights(path+'weight_klue_roberta_back_skf_fold_v0{}.h4'.format(i))

  preds=tf.argmax(klue_roberta_model.predict(globals()['valid_x{}'.format(i)]),axis=1)

  print('Validation set ACC: ',accuracy_score(globals()['valid_y{}'.format(i)],preds))
  print('Validation set Confusion Matrix: \n',confusion_matrix(globals()['valid_y{}'.format(i)],preds))

In [17]:
# Load all weights and predict labels toward test data
for i in range(0,3):
 klue_roberta_model.load_weights(path+'weight_klue_roberta_back_skf_fold_v0{}.h4'.format(i))
 globals()['results_{}'.format(i)] = klue_roberta_model.predict(test_x)

# Save forecasts list toward test data
results_test_list=[results_0,results_1,results_2]
np.save(path+'results_klue_roberta_back_list_v11.npy',results_test_list)

In [18]:
sample_submission= pd.read_csv(path+"sample_submission.csv")

In [12]:
results_test_list = np.load(path+'results_klue_roberta_back_list_v11.npy')

results_test_list

array([[[ 4.6140532e+00, -2.0628411e-03,  8.4564704e-01, ...,
         -7.0508617e-01, -2.1385651e+00, -2.2521629e+00],
        [-6.8166751e-01, -1.9924812e+00,  1.5916886e+00, ...,
         -7.5838870e-01, -1.3785888e+00,  6.8644577e-01],
        [ 2.1258705e+00,  3.1886783e-01,  1.5837773e+00, ...,
         -6.9460160e-01, -2.1543012e+00, -1.5011481e+00],
        ...,
        [ 7.0677745e-01, -1.3103632e+00,  3.1125777e+00, ...,
          1.1258031e+00, -2.5331645e+00, -5.5323231e-01],
        [ 1.3964168e+00, -2.9586747e-01,  3.5535746e+00, ...,
         -1.6117247e+00, -1.0175514e+00, -1.3157556e+00],
        [ 2.7455626e+00, -1.2438782e+00,  2.5055802e+00, ...,
         -2.1509044e+00, -2.6992688e+00,  2.4047971e-01]],

       [[ 2.7279150e+00, -2.4453467e-01,  1.8686670e+00, ...,
         -1.2829285e+00, -2.3881986e+00, -2.0761130e+00],
        [-1.4613583e+00, -1.7834041e+00,  1.5890553e+00, ...,
         -1.9689643e+00, -1.5724789e+00,  1.5640781e+00],
        [ 8.0358434e-01, 

In [14]:
results_klue_roberta_eng_list_wyj= (results_test_list[0]+ results_test_list[1]+results_test_list[2])/3
results_klue_roberta_eng_list_wyj

array([[ 3.6581144 , -0.12749662,  1.35749   , ..., -0.8827978 ,
        -2.3636093 , -2.2534618 ],
       [-1.1242046 , -1.9530278 ,  1.2224592 , ..., -1.3597864 ,
        -1.5367724 ,  0.85666496],
       [ 1.1478622 ,  0.29038358,  1.9780225 , ..., -0.50912017,
        -2.2102149 , -0.66596293],
       ...,
       [ 0.27877712, -1.2320801 ,  3.177211  , ...,  1.2234738 ,
        -1.7332608 , -1.0334452 ],
       [ 1.1745554 , -0.19975084,  3.188596  , ..., -1.4523762 ,
        -1.5077481 , -1.7915941 ],
       [ 1.8743086 , -1.434657  ,  2.6913445 , ..., -2.128713  ,
        -2.624015  ,  0.99845344]], dtype=float32)

In [15]:
np.save(path+'results_klue_roberta_eng_list_wyj.npy',results_klue_roberta_eng_list_wyj)

In [19]:
# Calculate forecasts for final test data by averaging all forecasts
test_pred=tf.argmax((results_test_list[0]+results_test_list[1]+results_test_list[2])/3,axis=1)
sample_submission['topic_idx']=test_pred
pd.merge(test,sample_submission).head(30)

Unnamed: 0,index,eng_title,topic_idx
0,45654,YouTube will run a space for creators until th...,0
1,45655,"On Parents' Day, it became clear and cloudy, s...",3
2,45656,"Starting next year, the number of papers will ...",2
3,45657,"Kim Myung-ja, the new chairman of the class pr...",2
4,45658,"2 new novels, including gray man writer Kim Do...",3
5,45659,Do a live broadcast outside. Action cam's excl...,2
6,45660,"Entering the Leo River, the outpost of the rou...",5
7,45661,Going to work in the midst of fine dust.,3
8,45662,WhatsApp tax of 230 won led to the resignation...,2
9,45663,Vietnam's economy continues to grow 6.71 perce...,4


In [None]:
# Final submission
sample_submission.to_csv(os.path.join(path,'klue_roberta_back_skf_v01.csv'), index=False)