In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 7.9MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 44.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 39.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=3cde6efb23b94

# torch.load시 cuda 오류가 나서 모델 훈련하는 부분을 구현한 후 content 폴더에 model.pt로 저장한 후 속도 계산 부분에서 이 model.pt를 불러오는 것으로 대체하였습니다.

*kaggle, local 등 환경에서는 문제 x (코랩 기준 1시간 걸립니다)

content/news_train.csv 넣어주셔야 합니다

*이후 추론 시간 측정 시 공정한 시간 평가를 위해 라이버리 로딩은 다시 넣었습니다

In [7]:
import re
import random
import time
import datetime
import numpy as np      
import pandas as pd       
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

if torch.cuda.is_available():  
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
tokenizer = ElectraTokenizer.from_pretrained('monologg/koelectra-base-v3-discriminator')
model = ElectraForSequenceClassification.from_pretrained('monologg/koelectra-base-v3-discriminator',num_labels=2)
model.cuda()
train = pd.read_csv("/content/news_train.csv")
def preprocess(text):
    # remove hyperlinks
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'http?:\/\/.*[\r\n]*', '', text)
    #remove some puncts (except . ! ?)
    text=re.sub(r'[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '',text)
    text=" ".join(text.split())
    return text
train['clean_text'] = train['content'].apply(preprocess)
texts = train['clean_text'].values
labels = train['info'].values
indices=tokenizer.batch_encode_plus(texts,max_length=120,add_special_tokens=True, padding='max_length',pad_to_max_length=True,truncation=True)

input_ids=indices["input_ids"]
attention_masks=indices["attention_mask"]

# Use 99% for training and 1% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=42)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
                                             random_state=42)
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels, dtype=torch.long)
validation_labels = torch.tensor(validation_labels, dtype=torch.long)
train_masks = torch.tensor(train_masks, dtype=torch.long)
validation_masks = torch.tensor(validation_masks, dtype=torch.long)

batch_size = 32

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

optimizer = AdamW(model.parameters(),lr = 6e-6, eps = 1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)
loss_values = []

print('Training...')
for epoch_i in range(0, epochs):
    print('\n======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    t0 = time.time()
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        if step % 1000 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        b_input_ids = batch[0].to(device)  #   [0]: input ids 
        b_input_mask = batch[1].to(device) #   [1]: attention masks
        b_labels = batch[2].to(device)     #   [2]: labels 
        model.zero_grad() # Always clear any previously calculated gradients before performing a backward pass using pytorch    
        # Perform a forward pass (evaluate the model on this training batch). This will return the loss
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)        
        loss = outputs[0] # The call to `model` always returns a tuple, so we need to pull the loss value out of the tuple.
        total_loss += loss.item()  # Accumulate the training loss over all of the batches for calculate the average loss at the end
        loss.backward() # Perform a backward pass to calculate the gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Clip the norm of the gradients to 1.0. to help prevent the "exploding gradients" problem.
        optimizer.step() # Update parameters and take a step using the computed gradient.
        scheduler.step() # Update the learning rate.
    avg_train_loss = total_loss / len(train_dataloader) # Calculate the average loss over the training data.            
    loss_values.append(avg_train_loss) # Store the loss value for plotting the learning curve.
print("Training complete!")
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
print("\nRunning Validation...")
t0 = time.time()
model.eval() # Put the model in evaluation mode--the dropout layers behave differently during evaluation.
preds=[]
true=[]
# Tracking variables 
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# Evaluate data for one epoch
for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch) # Add batch to GPU
    b_input_ids, b_input_mask, b_labels = batch # Unpack the inputs from our dataloader
    with torch.no_grad(): # Telling the model not to compute or store gradients, saving memory and speeding up validation      
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) # Forward pass, calculate logit predictions, cuz not with labels
    
    logits = outputs[0] # values prior to applying an activation function like the softmax.
    logits = logits.detach().cpu().numpy() # Move logits and labels to CPU
    label_ids = b_labels.to('cpu').numpy() # Move logits and labels to CPU
    preds.append(logits)
    true.append(label_ids)
    
    tmp_eval_accuracy = flat_accuracy(logits, label_ids) # Calculate the accuracy for this batch of test sentences.
    eval_accuracy += tmp_eval_accuracy # Accumulate the total accuracy
    nb_eval_steps += 1 # Track the number of batches

# Combine the predictions for each batch into a single list of 0s and 1s.
flat_predictions = [item for sublist in preds for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
# Combine the correct labels for each batch into a single list.
flat_true_labels = [item for sublist in true for item in sublist]
#print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
#print("  Validation took: {:}\n".format(format_time(time.time() - t0)))
#print("  Classification report\n",classification_report(flat_predictions,flat_true_labels))
print("  Model saving....")
torch.save(model,'/content/model.pt')

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

Training...
Training...
Training...
Training complete!

Running Validation...
  Model saving....


## 데이터 불러오기

In [8]:
import pandas as pd       
test = pd.read_csv('/content/news_test.csv')

## 시간 측정 시작

In [9]:
import time
start = time.time()

## Library  불러오기

In [10]:
import re
import datetime
import numpy as np      
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import transformers
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

if torch.cuda.is_available():  
    device = torch.device("cuda")
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

We will use the GPU: Tesla P100-PCIE-16GB


## pos_Tagger, Tokenizer, pretraind_embedding, Model 불러오기

In [11]:
#config1 = '/content/5.Model/tokenizer_config.json'
#config2 = '/content/5.Model/config.json'
#tokenizer1 = ElectraTokenizer.from_pretrained('/content/5.Model/vocab.txt', cofing=config1)
#model2 = ElectraForSequenceClassification.from_pretrained('/content/5.Model/',num_labels=2,config=config2)
#model2.cuda()
#not working loading model above so substitute the way below

model1 = torch.load('/content/model.pt')

## 형태소 분석 + 전처리

In [12]:
#preprocessing text
def preprocess(text):
    # remove hyperlinks
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'http?:\/\/.*[\r\n]*', '', text)
    #remove some puncts (except . ! ?)
    text=re.sub(r'[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '',text)
    text=" ".join(text.split())
    return text

test['clean_text'] = test['content'].apply(preprocess)
x_text = test['clean_text'].values

#tokenizing and put data in tensor
indices1=tokenizer.batch_encode_plus(x_text,max_length=120,add_special_tokens=True, return_attention_mask=True, padding='max_length',truncation=True)
input_ids1=indices1["input_ids"]
attention_masks1=indices1["attention_mask"]

prediction_inputs1= torch.tensor(input_ids1)
prediction_masks1 = torch.tensor(attention_masks1)

# Set the batch size.  
batch_size = 32 

# Create the DataLoader.
prediction_data1 = TensorDataset(prediction_inputs1, prediction_masks1)
prediction_sampler1 = SequentialSampler(prediction_data1)
prediction_dataloader1 = DataLoader(prediction_data1, sampler=prediction_sampler1, batch_size=batch_size)

## 예측 

*(/content/sample_submission.csv 필요)

In [None]:
print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs1)))
model.eval()
predictions = []
prediction_inputs1= torch.tensor(input_ids1) 
for batch in prediction_dataloader1:
  batch = tuple(t.to(device) for t in batch) # Add batch to GPU
  b_input_ids1, b_input_mask1 = batch # Unpack the inputs from our dataloader
  with torch.no_grad(): # Telling the model not to compute or store gradients, saving memory and speeding up prediction 
      outputs1 = model(b_input_ids1, token_type_ids=None, attention_mask=b_input_mask1) # Forward pass, calculate logit predictions
  logits1 = outputs1[0]
  logits1 = logits1.detach().cpu().numpy() # Move logits and labels to CPU
  predictions.append(logits1) # Store predictions and true labels

flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

sub=pd.read_csv('/content/sample_submission.csv')
sub=pd.DataFrame({'id':sub['id'].values.tolist(),'info':flat_predictions})
#Ad-sentence filtering
test['info'] = sub['info']
train_unique_ad_sentence = train.query('info == "1"')['clean_text'].unique()
test.loc[test['clean_text'].isin(train_unique_ad_sentence),['info']] = 1
sub['info'] = test['info']

In [15]:
print(time.time() - start)

709.2109813690186


# 제출

In [None]:
sub.to_csv('/content/submission.csv')