In [1]:
# Preprocessing
import os
import numpy as np
import pandas as pd
import torch
import random
import re

from tqdm import tqdm
from itertools import combinations
from collections import deque
from transformers import AutoTokenizer
from rank_bm25 import BM25Okapi
from sklearn.model_selection import train_test_split

# Triple-quoted string literals (both quote styles) are collapsed to a placeholder.
# Raw strings keep the regex escapes literal instead of relying on Python passing
# unknown string escapes such as "\w" through unchanged.
_TRIPLE_DOUBLE_QUOTED = re.compile(r'("""[\w\W]*?""")')
_TRIPLE_SINGLE_QUOTED = re.compile(r"('''[\w\W]*?''')")


def _clean_source(source):
    """Strip comments, ``#include`` lines and blank lines from one code string.

    Rules, applied line by line in this order:

    * a line that starts or ends with ``*/`` closes any open block comment and
      is dropped entirely (including any code that precedes a trailing ``*/``);
    * lines inside an open ``/* ... */`` block comment are dropped;
    * lines starting with ``#include`` are dropped;
    * a line starting with ``/*`` is dropped and opens a block comment unless
      ``*/`` also appears on that line;
    * lines starting with ``//`` are dropped;
    * anything from the first ``//`` onwards is cut off (this also truncates
      string literals that contain ``//``, e.g. URLs);
    * every run of four spaces becomes a tab;
    * lines that are empty after the steps above are dropped.

    Afterwards every triple-quoted string literal is replaced with ``<str>``.

    Args:
        source: Source code as a single string, or ``None``.

    Returns:
        The cleaned code as a newline-joined string; ``''`` when ``source`` is
        ``None``.
    """
    if source is None:
        # A missing value is treated as empty code instead of failing on .split().
        return ''

    kept_lines = []
    in_block_comment = False
    for line in source.split('\n'):
        stripped = line.lstrip()

        if stripped.startswith('*/') or line.rstrip().endswith('*/'):
            in_block_comment = False
            continue
        if in_block_comment:
            continue
        if stripped.startswith('#include'):
            continue
        if stripped.startswith('/*'):
            # Stay inside the comment only when it is not closed on this line.
            in_block_comment = '*/' not in line
            continue
        if stripped.startswith('//'):
            continue

        line = line.rstrip()
        if '//' in line:
            line = line[:line.index('//')]  # keep only the code before the comment
        line = line.replace('    ', '\t')  # four spaces -> one tab

        if line.strip() == '':
            continue
        kept_lines.append(line)

    cleaned = '\n'.join(kept_lines)
    cleaned = _TRIPLE_DOUBLE_QUOTED.sub('<str>', cleaned)
    cleaned = _TRIPLE_SINGLE_QUOTED.sub('<str>', cleaned)
    return cleaned


def preprocess_script(code):
    """Clean the ``code1`` and ``code2`` fields of one example in place.

    Both fields go through the same cleaning routine (``_clean_source``), so
    the two sides of a pair are always normalised identically.

    Args:
        code: Mapping with string fields ``'code1'`` and ``'code2'``. Any other
            keys are left untouched.

    Returns:
        The same ``code`` object, with ``'code1'`` and ``'code2'`` replaced by
        their cleaned versions.
    """
    for field in ('code1', 'code2'):
        code[field] = _clean_source(code[field])
    return code

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
# Read valid_data_lv1.csv and display its column names as a quick schema check.
# NOTE(review): the absolute path ties this cell to one machine; consider a configurable data directory.
DDD = pd.read_csv('/home/workspace/DACON/CodeSim/Dataset/valid_data_lv1.csv')
DDD.columns

Index(['code1', 'code2', 'similar'], dtype='object')

In [2]:
import torch
from torch.utils.data import (DataLoader ,RandomSampler, SequentialSampler, TensorDataset)
import torch.nn.functional as f
from torch.utils.data import TensorDataset
from torch.optim import Adam

import transformers
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import AdamW, RobertaConfig
from sklearn.metrics import (accuracy_score, 
                             precision_recall_curve,
                             f1_score,
                             auc)
import torch.nn as nn
from transformers import (AutoConfig, 
                          AutoTokenizer, 
                          RobertaForSequenceClassification,
                          Trainer,
                          TrainingArguments,
                          DataCollatorWithPadding,
                          EarlyStoppingCallback)

from torch.nn import CrossEntropyLoss
from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef, f1_score, recall_score, precision_score
from datasets import concatenate_datasets, load_dataset

# Tokenizer loaded from the 'microsoft/graphcodebert-base' checkpoint.
''' Tokenizer '''
tokenizer = RobertaTokenizer.from_pretrained('microsoft/graphcodebert-base')
tokenizer.truncation_side = 'left' # When the tokenized input exceeds the length limit, drop the overflow from the left side.

def example_fn(examples):
    """Tokenize the ``code1``/``code2`` fields of ``examples`` as a text pair.

    The pair is encoded with the module-level ``tokenizer``, truncated and
    padded to exactly 512 tokens.

    Args:
        examples: Mapping that provides ``'code1'`` and ``'code2'``.

    Returns:
        Whatever the tokenizer call returns for the encoded pair.
    """
    return tokenizer(
        examples['code1'],
        examples['code2'],
        padding='max_length',
        max_length=512,
        truncation=True,
    )

In [50]:
# Only show transformers log messages at error level.
transformers.logging.set_verbosity_error()
# Read test.csv through the datasets CSV loader; the rows are taken from the 'train' key of the result.
testdataset = load_dataset("csv", data_files='/home/workspace/DACON/CodeSim/Dataset/test.csv')['train']

# Step 1: clean both code fields of every row. Step 2: tokenize each pair and
# drop the raw text and 'pair_id' columns (presumably leaving only tokenizer outputs - confirm).
preprocessed = testdataset.map(preprocess_script)
test_dataset = preprocessed.map(example_fn, remove_columns=['code1', 'code2','pair_id'])

# Batches of 16 built by the padding collator; shuffle=False keeps the rows in
# dataset order, which the later positional write into the submission file relies on.
collator = DataCollatorWithPadding(tokenizer=tokenizer)
testloader = DataLoader(test_dataset,
                          batch_size=16,
                          shuffle=False,
                         collate_fn = collator
                          )

Map: 100%|██████████| 595000/595000 [24:41<00:00, 401.72 examples/s] 


In [60]:
# Run Python garbage collection, then ask PyTorch to release its cached CUDA memory.
import gc
gc.collect()
torch.cuda.empty_cache()

In [61]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
load_path = "/home/workspace/DACON/CodeSim/model/models/graphcodebert_Bs16_OptAdamW_ScduLinear_Sm0.0/1-fold/best.pt"

# The checkpoint is deserialised on the CPU first and the whole model is moved
# to the target device afterwards.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(load_path, map_location=torch.device('cpu')))
model.to(device)

model.eval()
progress_bar = tqdm(testloader, total=len(testloader), leave=True, position=0)

# Per-batch logits are collected in a list and concatenated once at the end;
# concatenating onto a growing tensor inside the loop re-copies all previous
# rows on every step.
batch_logits = []
with torch.no_grad():
    for data in progress_bar:
        # Tensor.to() returns a new tensor rather than moving in place, so the
        # result has to be kept. as_tensor accepts tensors, arrays and lists.
        input_ids = torch.as_tensor(data['input_ids']).to(device)
        attention_mask = torch.as_tensor(data['attention_mask']).to(device)
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        batch_logits.append(logits)

# One row of logits per test example, in loader order, on `device`.
one_fold_logits = torch.cat(batch_logits, dim=0)

one_fold_logits


100%|██████████| 37188/37188 [1:15:13<00:00,  8.24it/s]


tensor([[-2.6585,  2.9356],
        [ 1.7737, -1.8628],
        [-0.7754,  0.8949],
        ...,
        [-1.4672,  1.6166],
        [ 1.1838, -1.3130],
        [ 2.1381, -2.3336]], device='cuda:0')

In [66]:
# Move the stacked logits to host memory as a NumPy array. No squeeze here:
# squeeze(0) leaves a multi-row tensor unchanged but collapses a single-row
# result to 1-D, which makes argmax(axis=1) fail.
one_fold_logits_new = one_fold_logits.detach().cpu().numpy()
# Predicted class = index of the largest logit in each row.
all_fold_predictions = np.argmax(one_fold_logits_new, axis=1)
all_fold_predictions

array([1, 0, 1, ..., 1, 0, 0])

In [67]:
# Number of predictions produced; it has to equal the number of rows in the submission file written later.
len(all_fold_predictions)

595000

In [68]:
# Write the predicted labels into the 'similar' column of the sample submission and save it.
# The NumPy array is assigned by position, so the predictions must be in the same
# row order as sample_submission.csv (pandas raises if the lengths differ).
sub = pd.read_csv("/home/workspace/DACON/CodeSim/Dataset/sample_submission.csv")
sub['similar'] = all_fold_predictions
sub.to_csv('/home/workspace/DACON/CodeSim/Dataset/submission_output.csv', index=False)