In [1]:
import os
import numpy as np
import pandas as pd
import torch
import random
import re

from tqdm import tqdm
from itertools import combinations
from collections import deque
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

### Preprocessing

In [2]:
def preprocess_script(script):
    new_script = deque()
    with open(script,'r',encoding='utf-8') as file:
        lines = file.readlines()
        for line in lines:
            if line.lstrip().startswith('#'): 
                continue
            line = line.rstrip()
            if '#' in line:
                line = line[:line.index('#')] 
            line = line.replace('\n','') 
            line = line.replace('    ','\t') 
            
            if line == '': 
                continue
            
            new_script.append(line)
            
        new_script = '\n'.join(new_script) 
        new_script = re.sub('("""[\w\W]*?""")', '<str>', new_script)
        new_script = re.sub("('''[\w\W]*?''')", '<str>', new_script)
        new_script = re.sub('/^(http?|https?):\/\/([a-z0-9-]+\.)+[a-z0-9]{2,4}.*$/', '', new_script)
    
    return new_script

def seed_everything(seed=1004):
  random.seed(seed)
  np.random.seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  print(f"Seed set as {seed}")

seed_everything(1004)

Seed set as 1004


In [3]:
code_folder = "./input/code/"
problem_folders = os.listdir(code_folder) 

In [4]:
preprocess_scripts = []
problem_nums = []

for problem_folder in tqdm(problem_folders):
    scripts = os.listdir(os.path.join(code_folder, problem_folder)) 
    problem_num = problem_folder 
    for script in scripts:
        script_file = os.path.join(code_folder,problem_folder,script)
        preprocessed_script = preprocess_script(script_file)

        preprocess_scripts.append(preprocessed_script)

    problem_nums.extend([problem_num]*len(scripts))

100%|██████████| 300/300 [00:04<00:00, 68.74it/s]


In [5]:
df = pd.DataFrame(data= {'code':preprocess_scripts, 'problem_num':problem_nums})

In [6]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = 'left'
MAX_LEN = 512

tokens = []
for code in df['code']:
    tokens.append(tokenizer.tokenize(code, max_length=MAX_LEN, truncation=True))

df['tokens'] = tokens
df['len'] = df['tokens'].apply(len)

In [7]:
train_df, valid_df, train_label, valid_label = train_test_split(
        df,
        df['problem_num'],
        random_state=42,
        test_size=0.1,
        stratify=df['problem_num']
    )

In [34]:
codes = train_df['code'].to_list()
problems = train_df['problem_num'].unique().tolist()
problems.sort()

total_positive_pairs = []
total_negative_pairs = []

for problem in tqdm(problems):
    solution_codes = train_df[train_df['problem_num'] == problem]['code'].to_list()
    other_codes = train_df[train_df['problem_num'] != problem]['code'].to_list()
    
    positive_pairs = list(combinations(solution_codes,2))
    random.shuffle(positive_pairs)
    positive_pairs = positive_pairs[:100]
    random.shuffle(other_codes)
    other_codes = other_codes[:100]
    
    negative_pairs = []
    for pos_codes, others in zip(positive_pairs, other_codes):
        negative_pairs.append((pos_codes[0], others))
    
    total_positive_pairs.extend(positive_pairs)
    total_negative_pairs.extend(negative_pairs)

100%|██████████| 300/300 [00:09<00:00, 33.31it/s]


In [35]:
code1 = [code[0] for code in total_positive_pairs] + [code[0] for code in total_negative_pairs]
code2 = [code[1] for code in total_positive_pairs] + [code[1] for code in total_negative_pairs]
label = [1]*len(total_positive_pairs) + [0]*len(total_negative_pairs)

train_data = pd.DataFrame(data={'code1':code1, 'code2':code2, 'similar':label})
train_data = train_data.sample(frac=1).reset_index(drop=True) # frac: 추출할 표본 비율
train_data.to_csv('train_data_lv1.csv',index=False)

In [14]:
codes = valid_df['code'].to_list() 
problems = valid_df['problem_num'].unique().tolist() 
problems.sort()

total_positive_pairs = []
total_negative_pairs = []

for problem in tqdm(problems):
    solution_codes = valid_df[valid_df['problem_num'] == problem]['code'].to_list()
    other_codes = valid_df[valid_df['problem_num'] != problem]['code'].to_list()

    positive_pairs = list(combinations(solution_codes,2))
    random.shuffle(positive_pairs)
    positive_pairs = positive_pairs[:100]
    random.shuffle(other_codes)
    other_codes = other_codes[:100]
    
    negative_pairs = []
    for pos_codes, others in zip(positive_pairs, other_codes):
        negative_pairs.append((pos_codes[0], others))
    
    total_positive_pairs.extend(positive_pairs)
    total_negative_pairs.extend(negative_pairs)

100%|██████████| 300/300 [00:01<00:00, 270.99it/s]


In [15]:
code1 = [code[0] for code in total_positive_pairs] + [code[0] for code in total_negative_pairs]
code2 = [code[1] for code in total_positive_pairs] + [code[1] for code in total_negative_pairs]
label = [1]*len(total_positive_pairs) + [0]*len(total_negative_pairs)

valid_data = pd.DataFrame(data={'code1':code1, 'code2':code2, 'similar':label})
valid_data = valid_data.sample(frac=1).reset_index(drop=True) 
valid_data.to_csv('valid_data_lv1.csv',index=False)

In [16]:
def seed_everything(seed):
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  np.random.seed(seed)
  random.seed(seed)

seed_everything(1004)

In [36]:
from datasets import load_metric, load_dataset, Dataset, concatenate_datasets
train_dset = load_dataset("csv", data_files='train_data_lv1.csv')['train']
validation_dset = load_dataset("csv", data_files='valid_data_lv1.csv')['train']
rawdataset = concatenate_datasets([train_dset, validation_dset])

Using custom data configuration default-c8e4987d7c799e22


Downloading and preparing dataset csv/default to C:/Users/johana/.cache/huggingface/datasets/csv/default-c8e4987d7c799e22/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1003.18it/s]


Dataset csv downloaded and prepared to C:/Users/johana/.cache/huggingface/datasets/csv/default-c8e4987d7c799e22/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 100.22it/s]
Using custom data configuration default-07c938b99a36b8d1
Found cached dataset csv (C:/Users/johana/.cache/huggingface/datasets/csv/default-07c938b99a36b8d1/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
100%|██████████| 1/1 [00:00<00:00, 250.62it/s]


### Train

In [39]:
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import pandas as pd
import numpy as np
from datasets import load_dataset, load_metric

MODEL = "klue/bert-base"
MAX_LEN = 256
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def example_fn(examples):
    outputs = tokenizer(examples['code1'], examples['code2'], padding=True, max_length=MAX_LEN,truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs

rawdataset = rawdataset.map(example_fn, remove_columns=['code1', 'code2', 'similar'])
rawdataset = rawdataset.train_test_split(0.1)

_collator = DataCollatorWithPadding(tokenizer=tokenizer)
_metric = load_metric("glue", "sst2")

def metric_fn(p):
    preds, labels = p
    output =  _metric.compute(references=labels, predictions=np.argmax(preds, axis=-1))
    return output
    
model = BertForSequenceClassification.from_pretrained(MODEL) 

args = TrainingArguments(
    'runs/',
    per_device_train_batch_size=32,
    num_train_epochs=3,
    do_train=True,
    do_eval=True,
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    report_to="wandb"
)

trainer = Trainer(
        model=model,
        args=args,
        data_collator=_collator,
        train_dataset=rawdataset["train"],
        eval_dataset=rawdataset["test"],
        tokenizer=tokenizer,
        compute_metrics=metric_fn)

trainer.train()

loading configuration file config.json from cache at C:\Users\johana/.cache\huggingface\hub\models--klue--bert-base\snapshots\812449f1a6bc736e693db7aa0e513e5e90795a62\config.json
Model config BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading file vocab.txt from cache at C:\Users\johana/.cache\huggingface\hub\models--klue--bert-base\snapshots\812449f1a6bc736e693db7aa0e513e5e90795a62\vocab.txt
loading file tokenizer.json from cach

{'loss': 0.23, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


                                                    
 33%|███▎      | 3375/10125 [16:54<32:04,  3.51it/s]Saving model checkpoint to runs/checkpoint-3375
Configuration saved in runs/checkpoint-3375\config.json


{'eval_loss': 0.16544044017791748, 'eval_accuracy': 0.93125, 'eval_runtime': 42.9684, 'eval_samples_per_second': 279.275, 'eval_steps_per_second': 34.909, 'epoch': 1.0}


Model weights saved in runs/checkpoint-3375\pytorch_model.bin
tokenizer config file saved in runs/checkpoint-3375\tokenizer_config.json
Special tokens file saved in runs/checkpoint-3375\special_tokens_map.json
 67%|██████▋   | 6750/10125 [33:09<16:16,  3.46it/s]   ***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8


{'loss': 0.1407, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


                                                    
 67%|██████▋   | 6750/10125 [33:53<16:16,  3.46it/s]Saving model checkpoint to runs/checkpoint-6750
Configuration saved in runs/checkpoint-6750\config.json


{'eval_loss': 0.13777361810207367, 'eval_accuracy': 0.944, 'eval_runtime': 44.4703, 'eval_samples_per_second': 269.843, 'eval_steps_per_second': 33.73, 'epoch': 2.0}


Model weights saved in runs/checkpoint-6750\pytorch_model.bin
tokenizer config file saved in runs/checkpoint-6750\tokenizer_config.json
Special tokens file saved in runs/checkpoint-6750\special_tokens_map.json
100%|██████████| 10125/10125 [50:10<00:00,  3.44it/s]  ***** Running Evaluation *****
  Num examples = 12000
  Batch size = 8


{'loss': 0.0945, 'learning_rate': 0.0, 'epoch': 3.0}


                                                     
100%|██████████| 10125/10125 [50:55<00:00,  3.44it/s]Saving model checkpoint to runs/checkpoint-10125
Configuration saved in runs/checkpoint-10125\config.json


{'eval_loss': 0.12032566219568253, 'eval_accuracy': 0.9549166666666666, 'eval_runtime': 44.7159, 'eval_samples_per_second': 268.361, 'eval_steps_per_second': 33.545, 'epoch': 3.0}


Model weights saved in runs/checkpoint-10125\pytorch_model.bin
tokenizer config file saved in runs/checkpoint-10125\tokenizer_config.json
Special tokens file saved in runs/checkpoint-10125\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 10125/10125 [50:57<00:00,  3.31it/s]

{'train_runtime': 3057.1481, 'train_samples_per_second': 105.981, 'train_steps_per_second': 3.312, 'train_loss': 0.15506593303915894, 'epoch': 3.0}





TrainOutput(global_step=10125, training_loss=0.15506593303915894, metrics={'train_runtime': 3057.1481, 'train_samples_per_second': 105.981, 'train_steps_per_second': 3.312, 'train_loss': 0.15506593303915894, 'epoch': 3.0})

### Test

In [41]:
import pandas as pd

TEST = "input/test.csv"
SUB = "input/sample_submission.csv"

testdataset = load_dataset("csv", data_files=TEST)['train']
test_dataset = testdataset.map(example_fn, remove_columns=['code1', 'code2'])

predictions = trainer.predict(test_dataset)

df = pd.read_csv(SUB)
df['similar'] = np.argmax(predictions.predictions, axis=-1)
df.to_csv('submission.csv', index=False)

Using custom data configuration default-f16767c7ad6bd23c
Found cached dataset csv (C:/Users/johana/.cache/huggingface/datasets/csv/default-f16767c7ad6bd23c/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
100%|██████████| 1/1 [00:00<00:00, 77.44it/s]
100%|██████████| 179700/179700 [02:53<00:00, 1034.03ex/s]
The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 8
100%|██████████| 22463/22463 [10:55<00:00, 34.25it/s]
