In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install rank_bm25
!pip install transformers



In [None]:
import os
from tqdm import tqdm
import pandas as pd
import re
from rank_bm25 import BM25Okapi
from itertools import combinations
import torch
from transformers import AutoTokenizer
from glob import glob


In [None]:
base_path = '/content/drive/MyDrive/Colab Notebooks/open'
train_path = os.path.join(base_path, 'train_code')
data_path = os.path.join(base_path, 'dataframe')

dir_list = glob(os.path.join(train_path, '*'))
dir_list.sort()
dir_list[:5]

['/content/drive/MyDrive/Colab Notebooks/open/train_code/problem001',
 '/content/drive/MyDrive/Colab Notebooks/open/train_code/problem002',
 '/content/drive/MyDrive/Colab Notebooks/open/train_code/problem003',
 '/content/drive/MyDrive/Colab Notebooks/open/train_code/problem004',
 '/content/drive/MyDrive/Colab Notebooks/open/train_code/problem005']

In [None]:
def data_clean(text):
  two_space_tap = re.search("\n  (\S)+", text) is not None

  #한줄 주석 처리
  text = re.sub(r"//.*", "", text)

  #여러줄 주석 처리
  text = re.sub(r"/.*?/", "",  text)

  #include 제거 -> 토큰화 하기에 너무 김
  text = re.sub(r"#include.*", '',text)

  #빈줄 제거
  text = re.sub('(\n)+', '\n', text)
  text = re.sub('(\n)(\s)+(\n)', '\n',  text)

  if two_space_tap:
    text = re.sub('  ', '\t', text)
  else:
    text = re.sub('    ', '\t', text)

    text = text.strip()
  return text

exam = '''
#include<iostream>
#include<vector>
using namespace std;
typedef long long li;
#define repa(i,a,n) for(int i=(a);i<(n);i++)
#define rep(i,n) for(int i=0;i<(n);i++)
#define df 0
template<class T> void print(const T& t){ cout << t << "\n"; }
template<class T, class... Ts> void print(const T& t, const Ts&... ts) { cout << t; if (sizeof...(ts)) cout << " "; print(ts...); }

int main(){
  int n; cin >>n;
  vector<int> c(n-1),s(n-1),f(n-1);
  rep(i,n-1){
    cin >>c[i] >>s[i] >>f[i];
  }
  rep(j,n){
    int t=0;
    repa(i,j,n-1){
      //      if(df)print(s[i],f[i]);
      if(t<s[i])t=s[i];
      if(t%f[i])t=(t/f[i]+1)*f[i];
      t+=c[i];
    }
    print(t);
  }
}
'''

print(data_clean(exam))


using namespace std;
typedef long long li;
#define repa(i,a,n) for(int i=(a);i<(n);i++)
#define rep(i,n) for(int i=0;i<(n);i++)
#define df 0
template<class T> void print(const T& t){ cout << t << "
"; }
template<class T, class... Ts> void print(const T& t, const Ts&... ts) { cout << t; if (sizeof...(ts)) cout << " "; print(ts...); }
int main(){
	int n; cin >>n;
	vector<int> c(n-1),s(n-1),f(n-1);
	rep(i,n-1){
		cin >>c[i] >>s[i] >>f[i];
	}
	rep(j,n){
		int t=0;
		repa(i,j,n-1){
			if(t<s[i])t=s[i];
			if(t%f[i])t=(t/f[i]+1)*f[i];
			t+=c[i];
		}
		print(t);
	}
}



In [None]:
#새로 만들 때
code_list = []
p_num_list = []

for p_num, problem in enumerate(tqdm(dir_list), start=1):
  for sol in glob(os.path.join(problem, '*')):
    with open(sol, 'r') as f:
      code = f.read()

      code_list.append(data_clean(code))
      p_num_list.append(p_num)

p_df = pd.DataFrame(data = {"code" : code_list, "p_num" : p_num_list})

p_df.to_csv(os.path.join(data_path, "problem_df_2403061220.csv"), index=False)

  3%|▎         | 17/500 [01:40<50:51,  6.32s/it]

In [None]:
#불러올때
p_df = pd.read_csv(os.path.join(data_path, "problem_df_2403061220.csv"))

In [None]:
p_df.head()

Unnamed: 0,code,p_num
0,using namespace std;\n#define fastio \t\t\tios...,1
1,"\nusing namespace std;\nint main(){\n\tint r,D...",1
2,"\nusing namespace std;\nint main(){\n\tint r,d...",1
3,using namespace std;\n#define FAST ios_base::s...,1
4,\nusing namespace std;\nusing Graph = vector<v...,1


In [None]:
from itertools import combinations
import random

def get_pair(inputs, tokenizer):
  codes = inputs['code'].to_list()
  problems = inputs['p_num'].unique().tolist()
  problems.sort()

  tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
  bm25 = BM25Okapi(tokenized_corpus)

  total_positive_pairs = []
  total_negative_pairs = []

  for problem in tqdm(problems):
    solution_codes = inputs[inputs['p_num'] == problem]['code']
    #이건 렘이 부족해서 코랩에서는 불가능한 경우 같아서 일부만 추출 해봄
    positive_pairs = list(combinations(solution_codes.to_list(), 2))
    positive_pairs = random.sample(positive_pairs, len(positive_pairs) // 20)

    solution_codes_indices = solution_codes.index.to_list()
    negative_pairs = []

    first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
    negative_code_scores = bm25.get_scores(first_tokenized_code)
    negative_code_ranking = negative_code_scores.argsort()[::-1]
    ranking_idx = 0

    for solution_code in solution_codes:
      negative_solutions = []
      while len(negative_solutions) < len(positive_pairs) // len(solution_codes):
        high_score_idx = negative_code_ranking[ranking_idx]

        if high_score_idx not in solution_codes_indices:
          negative_solutions.append(inputs['code'].iloc[high_score_idx])
        ranking_idx += 1

      for negative_solution in negative_solutions:
        negative_pairs.append((solution_code, negative_solution))

    total_positive_pairs.extend(positive_pairs)
    total_negative_pairs.extend(negative_pairs)

  positive_code1 = list(map(lambda x:x[0], total_positive_pairs))
  positive_code2 = list(map(lambda x:x[1], total_positive_pairs))

  negative_code1 = list(map(lambda x:x[0], total_negative_pairs))
  negative_code2 = list(map(lambda x:x[1], total_negative_pairs))

  positive_label = [1] * len(positive_code1)
  negative_label = [0] * len(negative_code1)

  positive_code1.extend(negative_code1)
  positive_code2.extend(negative_code2)
  positive_label.extend(negative_label)

  pair_data = pd.DataFrame(data = {
      'code1' : positive_code1,
      'code2' : positive_code2,
      'similar' : positive_label
  })

  pair_data = pair_data.sample(frac=1).reset_index(drop=True)

  return pair_data



In [None]:
p_df.value_counts('p_num').min()

In [None]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    p_df,
    test_size = 0.1,
    random_state = 42,
    stratify = p_df['p_num']
)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

train_df

Unnamed: 0,code,p_num
0,using namespace std;\n#define int long long\n#...,161
1,using namespace std;\ntypedef unsigned long lo...,461
2,\nusing namespace std;\nusing ll = long long;\...,296
3,"#define _overload(_1,_2,_3,name,...) name\n#de...",252
4,"#define rep(i, n) for (int i = 0; i < n; ++i)\...",26
...,...,...
224995,using namespace std;\nusing ll=long long;\ntem...,37
224996,\nusing namespace std;\ntypedef long long int ...,173
224997,using namespace std;\nint main(){\n\tcin.sync_...,119
224998,using namespace std;\nint main()\n{\n\tlong lo...,366


In [None]:
tokenizer = AutoTokenizer.from_pretrained('neulab/codebert-cpp')
tokenizer.truncation_side = 'left'

train_df['code_tokenized'] = train_df['code'].apply(tokenizer.tokenize)
val_df['code_tokenized'] = val_df['code'].apply(tokenizer.tokenize)

for i in range(0, len(train_df)):
  train_df['code_tokenized'][i] = train_df['code_tokenized'][i][:512]

for i in range(0, len(val_df)):
  val_df['code_tokenized'][i] = val_df['code_tokenized'][i][:512]

bm25_train_df = get_pair(train_df, tokenizer)

bm25_val_df = get_pair(val_df, tokenizer)

bm25_train_df

Token indices sequence length is longer than the specified maximum sequence length for this model (637 > 512). Running this sequence through the model will result in indexing errors
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['code_tokenized'][i] = train_df['code_tokenized'][i][:512]


KeyboardInterrupt: 

In [None]:
bm25_train_df.to_csv(os.path.join(data_path, "bm25_train_df_2403062244.csv"), index=False)
bm25_val_df.to_csv(os.path.join(data_path, "bm25_val_df_2403062244.csv"), index=False)

In [None]:
test_df = pd.read_csv(os.path.join(base_path, 'test.csv'))

tokenizer = AutoTokenizer.from_pretrained('neulab/codebert-cpp')
tokenizer.truncation_side = 'left'

test_df['code1_tokenized'] = test_df['code1'].apply(tokenizer.tokenize)
test_df['code2_tokenized'] = test_df['code2'].apply(tokenizer.tokenize)

for i in range(0, len(test_df)):
  test_df['code1_tokenized'][i] = test_df['code1_tokenized'][i][:512]
  test_df['code2_tokenized'][i] = test_df['code2_tokenized'][i][:512]

test_df.to_csv(os.path.join(data_path, "test_df_2403062244.csv"), index=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors


In [None]:
#재시작 후 모델 러닝

In [None]:
!pip install rank_bm25
!pip install datasets
! pip install -U accelerate
! pip install -U transformers

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import torch
import transformers

from glob import glob
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, AutoModel, AutoModelForSequenceClassification,DataCollatorForTokenClassification,EarlyStoppingCallback
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric, Dataset

from tqdm import tqdm
from tqdm import trange
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data_path = '/content/drive/MyDrive/Colab Notebooks/open/dataframe'

train_df = pd.read_csv(os.path.join(data_path, "bm25_train_df_2403061321.csv"))
val_df = pd.read_csv(os.path.join(data_path, "bm25_val_df_2403061321.csv"))
train_df = train_df.rename(columns={'similar': 'label'})
val_df = val_df.rename(columns={'similar': 'label'})

In [None]:
train_df = train_df.sample(n = 1000000, replace = False).reset_index(drop=True)
val_df = val_df.sample(n= 1000, replace = False).reset_index(drop = True)
dataset_train = Dataset.from_pandas(train_df)
dataset_val = Dataset.from_pandas(val_df)

In [None]:
import random

#model_name = 'neulab/codebert-cpp'
model_name = 'microsoft/codereviewer'
wd = 0.01
batch_size = 16
lr = 2e-5
epochs = 1
task = 'binary_classification'
label_list = ['0', '1']
num_labels = 2

def seed_everything(seed:42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return self.fget.__get__(instance, owner)()
Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at microsoft/codereviewer and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have be

In [None]:
def tokenize_function(examples):
    tokenizer.truncation_side = 'left'
    return tokenizer(examples["code1"], examples["code2"],padding="max_length", max_length = 512, truncation=True)

In [None]:
#tokenized_train_datasets= dataset_train.map(tokenize_function, batched=True)
tokenized_val_datasets = dataset_val.map(tokenize_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
args = TrainingArguments(
    output_dir = '/content/drive/MyDrive/Colab Notebooks/open/codereviewer',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate= lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    seed = 42,
    weight_decay=wd,
    load_best_model_at_end=True,
    logging_dir = '/content/drive/MyDrive/Colab Notebooks/open/codereviewer'
)


def compute_metrics(pred):
    labels = pred.label_ids
    preds = []
    for i in pred.predictions:
      preds.append(np.argmax(i, axis=1).flatten())
    print(preds)
    acc = accuracy_score(labels, preds[0])
    return {'accuracy': acc}

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_val_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()

NameError: name 'tokenized_train_datasets' is not defined

In [None]:
base_path = '/content/drive/MyDrive/Colab Notebooks/open/codereviewer'

#torch.save(model, os.path.join(base_path + '/model0312.pt'))  # 전체 모델 저장

In [None]:
#test를 위한 다시 시작

In [None]:
base_path = '/content/drive/MyDrive/Colab Notebooks/open/codereviewer'
data_path = '/content/drive/MyDrive/Colab Notebooks/open/dataframe'

In [None]:
#model = AutoModelForSequenceClassification.from_pretrained(os.path.join(base_path, 'checkpoint-93750'))

model = torch.load(os.path.join(base_path, 'model0311.pt'))

tokenizer = AutoTokenizer.from_pretrained(model_name)

test_df = pd.read_csv(os.path.join(data_path,'test_df_2403062244.csv'))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
test_df = test_df.drop(columns = ['pair_id', 'code1_tokenized', 'code2_tokenized'], axis = 1)

In [None]:
for i in range(len(test_df)):
  if i % 10000 == 0:
    print(i)
  test_df.iloc[i]['code1'] = data_clean(test_df.iloc[i]['code1'])
  test_df.iloc[i]['code2'] = data_clean(test_df.iloc[i]['code2'])

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000


In [None]:
args = TrainingArguments(
    output_dir = '/content/drive/MyDrive/Colab Notebooks/open',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate= lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    seed = 42,
    weight_decay=wd,
    load_best_model_at_end=True,
    logging_dir = '/content/drive/MyDrive/Colab Notebooks/open'
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = []
    for i in pred.predictions:
      preds.append(np.argmax(i, axis=1).flatten())
    print(preds)
    acc = accuracy_score(labels, preds[0])
    return {'accuracy': acc}

trainer = Trainer(
    model,
    args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
predictions = []
for i in trange(len(test_df) // 5000):
  pred = []
  dataset_test = Dataset.from_pandas(test_df[i*5000:(i+1)*5000])
  tokenized_test_datasets = dataset_test.map(tokenize_function, batched=True)
  predictions_test = trainer.predict(tokenized_test_datasets)

  for j in predictions_test.predictions:
    pred.append(np.argmax(j, axis=1).flatten())

  predictions.extend(pred[0])

len(predictions)

  0%|          | 0/119 [00:00<?, ?it/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  1%|          | 1/119 [02:05<4:06:26, 125.31s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  2%|▏         | 2/119 [03:53<3:44:28, 115.11s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  3%|▎         | 3/119 [05:40<3:35:46, 111.61s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  3%|▎         | 4/119 [07:28<3:30:56, 110.06s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  4%|▍         | 5/119 [09:16<3:27:43, 109.33s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  5%|▌         | 6/119 [11:03<3:24:45, 108.72s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  6%|▌         | 7/119 [12:51<3:22:10, 108.30s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  7%|▋         | 8/119 [14:38<3:19:54, 108.06s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  8%|▊         | 9/119 [16:26<3:17:50, 107.91s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  8%|▊         | 10/119 [18:14<3:15:52, 107.82s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  9%|▉         | 11/119 [20:01<3:13:57, 107.75s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 10%|█         | 12/119 [21:49<3:12:02, 107.69s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 11%|█         | 13/119 [23:36<3:10:03, 107.58s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 12%|█▏        | 14/119 [25:24<3:08:11, 107.54s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 13%|█▎        | 15/119 [27:11<3:06:19, 107.49s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 13%|█▎        | 16/119 [28:58<3:04:31, 107.49s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 14%|█▍        | 17/119 [30:46<3:02:45, 107.50s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 15%|█▌        | 18/119 [32:34<3:01:02, 107.55s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 16%|█▌        | 19/119 [34:22<2:59:26, 107.67s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 17%|█▋        | 20/119 [36:09<2:57:46, 107.74s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 18%|█▊        | 21/119 [37:57<2:56:02, 107.78s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 18%|█▊        | 22/119 [39:45<2:54:10, 107.73s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 19%|█▉        | 23/119 [41:32<2:52:13, 107.64s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 20%|██        | 24/119 [43:20<2:50:18, 107.56s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 21%|██        | 25/119 [45:07<2:48:26, 107.51s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 22%|██▏       | 26/119 [46:55<2:46:35, 107.48s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 23%|██▎       | 27/119 [48:42<2:44:51, 107.52s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 24%|██▎       | 28/119 [50:30<2:43:01, 107.49s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 24%|██▍       | 29/119 [52:17<2:41:16, 107.52s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 25%|██▌       | 30/119 [54:05<2:39:31, 107.54s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 26%|██▌       | 31/119 [55:53<2:37:49, 107.60s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 27%|██▋       | 32/119 [57:40<2:36:01, 107.61s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 28%|██▊       | 33/119 [59:28<2:34:12, 107.59s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 29%|██▊       | 34/119 [1:01:15<2:32:18, 107.51s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 29%|██▉       | 35/119 [1:03:02<2:30:28, 107.49s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 30%|███       | 36/119 [1:04:50<2:28:39, 107.46s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 31%|███       | 37/119 [1:06:37<2:26:51, 107.45s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 32%|███▏      | 38/119 [1:08:25<2:25:04, 107.46s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 33%|███▎      | 39/119 [1:10:12<2:23:20, 107.51s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 34%|███▎      | 40/119 [1:12:00<2:21:36, 107.55s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 34%|███▍      | 41/119 [1:13:47<2:19:45, 107.51s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 35%|███▌      | 42/119 [1:15:35<2:17:54, 107.46s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 36%|███▌      | 43/119 [1:17:22<2:16:03, 107.41s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 37%|███▋      | 44/119 [1:19:09<2:14:14, 107.39s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 38%|███▊      | 45/119 [1:20:57<2:12:31, 107.46s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 39%|███▊      | 46/119 [1:22:44<2:10:43, 107.44s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 39%|███▉      | 47/119 [1:24:32<2:08:53, 107.42s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 40%|████      | 48/119 [1:26:19<2:07:06, 107.42s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 41%|████      | 49/119 [1:28:07<2:05:22, 107.46s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 42%|████▏     | 50/119 [1:29:54<2:03:35, 107.47s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 43%|████▎     | 51/119 [1:31:42<2:01:50, 107.50s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 44%|████▎     | 52/119 [1:33:29<2:00:03, 107.52s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 45%|████▍     | 53/119 [1:35:17<1:58:15, 107.51s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 45%|████▌     | 54/119 [1:37:04<1:56:28, 107.51s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 46%|████▌     | 55/119 [1:38:52<1:54:40, 107.50s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 47%|████▋     | 56/119 [1:40:39<1:52:51, 107.48s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 48%|████▊     | 57/119 [1:42:27<1:51:03, 107.48s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 49%|████▊     | 58/119 [1:44:14<1:49:17, 107.50s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 50%|████▉     | 59/119 [1:46:02<1:47:37, 107.62s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 50%|█████     | 60/119 [1:47:50<1:45:48, 107.60s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 51%|█████▏    | 61/119 [1:49:37<1:43:59, 107.57s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 52%|█████▏    | 62/119 [1:51:25<1:42:09, 107.54s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 53%|█████▎    | 63/119 [1:53:12<1:40:24, 107.58s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 54%|█████▍    | 64/119 [1:55:00<1:38:34, 107.54s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 55%|█████▍    | 65/119 [1:56:47<1:36:46, 107.53s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 55%|█████▌    | 66/119 [1:58:35<1:34:59, 107.53s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 56%|█████▋    | 67/119 [2:00:23<1:33:12, 107.55s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 57%|█████▋    | 68/119 [2:02:10<1:31:28, 107.61s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 58%|█████▊    | 69/119 [2:03:58<1:29:39, 107.58s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 59%|█████▉    | 70/119 [2:05:45<1:27:51, 107.59s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 60%|█████▉    | 71/119 [2:07:33<1:26:08, 107.68s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 61%|██████    | 72/119 [2:09:21<1:24:18, 107.62s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 61%|██████▏   | 73/119 [2:11:08<1:22:29, 107.59s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 62%|██████▏   | 74/119 [2:12:56<1:20:40, 107.56s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 63%|██████▎   | 75/119 [2:14:43<1:18:52, 107.55s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 64%|██████▍   | 76/119 [2:16:31<1:17:03, 107.53s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 65%|██████▍   | 77/119 [2:18:19<1:15:18, 107.58s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 66%|██████▌   | 78/119 [2:20:06<1:13:31, 107.59s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 66%|██████▋   | 79/119 [2:21:54<1:11:45, 107.64s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 67%|██████▋   | 80/119 [2:23:42<1:09:58, 107.65s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 68%|██████▊   | 81/119 [2:25:29<1:08:10, 107.65s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 69%|██████▉   | 82/119 [2:27:17<1:06:22, 107.62s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 70%|██████▉   | 83/119 [2:29:04<1:04:33, 107.60s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 71%|███████   | 84/119 [2:30:52<1:02:46, 107.62s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 71%|███████▏  | 85/119 [2:32:40<1:01:01, 107.70s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 72%|███████▏  | 86/119 [2:34:27<59:13, 107.67s/it]  

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 73%|███████▎  | 87/119 [2:36:15<57:23, 107.61s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 74%|███████▍  | 88/119 [2:38:02<55:34, 107.57s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 75%|███████▍  | 89/119 [2:39:50<53:47, 107.58s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 76%|███████▌  | 90/119 [2:41:38<52:00, 107.60s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 76%|███████▋  | 91/119 [2:43:25<50:12, 107.59s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 77%|███████▋  | 92/119 [2:45:13<48:25, 107.61s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 78%|███████▊  | 93/119 [2:47:00<46:37, 107.60s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 79%|███████▉  | 94/119 [2:48:48<44:50, 107.61s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 80%|███████▉  | 95/119 [2:50:36<43:02, 107.61s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 81%|████████  | 96/119 [2:52:23<41:15, 107.63s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 82%|████████▏ | 97/119 [2:54:11<39:27, 107.63s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 82%|████████▏ | 98/119 [2:55:59<37:42, 107.72s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 83%|████████▎ | 99/119 [2:57:47<35:54, 107.74s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 84%|████████▍ | 100/119 [2:59:34<34:06, 107.70s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 85%|████████▍ | 101/119 [3:01:22<32:18, 107.70s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 86%|████████▌ | 102/119 [3:03:10<30:30, 107.65s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 87%|████████▋ | 103/119 [3:04:57<28:42, 107.65s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 87%|████████▋ | 104/119 [3:06:45<26:54, 107.66s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 88%|████████▊ | 105/119 [3:08:32<25:06, 107.63s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 89%|████████▉ | 106/119 [3:10:20<23:19, 107.63s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 90%|████████▉ | 107/119 [3:12:08<21:31, 107.65s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 91%|█████████ | 108/119 [3:13:55<19:44, 107.65s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 92%|█████████▏| 109/119 [3:15:43<17:56, 107.65s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 92%|█████████▏| 110/119 [3:17:31<16:08, 107.64s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 93%|█████████▎| 111/119 [3:19:19<14:21, 107.70s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 94%|█████████▍| 112/119 [3:21:06<12:33, 107.69s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 95%|█████████▍| 113/119 [3:22:54<10:45, 107.62s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 96%|█████████▌| 114/119 [3:24:41<08:57, 107.59s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 97%|█████████▋| 115/119 [3:26:29<07:10, 107.60s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 97%|█████████▋| 116/119 [3:28:16<05:22, 107.61s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 98%|█████████▊| 117/119 [3:30:04<03:35, 107.63s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 99%|█████████▉| 118/119 [3:31:52<01:47, 107.62s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

100%|██████████| 119/119 [3:33:39<00:00, 107.73s/it]


595000

In [None]:
#public 0.76592 (codebert - test전처리 전)
#public 0.81441 (codebert - test전처리 후)
#public xx (codereviewer - test전처리 후)

In [None]:


sub = pd.read_csv(os.path.join('/content/drive/MyDrive/Colab Notebooks/open', 'sample_submission.csv'))
sub['similar'] = predictions

sub.to_csv(os.path.join(base_path, 'submission_0313.csv'), index = False)

In [None]:
########

In [None]:
tokenized_val_datasets = dataset_val.map(tokenize_function, batched=True)

Map:   0%|          | 0/55503 [00:00<?, ? examples/s]

In [None]:
predictions_val = trainer.predict(tokenized_val_datasets)

[array([0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0

In [None]:
preds = []
for i in predictions_val.predictions:
  preds.append(np.argmax(i, axis=1).flatten())

predict = preds[0]

In [None]:
score = 0
for i in range(len(predict)):
  if val_df.iloc[i]['label'] == predict[i]:
    score += 1

score / len(predict)

0.981