# Introduction
This notebook is used to create a dataset for the semantic clone detection task. The dataset is created from the Semantic Benchmark dataset.

The dataset contains 1000 pairs of similar code snippets and 1000 pairs of dissimilar code snippets. It is saved as a CSV file.

In [4]:
from pathlib import Path
import glob
import os
import re
import pandas as pd

#### Read the Python files from the correct subdirectory and create the clone pairs

In [7]:
# The dataset paths in this notebook are relative to the base directory.
# Step up one level only while the dataset folder is not visible from the current
# working directory, so that re-running this cell (or starting the kernel in the
# base directory) does not keep climbing up the directory tree.
if not Path("data/Semantic_Benchmark").exists():
    os.chdir("..")

In [8]:
# Directory with the stand-alone Python clone files, relative to the current working directory.
path = Path("data/Semantic_Benchmark/Python/Stand alone clones")

# Stop right away when the directory is missing and report its absolute location,
# which makes a wrong working directory easy to spot.
if not path.exists():
    message = f"The path does not exist {path.absolute()}"
    raise FileNotFoundError(message)

In [14]:
# Collect every Python file below the benchmark directory (lazy generator).
python_files = path.glob("**/*.py")
# Only files named Clone<id>.py directly inside the benchmark directory are clone files;
# the numeric part of the file name is captured as the (string) id of the pair.
pattern = re.compile(r"data/Semantic_Benchmark/Python/Stand alone clones/Clone(?P<id>\d+)\.py")

# One record per clone file: {"id": ..., "clone1": ..., "clone2": ...}
clones = []
for file in python_files:
    # Match against the forward-slash form of the path so the pattern also works
    # on platforms whose native separator is a backslash.
    match = pattern.match(file.as_posix())
    if not match:
        print(f"no match for {file}")
        continue

    # NOTE(review): the id stays a string, so sorting on it later is lexicographic
    # ("10" sorts before "2") -- confirm that this ordering is intended.
    groups_dict = match.groupdict()

    # Read with an explicit encoding instead of the platform default.
    # NOTE(review): assumes the benchmark files are UTF-8 encoded -- confirm.
    content = file.read_text(encoding="utf-8")

    # The two clones of a pair are separated by the first run of two blank lines.
    clone1, separator, clone2 = content.partition("\n\n\n")
    if not separator:
        # Without a separator the whole file ends up in clone1 and clone2 is empty;
        # report it instead of silently producing a broken pair.
        print(f"no clone separator found in {file}")

    # strip() already removes surrounding newlines along with other whitespace.
    groups_dict["clone1"] = clone1.strip()
    groups_dict["clone2"] = clone2.strip()

    clones.append(groups_dict)


In [15]:
# Build the clone table from the collected records, keyed and ordered by the clone id.
# NOTE(review): the ids captured by the regex are strings, so this ordering is
# lexicographic ("10" sorts before "2") -- confirm that this is intended.
df = (
    pd.DataFrame(clones)
    .set_index("id")
    .sort_index()
)

#### Create the dissimilar pairs

In [19]:
# Number of rows taken from each end of the sorted clone table. Each row contributes
# two dissimilar pairs (one per clone column), giving 2 * n_per_half pairs in total.
n_per_half = 500

# Both slices must be disjoint: with fewer rows they overlap, and with exactly
# n_per_half rows every snippet would be paired with itself and labelled dissimilar.
if len(df) < 2 * n_per_half:
    raise ValueError(
        f"Need at least {2 * n_per_half} clone pairs to build disjoint halves, got {len(df)}"
    )

first_500 = df.iloc[:n_per_half]
last_500 = df.iloc[-n_per_half:]

print("Split shapes: ",first_500.shape, last_500.shape)

# Pair the i-th row of the first half with the i-th row of the last half:
# first all clone1/clone1 combinations, then all clone2/clone2 combinations.
dissimilar_pairs = [
    *zip(first_500['clone1'], last_500['clone1']),
    *zip(first_500['clone2'], last_500['clone2']),
]

dissimilar_df = pd.DataFrame(dissimilar_pairs, columns=['clone1', 'clone2'])
print("Dissimilar shape: ",dissimilar_df.shape)
dissimilar_df.head()

Split shapes:  (500, 2) (500, 2)
Dissimilar shape:  (1000, 2)


Unnamed: 0,clone1,clone2
0,def second_largest(numbers) :\n\tcount = 0\n\t...,def is_prime(x) :\n\tif x < 2 :\n\t\treturn Fa...
1,"def download_file(service, file_id) :\n\treque...",def cumulative_sum(lst) :\n\tnew_lst = []\n\tf...
2,"def shift_n_letters(letter, n) :\n\tchar_array...","def is_member(x) :\n\ta = [1, 5, 3, 9, 4, 100]..."
3,"def write(self, data) :\n\tif data [0 : LOG_ID...","def is_member(x) :\n\ta = [1, 5, 3, 9, 4, 100]..."
4,"def num_input(prompt, error) :\n\twhile True :...",def __init__(self) :\n\tself.secondsRemaining ...


#### Merge and add labels

In [20]:
# Label the pairs read from the benchmark files with 1 and the constructed
# dissimilar pairs with 0.
df = df.assign(semantic_clone=1)
dissimilar_df = dissimilar_df.assign(semantic_clone=0)

# Stack both tables into one; the original indices are replaced by a fresh 0..n-1 range.
merged_df = pd.concat((df, dissimilar_df), ignore_index=True)

In [23]:
# Sanity check of the merged table: print its (rows, columns) shape and preview ten rows.
# The fixed random_state keeps the preview identical across re-runs of the notebook.
print(merged_df.shape)
merged_df.sample(10, random_state=0)

(2000, 3)


Unnamed: 0,clone1,clone2,semantic_clone
328,"def __init__(self, parent) :\n\tsuper(MyInterp...","def __init__(self, parent) :\n\tsuper(PyInterp...",1
954,"def md5sum(filename) :\n\twith open(filename, ...","def md5sum(filename) :\n\twith open(filename, ...",1
1479,def deep_reverse(a) :\n\ta.reverse()\n\tfor i ...,"def create_response(self, request, data, respo...",0
523,"def createfile() :\n\tvar = """"""\\n\t#!/bin/sh\...","def createfile() :\n\tvar = """"""#!/bin/sh\n\tec...",1
110,def transformFactorList(factorList) :\n\ttwos ...,def transformFactorList(factorList) :\n\toldsi...,1
1665,"def changelist_view(self, request, extra_conte...","def biggest() :\n\tbig_x, big_y, max_seen, pro...",0
975,def wrapper(arg1) :\n\tresult = func(arg1)\n\t...,def wrapper(arg1) :\n\terrors = []\n\tresult =...,1
1272,"def upload(path) :\n\twith open(path, 'rb') as...","def recursiveHalfString(s, offset = 0) :\n\tha...",0
1672,"def bmi_risk(bmi, age) :\n\tif bmi < 22 and ag...","def change_keys(obj, convert) :\n\tif isinstan...",0
1924,def is_valid_hostname(hostname) :\n\tif len(ho...,def unique(items) :\n\tseen = set()\n\tfor i i...,0


In [34]:
# Write the labelled pairs to disk as CSV, relative to the current working directory.
# With pandas' default index=True the row index is written as the first, unnamed column.
merged_df.to_csv(path_or_buf="data/semantic_benchmark_dataset.csv")