# Introduction
This notebook is used to create a dataset for the semantic clone detection task. The dataset is created from the Semantic Benchmark dataset.

The dataset contains 1000 pairs of similar code snippets and 1000 pairs of dissimilar code snippets. It is saved as a CSV file.

**Note: In version two, the method names are removed: the name in the first `def` of each snippet is replaced by the placeholder `method_name`.**

In [6]:
from pathlib import Path
import glob
import os
import re
import pandas as pd

#### Read the Python files from the correct subdirectory and create the clone pairs

In [7]:
# The data paths used below are relative to the base directory. Only move up one
# level when the benchmark data is not visible from the current working directory,
# so that re-running this cell does not keep climbing the directory tree.
if not Path("data/Semantic_Benchmark").is_dir():
    os.chdir("..")

In [8]:
# Directory holding the stand-alone Python clone files, relative to the working directory.
path = Path("data/Semantic_Benchmark/Python/Stand alone clones")

# Fail fast, reporting the absolute location that was probed.
benchmark_dir_found = path.exists()
if not benchmark_dir_found:
    raise FileNotFoundError(f"The path does not exist {path.absolute()}")

In [9]:
# Collect every Python file below the benchmark directory. Sorting makes the
# processing order deterministic regardless of how the file system lists files.
python_files = sorted(path.glob("**/*.py"))

# Accepts relative paths of the form <benchmark dir>/Clone<id>.py; the digits in
# the file name are captured as the (string) id of the clone pair.
pattern = re.compile(r"data/Semantic_Benchmark/Python/Stand alone clones/Clone(?P<id>\d+)\.py")

# Matches "def", whitespace, then lazily captures everything up to the first "(".
method_name_pattern = re.compile(r"def\s+(.+?)\(")

clones = []
for file in python_files:
    # as_posix() always uses "/" separators, so the pattern also matches on Windows.
    match = pattern.match(file.as_posix())
    if not match:
        print(f"no match for {file}")
        continue

    # NOTE(review): assumes the benchmark files are UTF-8 encoded — confirm.
    content = file.read_text(encoding="utf-8")

    # The two snippets of a file are split at the first occurrence of two blank lines.
    first_snippet, separator, second_snippet = content.partition("\n\n\n")
    if not separator:
        # Without a separator the second snippet is empty; surface that instead of hiding it.
        print(f"no clone separator found in {file}")

    # Replace only the first function name of each snippet with a fixed placeholder.
    clone1, clone2 = (
        method_name_pattern.sub("def method_name(", snippet.strip(), count=1)
        for snippet in (first_snippet, second_snippet)
    )

    clones.append({"id": match.group("id"), "clone1": clone1, "clone2": clone2})


In [10]:
# One row per clone pair, keyed and ordered by the clone id.
# The ids are strings taken from the file names, so the ordering is
# lexicographic (e.g. "10" comes before "2").
df = pd.DataFrame(clones).set_index("id").sort_index()
df.head()

Unnamed: 0_level_0,clone1,clone2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,def method_name(numbers) :\n\tcount = 0\n\tm1 ...,def method_name(L) :\n\tif (len(L) < 2) :\n\t\...
1,"def method_name(service, file_id) :\n\trequest...","def method_name(file_id, mimeType, filename) :..."
10,"def method_name(letter, n) :\n\tchar_array = [...","def method_name(letter, n) :\n\tn_ = n % 26\n\..."
100,"def method_name(self, data) :\n\tif data [0 : ...","def method_name(self, data) :\n\tself.stream.w..."
101,"def method_name(prompt, error) :\n\twhile True...",def method_name(s) :\n\twhile True :\n\t\ttry ...


#### Create the dissimilar pairs

In [11]:
# Number of dissimilar pairs built from each of the two clone columns.
PAIRS_PER_COLUMN = 500

# Row i of the leading slice is paired with row i of the trailing slice. With
# PAIRS_PER_COLUMN rows or fewer the two slices would coincide (or be too short),
# and snippets would be paired with themselves while being labelled as dissimilar.
if len(df) <= PAIRS_PER_COLUMN:
    raise ValueError(
        f"Need more than {PAIRS_PER_COLUMN} clone pairs to build dissimilar pairs, got {len(df)}"
    )

first_500 = df.iloc[:PAIRS_PER_COLUMN]
last_500 = df.iloc[-PAIRS_PER_COLUMN:]

print("Split shapes: ",first_500.shape, last_500.shape)

# Snippets coming from different clone files are assumed to be dissimilar.
# All 'clone1' pairs come first, followed by all 'clone2' pairs.
dissimilar_pairs = [
    pair
    for column in ("clone1", "clone2")
    for pair in zip(first_500[column], last_500[column])
]

dissimilar_df = pd.DataFrame(dissimilar_pairs, columns=['clone1', 'clone2'])
print("Dissimilar shape: ",dissimilar_df.shape)
dissimilar_df.head()

Split shapes:  (500, 2) (500, 2)
Dissimilar shape:  (1000, 2)


Unnamed: 0,clone1,clone2
0,def method_name(numbers) :\n\tcount = 0\n\tm1 ...,def method_name(x) :\n\tif x < 2 :\n\t\treturn...
1,"def method_name(service, file_id) :\n\trequest...",def method_name(lst) :\n\tnew_lst = []\n\tfor ...
2,"def method_name(letter, n) :\n\tchar_array = [...","def method_name(x) :\n\ta = [1, 5, 3, 9, 4, 10..."
3,"def method_name(self, data) :\n\tif data [0 : ...","def method_name(x) :\n\ta = [1, 5, 3, 9, 4, 10..."
4,"def method_name(prompt, error) :\n\twhile True...",def method_name(self) :\n\tself.secondsRemaini...


#### Merge and add labels

In [10]:
# Label the clone pairs with 1 and the constructed dissimilar pairs with 0, then
# stack them into one frame with a fresh 0..n-1 index.
# assign() returns labelled copies, so `df` and `dissimilar_df` are not mutated and
# re-running earlier cells still shows the frames exactly as they were displayed.
merged_df = pd.concat(
    [df.assign(semantic_clone=1), dissimilar_df.assign(semantic_clone=0)],
    ignore_index=True,
)

In [11]:
print(merged_df.shape)
# Fixed seed so the displayed preview is the same on every run.
merged_df.sample(10, random_state=42)

(2000, 3)


Unnamed: 0,clone1,clone2,semantic_clone
1843,"def method_name(the_list, match) :\n\ttry :\n\...","def method_name(y, A, B, Pi = None) :\n\tK = A...",0
1823,"def method_name(seq, sub) :\n\tm, n = len(seq)...","def method_name(self) :\n\tself.parent.title(""...",0
959,"def method_name(self, key, value = None) :\n\t...","def method_name(self, key, value = None) :\n\t...",1
1085,"def method_name(self, a = None, b = None, e = ...",def method_name(arr) :\n\tif len(arr) < 2 :\n\...,0
430,def method_name(value) :\n\tcapitalized_words ...,def method_name(value) :\n\tdef camelcase() :\...,1
8,def method_name(lst) :\n\tret = []\n\ta = b = ...,def method_name(l) :\n\tr = []\n\tp = q = None...,1
941,"def method_name(node, v) :\n\tnew = [v, [], []...","def method_name(self, val, node) :\n\tif (val ...",1
1744,"def method_name(self, maxlen, * a, ** k) :\n\t...",def method_name(request) :\n\tif request.metho...,0
32,"def method_name(self, * args, ** kwargs) :\n\t...","def method_name(self, * args, ** kwargs) :\n\t...",1
597,"def method_name(left, right, compare) :\n\tres...","def method_name(arr, p, q, r) :\n\tn1 = q - p ...",1


In [12]:
# Persist the labelled pairs as CSV; the row index is written as the first column.
dataset_csv = "data/semantic_benchmark_dataset_2.csv"
merged_df.to_csv(dataset_csv)

In [1]:
# Preview ten random rows that are labelled as semantic clones.
merged_df.loc[merged_df['semantic_clone'].eq(1)].sample(10)

NameError: name 'merged_df' is not defined

In [ ]:
# Preview ten random rows that are labelled as semantic clones.
merged_df.loc[merged_df['semantic_clone'].eq(1)].sample(10)