In [None]:
from extract_SATD_comments import _Comment, identify_comments, xml_lexer
import re
from antlr4 import Token, Lexer, InputStream
from lexer.CMakeLexer import CMakeLexer
from lexer.MakefileCommentLexer import MakefileCommentLexer
from lexer.MakefileAmCommentLexer import MakefileAmCommentLexer
from lexer.CPP14Lexer import CPP14Lexer
import pandas as pd

In [None]:
# Load the comment table from CSV into a DataFrame. Later cells read its
# 'filePath', 'repoIndex', 'startLine', 'endLine', 'systemDiversity',
# 'groupId' and 'buildSysteminFile' columns.
Non_SATD_comments = pd.read_csv('../data/Non_SATD_clones.csv')

In [None]:
def clean_comments(content, comments):
    """Strip the text of each comment out of `content`.

    Every comment's ``text`` attribute is split on newlines, and each piece
    is removed from `content` once (its first remaining occurrence). Newline
    characters themselves are never removed.

    NOTE: a later cell defines ``clean_comments`` again (using ``get_text()``);
    when the cells run in order, that later definition replaces this one.

    Args:
        content: Text to clean.
        comments: Iterable of objects exposing a ``text`` string attribute.

    Returns:
        The cleaned text.
    """
    stripped = content
    # Lazily flatten all comments into their individual lines, in order.
    comment_lines = (
        piece
        for comment in comments
        for piece in comment.text.split('\n')
    )
    for piece in comment_lines:
        stripped = stripped.replace(piece, '', 1)
    return stripped

In [None]:
def clean_comments(content, comments):
    """Return `content` with the text of every detected comment removed.

    Args:
        content: Full text of a source file.
        comments: Iterable of comment objects exposing ``get_text()``.

    Returns:
        The text after two passes:
        1. each line of each comment's text is removed once (its first
           remaining occurrence in `content`);
        2. every line whose first non-blank character is '#' is replaced
           by an empty line.
        Newline characters are never removed, so the number of lines (and
        therefore line numbering) is unchanged.
    """
    for comment in comments:
        for line in comment.get_text().split('\n'):
            content = content.replace(line, '', 1)
    # Fallback for comments the ANTLR4 lexers did not report (input that does
    # not fit their grammar): blank out any remaining line that starts with '#'.
    lines = [
        '' if line.strip().startswith('#') else line
        for line in content.split('\n')
    ]
    return '\n'.join(lines)

In [None]:
# Prefix of every source path; str(repoIndex) and the trailing part of
# filePath are appended to it below.
REPO_ROOT = '/data/satd-clone-2022/'
# Maximum number of non-blank lines gathered on each side of a comment.
threshold = 5


def _nearby_code(lines, line_no, step, limit):
    """Collect up to `limit` non-blank lines, walking from 1-based `line_no`.

    The walk starts ON `line_no` itself and moves by `step` (-1 = towards the
    top of the file, +1 = towards the bottom). It stops as soon as `line_no`
    falls outside ``1..len(lines)``, so an out-of-range starting line yields
    an empty list instead of an IndexError or a wrapped negative index.

    Returns:
        The collected lines in the order they were visited.
    """
    collected = []
    while 1 <= line_no <= len(lines) and len(collected) < limit:
        candidate = lines[line_no - 1]
        if candidate.strip() != '':
            collected.append(candidate)
        line_no += step
    return collected


# Snippets are gathered in a dict keyed by the DataFrame row label and turned
# into a Series once at the end, instead of enlarging a Series row by row.
snippet_by_index = {}

for name, group in Non_SATD_comments.groupby('filePath'):
    # On-disk path: REPO_ROOT + repoIndex + the components of filePath after
    # its first seven.
    file = REPO_ROOT + str(group.iloc[0]['repoIndex']) + '/' + '/'.join(name.split('/')[7:])
    basename = file.split('/')[-1]

    # Context manager so the handle is closed even if reading fails.
    with open(file, 'r', errors='ignore') as source_file:
        content = source_file.read()
    input_stream = InputStream(content)

    # Pick the comment lexer from the file name.
    # NOTE(review): several '.' characters in these patterns are unescaped and
    # therefore match any character; they are kept exactly as they were so the
    # set of accepted files does not change - confirm whether that is intended.
    if re.match(r'^build\.xml$', basename) or re.match(r'^(pom\.xml|maven[123]?\.xml)$', basename):
        comments = xml_lexer(file, content)
    elif re.match(r'^([Cc]onfigure.in|[Mm]akefile.in)$', basename) or re.match(r'^[Cc]onfigure.ac$', basename) or re.match(r'^ac(local|site).m4$', basename):
        comments = identify_comments(MakefileCommentLexer(input_stream))
    elif re.match(r'^config.h.in$', basename):
        comments = identify_comments(CPP14Lexer(input_stream))
    elif re.match(r'^[Mm]akefile.am$', basename):
        comments = identify_comments(MakefileAmCommentLexer(input_stream))
    elif re.findall(r'.cmake', basename) or basename == 'CMakeLists.txt' or basename == 'build.properties':
        comments = identify_comments(CMakeLexer(input_stream))
    else:
        # No lexer for this file: report it and leave its rows without a snippet.
        print(name)
        continue

    # clean_comments keeps every newline, so startLine/endLine still index
    # the same physical lines after the comments are blanked out.
    content = clean_comments(content, comments)
    lines = content.split('\n')

    for index, row in group.iterrows():
        # Walk upwards from the comment's first line, then restore file order.
        upper_code = _nearby_code(lines, row['startLine'], -1, threshold)
        upper_code.reverse()
        # Walk downwards from the comment's last line.
        # NOTE(review): both walks start on their own line, so when
        # startLine == endLine and that line still holds code, it appears in
        # the snippet twice - confirm this is intended.
        bottom_code = _nearby_code(lines, row['endLine'], 1, threshold)
        snippet_by_index[index] = '\n'.join(upper_code + bottom_code)

# Explicit dtype avoids the default-dtype warning pandas emits for an empty Series.
code_snippets = pd.Series(snippet_by_index, dtype=object)


In [None]:
# Attach the extracted snippets as a new column. pandas aligns the Series to
# the frame by index label, so rows without an entry in code_snippets
# (e.g. files skipped by the extraction loop) end up as NaN.
Non_SATD_comments['codeSnippet'] = code_snippets

In [None]:
# Persist the table including the new 'codeSnippet' column. No `index`
# argument is given, so the row index is written as the first CSV column.
Non_SATD_comments.to_csv('../data/Non_SATD_clones_with_code.csv')

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import math
import time, pickle, math, warnings, os, operator
import string 
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate import bleu_score
import time

In [None]:
# Keep only the rows whose systemDiversity equals 1. reset_index() gives the
# subset a fresh 0..n-1 index, so a row's index label is also its row position
# in the matrix built from the snippets below.
Non_SATD_comments_clone_same_system = Non_SATD_comments.loc[Non_SATD_comments['systemDiversity'] == 1].reset_index()

# Every punctuation character except the double quote gets padded with spaces.
punctuations = string.punctuation.replace("\"","")
# The translation table does not depend on the snippet, so build it once.
punctuation_spacing = str.maketrans({key: " {0} ".format(key) for key in punctuations})

# Rows whose file was skipped during extraction carry NaN instead of a string;
# treat those as an empty snippet rather than failing on float.replace.
cleand_code_snippets = [
    (code if isinstance(code, str) else '').replace('\n', ' ').strip().translate(punctuation_spacing)
    for code in Non_SATD_comments_clone_same_system['codeSnippet']
]
Non_SATD_comments_clone_same_system['cleaned_code'] = cleand_code_snippets

# Bag-of-words counts; max_df=0.5 drops terms present in more than half of
# the snippets.
data_count_vect = CountVectorizer(max_df=0.5)
data_vect = data_count_vect.fit_transform(cleand_code_snippets)

In [None]:
# Pairwise cosine similarity between all rows of data_vect; entry [i][j]
# compares snippet i with snippet j.
similarity = cosine_similarity(data_vect)

In [None]:
import itertools

# One entry per groupId, all lists kept in the same order.
cos_score_mins = []
cos_score_maxs = []
cos_score_means = []
cos_score_medians = []
systems = []
names = []
for name, group in Non_SATD_comments_clone_same_system.groupby('groupId'):
    # Index labels double as row/column positions in `similarity` because the
    # frame was re-indexed before vectorisation.
    group_index = list(group.index.values)
    names.append(name)
    systems.append(group.iloc[0]['buildSysteminFile'])

    # Similarity of every unordered pair of snippets inside the group.
    cos_score = [
        similarity[first, second]
        for first, second in itertools.combinations(group_index, 2)
    ]
    # A float Series (rather than DataFrame(...)[0]) also works when the group
    # has fewer than two rows: there are no pairs, and describe() then reports
    # NaN for min/max/mean/median instead of raising KeyError.
    stat = pd.Series(cos_score, dtype=float).describe()
    cos_score_mins.append(stat['min'])
    cos_score_maxs.append(stat['max'])
    cos_score_means.append(stat['mean'])
    cos_score_medians.append(stat['50%'])

In [None]:
# Assemble the per-group similarity statistics into a single table, one row
# per groupId; the keyword order fixes the column order.
stat_df = pd.DataFrame(
    dict(
        min=cos_score_mins,
        max=cos_score_maxs,
        mean=cos_score_means,
        median=cos_score_medians,
        system=systems,
        groupId=names,
    )
)

In [None]:
# Write the per-group statistics to CSV.
# NOTE(review): index=None is presumably treated like index=False (row index
# not written) - confirm against the pandas version in use.
stat_df.to_csv("../data/RQ2_Non_SATD_stat.csv",index=None)