In [None]:
import datetime
import itertools

import numpy as np
import pandas as pd
# Load the Non-SATD clone records. Later cells read the linkLocation, repoIndex,
# startLine, repoDiversity, groupId and repoName columns of this frame.
Non_SATD_comments = pd.read_csv('../data/Non_SATD_clones.csv')

In [None]:
# Derive the file part of every link by cutting off its trailing '#L<line>' anchor.
# rpartition cuts only at the LAST '#L', so a path that itself contains '#L' is
# kept intact (split + join silently deleted such inner markers). A link without
# any anchor still yields '' because rpartition then returns an empty head.
Non_SATD_comments['filePath'] = Non_SATD_comments['linkLocation'].apply(
    lambda link: link.rpartition('#L')[0])

In [None]:
import dask.dataframe as ddf
import dask
import multiprocessing
import os
# The scheduler is selected per call through .compute(scheduler=...) further
# down, so the global setting stays disabled:
# dask.config.set(scheduler='multiprocessing')
# CPU count as reported by multiprocessing; despite its name this is the core
# count, and the real partition count is derived from it further down.
num_partitions = multiprocessing.cpu_count()

def git_log(row):
    """Return the ``git log -L`` history of one source line, prefixed with the row label.

    Args:
        row: one DataFrame row (a ``pandas.Series``) providing ``filePath``,
            ``repoIndex`` and ``startLine``; ``row.name`` becomes the prefix.

    Returns:
        ``"<row.name>^^^^<stdout of git>"``. For each commit git reports, stdout
        holds hash, author name, author e-mail, author timestamp and raw
        message joined by ``^^^^``. The part after the prefix is empty when git
        prints nothing or when the process cannot be started (for example
        because the repository directory does not exist).
    """
    import subprocess  # local import keeps this cell runnable on its own

    # Drop the first seven '/'-separated components of the link.
    # NOTE(review): presumably scheme/host/owner/repo/blob/ref — confirm.
    fname = '/'.join(row['filePath'].split('/')[7:])
    repo_dir = "/data/satd-clone-2022/" + str(row['repoIndex'])
    line_range = '{lnumber},{lnumber}:{fname}'.format(
        lnumber=row['startLine'],
        fname=fname)
    # An argument list (no shell) means quotes, '$' or backticks in a file name
    # can neither break nor extend the command.
    command = [
        'git', 'log',
        '--pretty=format:%H^^^^%an^^^^%ae^^^^%at^^^^%B',
        '--no-patch',
        '-L', line_range,
    ]
    try:
        # cwd= replaces the former "cd <path>;" prefix. With "cd ...;" a missing
        # repository let git run in whatever directory the kernel was in.
        completed = subprocess.run(command, cwd=repo_dir,
                                   stdout=subprocess.PIPE, text=True)
        result = completed.stdout
    except OSError:
        # Repository directory or git executable not found: treat as "no history".
        result = ''
    return str(row.name) + '^^^^' + result
# Split the frame into four partitions per spare core so the worker processes
# stay busy. max(1, ...) keeps the count valid on a single-core machine, where
# (num_partitions - 1) * 4 would be 0.
df_dask = ddf.from_pandas(Non_SATD_comments, npartitions=max(1, (num_partitions - 1) * 4))
# Run git_log once per row in worker processes. The result is computed eagerly
# and then attached to the (still lazy) dask frame as the 'output' column.
# NOTE(review): ('str') is just the string 'str', not a (name, dtype) tuple —
# confirm dask derives the intended metadata from it.
df_dask['output'] = df_dask.apply(git_log, meta=('str'), axis=1).compute(scheduler='multiprocessing')


In [None]:
# Materialize the dask frame, including the 'output' column added above.
df = df_dask.compute()

In [None]:
def _first_commit_fields(output):
    """Split one git_log() result into the fields of the first commit it lists.

    ``output`` has the shape
    ``<row label>^^^^<sha>^^^^<name>^^^^<email>^^^^<time>^^^^<message>``,
    optionally followed by further commits in the same layout.

    Returns:
        ``(sha, author name, author e-mail, author time, message)`` as strings.

    Raises:
        IndexError: if ``output`` holds fewer than five '^^^^'-separated fields
            after the row label (for example when git printed nothing).
    """
    import re  # local import: `re` is only imported in a later cell

    parts = output.split('^^^^')
    message = parts[5]
    if len(parts) > 6:
        # "--pretty=format:" puts a newline between commits, so the text in
        # front of the next '^^^^' ends with "\n<hash of the following commit>".
        # Strip that hash so it does not leak into the commit message.
        message = re.sub(r'\n[0-9a-f]{40,64}\Z', '', message)
    return parts[1], parts[2], parts[3], parts[4], message


# Split every output once and fan the five fields out into their columns.
# NOTE(review): git log normally prints the newest commit first, so these fields
# describe the most recent commit touching the line even though the column is
# called 'introducedSha' — confirm this is intended.
_commit_fields = df['output'].apply(_first_commit_fields)
for _position, _column in enumerate(
        ['introducedSha', 'authorName', 'authorEmail', 'authorTime', 'commitMessage']):
    df[_column] = _commit_fields.apply(lambda fields, i=_position: fields[i])

In [None]:
# Replace the index with a fresh 0..n-1 range (the old labels are discarded) and
# publish the enriched frame under the original name.
Non_SATD_comments = df.reset_index(drop=True)

In [2]:
# Persist the authorship-enriched clones. No index= argument is given, so the
# index is written as well and the CSV starts with an unnamed index column.
Non_SATD_comments.to_csv('../data/Non_SATD_clones_with_authorship.csv')

In [None]:
# Keep only the rows whose repoDiversity is not 1. reset_index() renumbers the
# rows 0..n-1 (the old labels move into an 'index' column); the later cells rely
# on this numbering when they index the similarity matrix with group.index.
Non_SATD_comments_external_repository = (
    Non_SATD_comments[Non_SATD_comments['repoDiversity'] != 1]
    .reset_index()
)

In [None]:
from nltk.corpus import stopwords
# NOTE: this rebinds `stopwords` from the NLTK corpus reader to the English
# stop-word collection returned by .words().
stopwords = stopwords.words('english')
from sentence_transformers import SentenceTransformer
import re
import string
# Sentence-embedding model; a later cell uses it to encode the cleaned commit messages.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
def clean_url(text):
    """Replacement callback for ``re.sub``: blank out the punctuation of a URL.

    Args:
        text: the ``re.Match`` object of the URL (despite the parameter name).

    Returns:
        The matched text with every ``string.punctuation`` character replaced
        by a single space.
    """
    matched = text.group(0)
    return ''.join(' ' if char in string.punctuation else char for char in matched)

def _strip_leading_dnl(line):
    """Remove the first ``dnl`` of ``line`` when it is the line's first token."""
    tokens = line.split()
    if tokens and tokens[0] == 'dnl':
        return line.replace('dnl', '', 1)
    return line

def clean_string(text):
    """Normalize free text before it is embedded.

    Steps, in order:
      1. punctuation inside http(s) URLs becomes spaces (see ``clean_url``);
      2. a leading ``dnl`` token is removed from every line
         (presumably an m4-style comment marker — confirm);
      3. every whitespace run collapses to one space;
      4. every run of characters other than letters, digits, ``.`` and ``'``
         becomes one space;
      5. the result is lower-cased and stripped.

    Args:
        text: the raw string.

    Returns:
        The normalized string.
    """
    text = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)',clean_url, text)
    # This has to run while the line breaks still exist. Collapsing whitespace
    # first merged everything into one line, so only a 'dnl' at the very start
    # of the text was ever removed.
    text = '\n'.join(_strip_leading_dnl(line) for line in text.split('\n'))
    text = re.sub(r'\s+',' ', text)
    text = re.sub(r'[^A-Za-z0-9.\']+',' ',text)
    return text.lower().strip()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Normalize the commit message of every external-repository row, in row order.
cleaned_message = list(map(clean_string, Non_SATD_comments_external_repository['commitMessage']))
# Embed the cleaned messages, then compute the matrix of pairwise cosine
# similarities between those embeddings.
mesage_embeddings = model.encode(cleaned_message)
csim_message = cosine_similarity(mesage_embeddings)

In [None]:
# Authorship statistics per clone group (rows sharing a groupId). Every list
# receives exactly one entry per group that survives the filter below, so all
# of them stay aligned with `names`.
author_diversity = []
author_max_clone_diversity = []
author_clone_interval = []
cos_score_mins_message = []
cos_score_maxs_message = []
cos_score_means_message = []
cos_score_medians_message = []
names = []
is_same_author = []
same_sha = []
for name, group in Non_SATD_comments_external_repository.groupby('groupId'):
    # Ratio of distinct commit hashes to rows, taken before duplicates are dropped.
    distinct_sha_ratio = len(group['introducedSha'].unique()) / group.shape[0]
    # Rows sharing both commit and repository count once; a group reduced to a
    # single row has nothing left to compare and is skipped.
    group = group.drop_duplicates(subset=['introducedSha','repoName'], keep='last')
    if group.shape[0] == 1:
        continue
    # Recorded only for kept groups: appending before the `continue` made
    # same_sha longer than the other lists, which breaks building one table
    # from them.
    same_sha.append(distinct_sha_ratio)
    names.append(name)

    # One row per distinct (name, e-mail) pair; 'Time' is that pair's row count.
    author_fre = group.groupby(["authorName", "authorEmail"]).size().reset_index(name="Time")
    # 1 when a single author pair produced the whole group, 0 otherwise.
    is_same_author.append(0 if author_fre.shape[0] > 1 else 1)
    author_max_clone_diversity.append(max(list(author_fre['Time'])) / group.shape[0])
    author_diversity.append(author_fre.shape[0] / group.shape[0])

    # Days between the earliest and the latest commit. Plain epoch arithmetic
    # keeps the value independent of the machine's time zone and DST rules,
    # unlike subtracting naive local datetimes.
    timestamps = [int(value) for value in group['authorTime']]
    author_clone_interval.append((max(timestamps) - min(timestamps)) / (24 * 60 * 60))

    # Message similarity of every unordered pair of rows in the group. The
    # index labels double as positions in csim_message, which requires the
    # frame to carry a 0..n-1 index in the order the matrix was built from.
    group_index = list(group.index.values)
    cos_score = [csim_message[first][second]
                 for first, second in itertools.combinations(group_index, 2)]

    stat = pd.DataFrame(cos_score)[0].describe()
    cos_score_mins_message.append(stat['min'])
    cos_score_maxs_message.append(stat['max'])
    cos_score_means_message.append(stat['mean'])
    cos_score_medians_message.append(stat['50%'])

In [None]:
# Tally the is_same_author flags (0 = several authors, 1 = one author); every
# printed row is [flag, number of groups].
unique, counts = np.unique(is_same_author, return_counts=True)
print(np.asarray([unique, counts]).transpose())

In [None]:
# One row per clone group. The first four columns summarize the pairwise
# commit-message similarity; all lists must have the same length.
author_stat = pd.DataFrame(
    data={
        'min': cos_score_mins_message,
        'max': cos_score_maxs_message,
        'mean': cos_score_means_message,
        'median': cos_score_medians_message,
        'author_clone_interval': author_clone_interval,
        'author_max_clone_diversity': author_max_clone_diversity,
        'author_diversity': author_diversity,
        'same_sha': same_sha,
        'groupId': names,
    }
)

In [None]:
# Write the per-group statistics without the index column.
author_stat.to_csv("../data/RQ3_Non_SATD_stat.csv", index=False)