In [None]:
from tqdm import tqdm
import pandas as pd
import string
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from sentence_transformers import SentenceTransformer
import pickle
import re
import itertools
import datetime
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Sentence-embedding model identified by the name 'paraphrase-MiniLM-L6-v2';
# a later cell calls `model.encode` on the cleaned comments.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:

# Repository list (no header row); the first column is used below to build
# the path of each repository's SATD comment file.
data_filter_repolist_with_full_info = pd.read_csv("../data/data_filter_repolist_with_full_info_no_header.csv", header=None)

# Read every per-repository comment file first and concatenate once at the
# end: calling pd.concat inside the loop re-copies the accumulated frame on
# every iteration, which is quadratic in the number of repositories.
per_repo_comments = [
    pd.read_csv('../data/SATD_comment/' + str(row[0]) + '.csv')
    for _, row in tqdm(
        data_filter_repolist_with_full_info.iterrows(),
        total=data_filter_repolist_with_full_info.shape[0],
    )
]

# pd.concat raises on an empty list, so fall back to the empty frame the
# original loop would have produced for an empty repository list.
SATD_comments_sum = (
    pd.concat(per_repo_comments, ignore_index=True)
    if per_repo_comments
    else pd.DataFrame()
)


In [None]:
# Number of distinct `repoIndex` values (missing values counted, as `unique()` does).
SATD_comments_sum['repoIndex'].nunique(dropna=False)

In [None]:
# Number of loaded comments per `buildSysteminFile` value.
SATD_comments_sum['buildSysteminFile'].value_counts()

In [None]:
def clean_url(text):
    """Replace every punctuation character of a matched URL with a space.

    Used as the replacement callback of ``re.sub`` in ``clean_string``.

    Parameters
    ----------
    text : re.Match
        Match object; only the full match (``group(0)``) is used.

    Returns
    -------
    str
        The matched text with each ``string.punctuation`` character
        replaced by a single space.
    """
    return ''.join(' ' if char in string.punctuation else char for char in text.group(0))


def clean_string(text):
    """Normalize a raw comment into lowercase, space-separated text.

    Steps, in order:
      1. URLs are split into their words (punctuation -> spaces).
      2. A leading ``dnl`` token is removed from every line.
      3. Whitespace runs are collapsed.
      4. Every run of characters other than letters, digits, ``.`` and ``'``
         becomes a single space.
      5. The result is lowercased and stripped.

    Parameters
    ----------
    text : str
        Raw comment text. Non-string values (e.g. ``None`` or the NaN pandas
        yields for an empty CSV cell) are treated as an empty comment.

    Returns
    -------
    str
        The cleaned comment; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)', clean_url, text)

    # Strip the leading `dnl` of each line BEFORE collapsing whitespace:
    # collapsing first turns every newline into a space, after which only a
    # `dnl` at the very start of the whole comment could ever be removed.
    if re.search(r'\bdnl\b', text):
        lines = []
        for line in text.split('\n'):
            tokens = line.split()
            if tokens and tokens[0] == 'dnl':
                line = line.replace('dnl', '', 1)
            lines.append(line)
        text = '\n'.join(lines)

    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9.\']+', ' ', text)
    return text.lower().strip()

In [None]:


# Keep only the comments whose cleaned text has at least `word_limit` words.
# The rows are selected with one boolean mask instead of per-row
# `DataFrame.append`, which was removed in pandas 2.0 and copied the whole
# frame on every call.
word_limit = 2
all_cleaned_comments = [clean_string(comment) for comment in SATD_comments_sum['comment']]
has_enough_words = np.array(
    [len(cleaned.split()) >= word_limit for cleaned in all_cleaned_comments],
    dtype=bool,
)

# Cleaned text of the kept comments only, in original row order.
cleaned_comments = [
    cleaned
    for cleaned, keep in zip(all_cleaned_comments, has_enough_words)
    if keep
]

# reset_index gives the same 0..n-1 labels that append(ignore_index=True)
# produced; a later cell drops rows using array positions as index labels.
SATD_comment_sum_remove_single_word = SATD_comments_sum.loc[has_enough_words].reset_index(drop=True)
SATD_comment_sum_remove_single_word['cleaned_comments'] = cleaned_comments



In [None]:
# (rows, columns) remaining after the minimum-word filter.
SATD_comment_sum_remove_single_word.shape

In [None]:
# Comments per `buildSysteminFile` value after the minimum-word filter.
SATD_comment_sum_remove_single_word['buildSysteminFile'].value_counts()

In [None]:
# Distinct `repoName` values left after the minimum-word filter.
SATD_comment_sum_remove_single_word['repoName'].nunique(dropna=False)

In [None]:
# Encode every cleaned comment with the sentence-transformer model.
comments_embeddings = model.encode(
    SATD_comment_sum_remove_single_word['cleaned_comments'].tolist()
)

# Pairwise cosine similarity between all comment embeddings.
csim = cosine_similarity(comments_embeddings)

# Positions of rows with at most one similarity >= 0.8 in their row
# (presumably just the comment's similarity to itself — verify).
less_than_0_8_indexes = np.nonzero((csim >= 0.8).sum(axis=1) <= 1)

# Remove the embeddings at those positions.
comments_embeddings = np.delete(comments_embeddings, less_than_0_8_indexes[0], axis=0)

In [None]:
# Drop the same rows that were removed from the embedding matrix, then
# renumber the index from 0.
SATD_comment_sum_remove_single_word_and_less_0_8 = (
    SATD_comment_sum_remove_single_word
    .drop(index=less_than_0_8_indexes[0])
    .reset_index(drop=True)
)
SATD_comment_sum_remove_single_word_and_less_0_8.shape

In [None]:
# Distinct `repoName` values left after the similarity filter.
SATD_comment_sum_remove_single_word_and_less_0_8['repoName'].nunique(dropna=False)

In [None]:
# Comments per `buildSysteminFile` value after the similarity filter.
SATD_comment_sum_remove_single_word_and_less_0_8['buildSysteminFile'].value_counts()

In [None]:
from sklearn.cluster import DBSCAN

# Cluster the remaining embeddings with DBSCAN on cosine distance;
# `clustering` holds one label per embedding row.
clustering = DBSCAN(
    metric='cosine',
    eps=0.1,
    min_samples=2,
    n_jobs=-1,
).fit_predict(comments_embeddings)

In [None]:
from sklearn.metrics import silhouette_score

# Attach each comment's cluster label as its clone-group id.
SATD_comment_sum_remove_single_word_and_less_0_8['groupId'] = clustering

# NOTE(review): rows labelled -1 (filtered out as noise in a later cell) are
# still passed here and scored as if they formed one cluster — confirm that
# this is intended.
silhouette_score(comments_embeddings, labels=clustering, metric='cosine')

In [None]:
# Keep only the comments whose group id is not -1.
SATD_comment_sum_remove_single_word_and_less_0_8_and_noise = (
    SATD_comment_sum_remove_single_word_and_less_0_8[
        SATD_comment_sum_remove_single_word_and_less_0_8['groupId'].ne(-1)
    ]
)
SATD_comment_sum_remove_single_word_and_less_0_8_and_noise.shape

In [None]:
# Distinct `repoName` values among the clustered comments.
SATD_comment_sum_remove_single_word_and_less_0_8_and_noise['repoName'].nunique(dropna=False)

In [None]:
# Clustered comments per `buildSysteminFile` value.
SATD_comment_sum_remove_single_word_and_less_0_8_and_noise['buildSysteminFile'].value_counts()

In [None]:
# Number of distinct clone groups.
SATD_comment_sum_remove_single_word_and_less_0_8_and_noise['groupId'].nunique(dropna=False)

In [None]:
import plotly.graph_objects as go

# Clone-group sizes, largest first.
# NOTE: `x` holds the sizes (drawn on the vertical axis) and `y` holds their
# ranks (drawn on the horizontal axis).
x = (
    SATD_comment_sum_remove_single_word_and_less_0_8_and_noise
    .groupby('groupId')
    .size()
    .sort_values(ascending=False)
    .tolist()
)
y = list(range(len(x)))

fig = go.Figure()
# Group size against rank.
fig.add_trace(go.Scatter(x=y, y=x, marker=dict(color='black'), showlegend=False))
# Vertical line at rank 286, the number of top groups exported in a later cell.
fig.add_trace(
    go.Scatter(x=[286, 286], y=[0, 5400], mode="lines", marker=dict(color='black'), showlegend=False)
)

fig.update_layout(
    width=1000,
    height=800,
    font=dict(size=18),
    paper_bgcolor='rgba(255,255,255,255)',
    plot_bgcolor='rgba(255,255,255,255)',
)
# NOTE(review): ',2r' below differs from the ',.2r' used for the y axis —
# confirm which format is intended before changing it.
fig.update_xaxes(
    title_text="Top clone groups",
    tickmode='array',
    tickvals=[286, 1000, 2000, 3000, 4000, 5000],
    tickformat=',2r',
)
fig.update_yaxes(
    title_text="Threshold (# of SATD)",
    type="log",
    tickmode='array',
    tickvals=[10, 100, 1000, 10000, 100000, 1000000],
    tickformat=',.2r',
)
fig.show()
fig.write_image('thresholds_clone_groups.pdf', scale=2)

In [None]:
# First row of each of the 286 largest clone groups, largest group first.
# The one-row frames are collected and concatenated once; concatenating
# inside the loop re-copied the accumulated frame on every iteration.
top_group_ids = (
    SATD_comment_sum_remove_single_word_and_less_0_8_and_noise
    .groupby('groupId')
    .size()
    .sort_values(ascending=False)
    .index.to_list()[:286]
)
representative_rows = [
    SATD_comment_sum_remove_single_word_and_less_0_8_and_noise
    .loc[SATD_comment_sum_remove_single_word_and_less_0_8_and_noise['groupId'] == group_id]
    .iloc[0]
    .to_frame()
    .T
    for group_id in top_group_ids
]
# pd.concat raises on an empty list; keep the original's empty-frame result.
top_286_SATD_clone_group = pd.concat(representative_rows) if representative_rows else pd.DataFrame()

In [None]:
# Sizes of the 286 largest groups, in the same descending order used to pick
# the representative rows.
top_286_SATD_clone_group['numberofSATDinGroup'] = (
    SATD_comment_sum_remove_single_word_and_less_0_8_and_noise
    .groupby('groupId')
    .size()
    .sort_values(ascending=False)
    .head(286)
    .tolist()
)

In [None]:
# Export the representative rows (index included); a later cell reads a
# separate '..._fix.csv' file that carries an `IsSATD` column.
top_286_SATD_clone_group.to_csv('../data/top_286_SATD_clone_group.csv')

In [None]:
# Load the checked file and total the comments in groups marked IsSATD == 'No'.
SATD_comments_manully_check = pd.read_csv('../data/top_286_SATD_clone_group_fix.csv')
SATD_comments_manully_check.loc[
    SATD_comments_manully_check['IsSATD'].eq('No'), 'numberofSATDinGroup'
].sum()

In [None]:
# Group ids marked IsSATD == 'No' in the checked file.
false_positives = list(
    SATD_comments_manully_check.loc[SATD_comments_manully_check['IsSATD'].eq('No'), 'groupId']
)

In [None]:
# Remove every comment that belongs to a false-positive group.
SATD_comments_exclude_FP = SATD_comment_sum_remove_single_word_and_less_0_8_and_noise[
    ~SATD_comment_sum_remove_single_word_and_less_0_8_and_noise['groupId'].isin(false_positives)
]
SATD_comments_exclude_FP.shape

In [None]:
# Comments per `buildSysteminFile` value after removing false-positive groups.
SATD_comments_exclude_FP['buildSysteminFile'].value_counts()

In [None]:
# Distinct `repoName` values after removing false-positive groups.
SATD_comments_exclude_FP['repoName'].nunique(dropna=False)

In [None]:
# Annotate every comment with three properties of its clone group:
#   repoDiversity   - number of distinct `repoName` values in the group
#   isCrossLanguage - group has at least one of CMake/Autotool AND at least
#                     one of Maven/Ant in `buildSysteminFile`
#   systemDiversity - number of distinct `buildSysteminFile` values
annotated_groups = []

for _, group in SATD_comments_exclude_FP.groupby('groupId'):
    # Work on a copy: assigning columns to the groupby slice itself triggers
    # SettingWithCopyWarning.
    group = group.copy()
    build_system_set = set(group['buildSysteminFile'])

    group['repoDiversity'] = len(set(group['repoName']))
    group['isCrossLanguage'] = (
        bool({'CMake', 'Autotool'} & build_system_set)
        and bool({'Maven', 'Ant'} & build_system_set)
    )
    group['systemDiversity'] = len(build_system_set)
    annotated_groups.append(group)

# Concatenate once instead of inside the loop (which re-copied the growing
# frame for every group); keep the empty-frame result when there are no groups.
SATD_comments = pd.concat(annotated_groups) if annotated_groups else pd.DataFrame()
        

In [None]:
# One row per clone group with its three annotations and its size in `count`.
SATD_groups = (
    SATD_comments
    .groupby(['groupId', 'repoDiversity', 'isCrossLanguage', 'systemDiversity'])
    .size()
    .reset_index(name='count')
)

In [None]:
# Groups whose comments all come from one repository.
SATD_groups_repository_dimension = SATD_groups[SATD_groups['repoDiversity'].eq(1)]

In [None]:
# Size statistics of groups confined to a single repository.
SATD_groups.loc[SATD_groups['repoDiversity'].eq(1), 'count'].describe()

In [None]:
# Total comments in groups confined to a single repository.
sum(SATD_groups.loc[SATD_groups['repoDiversity'].eq(1), 'count'])

In [None]:
# Size statistics of groups spanning more than one repository.
SATD_groups.loc[SATD_groups['repoDiversity'].ne(1), 'count'].describe()

In [None]:
# Total comments in groups spanning more than one repository.
sum(SATD_groups.loc[SATD_groups['repoDiversity'].ne(1), 'count'])

In [None]:
# Size statistics of groups not flagged `isCrossLanguage`.
SATD_groups.loc[SATD_groups['isCrossLanguage'].eq(False), 'count'].describe()

In [None]:
# Total comments in groups not flagged `isCrossLanguage`.
sum(SATD_groups.loc[SATD_groups['isCrossLanguage'].eq(False), 'count'])

In [None]:
# Size statistics of groups flagged `isCrossLanguage`.
SATD_groups.loc[SATD_groups['isCrossLanguage'].eq(True), 'count'].describe()

In [None]:
# Total comments in groups flagged `isCrossLanguage`.
sum(SATD_groups.loc[SATD_groups['isCrossLanguage'].eq(True), 'count'])

In [None]:
# Size statistics of groups with a single `buildSysteminFile` value.
SATD_groups.loc[SATD_groups['systemDiversity'].eq(1), 'count'].describe()

In [None]:
# Total comments in groups with a single `buildSysteminFile` value.
sum(SATD_groups.loc[SATD_groups['systemDiversity'].eq(1), 'count'])

In [None]:
# Size statistics of groups with more than one `buildSysteminFile` value.
SATD_groups.loc[SATD_groups['systemDiversity'].ne(1), 'count'].describe()

In [None]:
# Total comments in groups with more than one `buildSysteminFile` value.
sum(SATD_groups.loc[SATD_groups['systemDiversity'].ne(1), 'count'])

In [None]:
# Comments per build system, restricted to single-build-system groups.
SATD_comments.loc[SATD_comments['systemDiversity'].eq(1), 'buildSysteminFile'].value_counts()

In [None]:
# Number of distinct clone groups per build system, restricted to
# single-build-system groups.
(
    SATD_comments.loc[SATD_comments['systemDiversity'].eq(1)]
    .groupby(['buildSysteminFile'])['groupId']
    .agg(lambda group_ids: len(set(group_ids)))
)

In [None]:
from collections import Counter

# For comments in single-build-system groups: one row per build system whose
# `groupId` cell is a dict {group id: number of comments in that group}.
same_system_clone = (
    SATD_comments[SATD_comments['systemDiversity'] == 1]
    .groupby(['buildSysteminFile'])['groupId']
    .agg(lambda x: dict(Counter(x)))
    .to_frame()
)

# Select the build system by label rather than by position: `iloc[0]` is only
# 'Ant' while every build system is present and sorts in the assumed order.
# A missing label now raises KeyError instead of silently reporting another
# build system's statistics.
same_system_clone_Ant = pd.DataFrame(list(same_system_clone.loc['Ant', 'groupId'].values()),
                                     columns=['values'])
same_system_clone_Ant['values'].describe()

In [None]:
# Group sizes for 'Autotool', selected by label: the positional `iloc[1]`
# silently points at a different build system when one is missing or an
# extra one sorts earlier. A missing label raises KeyError.
same_system_clone_Autotool = pd.DataFrame(list(same_system_clone.loc['Autotool', 'groupId'].values()),
                                          columns=['values'])
same_system_clone_Autotool['values'].describe()

In [None]:
# Group sizes for 'CMake', selected by label: the positional `iloc[2]`
# silently points at a different build system when one is missing or an
# extra one sorts earlier. A missing label raises KeyError.
same_system_clone_CMake = pd.DataFrame(list(same_system_clone.loc['CMake', 'groupId'].values()),
                                       columns=['values'])
same_system_clone_CMake['values'].describe()

In [None]:
# Group sizes for 'Maven', selected by label: the positional `iloc[3]`
# raises IndexError or points at a different build system when one is
# missing or an extra one sorts earlier. A missing label raises KeyError.
same_system_clone_Maven = pd.DataFrame(list(same_system_clone.loc['Maven', 'groupId'].values()),
                                       columns=['values'])
same_system_clone_Maven['values'].describe()

In [None]:
# Write all annotated comments to CSV without the index column.
SATD_comments.to_csv('../data/SATD_clones.csv',index=None)

In [None]:
# Comments in single-build-system groups. `reset_index()` without
# `drop=True` keeps the previous index as a new `index` column.
SATD_comments_clone_same_system = (
    SATD_comments[SATD_comments['systemDiversity'].eq(1)]
    .reset_index()
)

In [None]:
# First row of every single-build-system clone group, largest group first.
# The one-row frames are collected and concatenated once; concatenating
# inside the loop re-copied the accumulated frame for every group.
ordered_group_ids = (
    SATD_comments_clone_same_system
    .groupby('groupId')
    .size()
    .sort_values(ascending=False)
    .index.to_list()
)
first_rows = [
    SATD_comments_clone_same_system
    .loc[SATD_comments_clone_same_system['groupId'] == group_id]
    .iloc[0]
    .to_frame()
    .T
    for group_id in ordered_group_ids
]
# pd.concat raises on an empty list; keep the original's empty-frame result.
SATD_clones = pd.concat(first_rows) if first_rows else pd.DataFrame()

In [None]:
# Group sizes in the same descending order used to pick the first rows.
SATD_clones['numberofSATDinGroup'] = (
    SATD_comments_clone_same_system
    .groupby('groupId')
    .size()
    .sort_values(ascending=False)
    .to_list()
)

In [None]:
# Write one row per single-build-system clone group to CSV without the index column.
SATD_clones.to_csv('../data/SATD_clones_coding.csv',index=None)