In [1]:
from tqdm import tqdm
import pandas as pd
import string
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from sentence_transformers import SentenceTransformer
import pickle
import re
import itertools
import datetime
import plotly.graph_objects as go
from plotly.subplots import make_subplots
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

data_filter_repolist_with_full_info = pd.read_csv("../data/data_filter_repolist_with_full_info_no_header.csv",header=None)

SATD_comment_sum = pd.DataFrame()

for index, row in tqdm(data_filter_repolist_with_full_info.iterrows(), total=data_filter_repolist_with_full_info.shape[0]):
    comment = pd.read_csv('../data/SATD_comment/' + str(row[0]) + '.csv')
    SATD_comment_sum = pd.concat([SATD_comment_sum, comment], ignore_index=True)


100%|███████████████████████████████████████████████████████████| 6504/6504 [02:02<00:00, 53.18it/s]


In [4]:
SATD_comment_sum['filePath'] = SATD_comment_sum['linkLocation'].apply(lambda x: ''.join(x.split('#L')[:-1]))

In [5]:
Non_SATD_comment_sum = pd.DataFrame()

for index in tqdm(SATD_comment_sum['repoIndex'].unique()):
    comment = pd.read_csv('../data/Comments_with_no_keywords/' + str(index) + '.csv')
    Non_SATD_comment_sum = pd.concat([Non_SATD_comment_sum, comment], ignore_index=True)


100%|███████████████████████████████████████████████████████████| 4641/4641 [19:36<00:00,  3.95it/s]


In [6]:
Non_SATD_comment_sum['filePath'] = Non_SATD_comment_sum['linkLocation'].apply(lambda x: ''.join(x.split('#L')[:-1]))

In [7]:
len(Non_SATD_comment_sum['repoIndex'].unique())

4641

In [8]:
Non_SATD_comment_sum['buildSysteminFile'].value_counts()

Autotool    1205319
CMake        949424
Maven        161913
Ant           44482
Ivy             248
Name: buildSysteminFile, dtype: int64

In [9]:
Non_SATD_comment_sum_same_file = Non_SATD_comment_sum.loc[Non_SATD_comment_sum['filePath'].isin(list(SATD_comment_sum['filePath'].unique()))]

In [10]:
Non_SATD_comment_sum_sample = pd.DataFrame(columns=Non_SATD_comment_sum_same_file.columns, dtype=object)

unexptected_case = {}
for index, row in SATD_comment_sum.iterrows():
    comments = Non_SATD_comment_sum_same_file.loc[(Non_SATD_comment_sum_same_file['filePath'] == row['filePath']) & (~Non_SATD_comment_sum_same_file.index.isin(list(Non_SATD_comment_sum_sample.index)))]
    if comments.shape[0] == 0:
        if row['filePath'] in unexptected_case.keys():
            unexptected_case[row['filePath']] += 1
        else: unexptected_case[row['filePath']] = 0
        continue
    line_diff_upper = [line for line in list(comments['startLine']) if row['startLine'] > line]
    indexes = []
    if len(line_diff_upper) != 0:
        indexes.append(line_diff_upper.index(max(line_diff_upper)))
    line_diff_below = [line for line in list(comments['endLine']) if row['endLine'] < line]
    if len(line_diff_below) != 0:
        indexes.append(line_diff_below.index(min(line_diff_below)))
    for i in indexes:
        Non_SATD_comment_sum_sample = Non_SATD_comment_sum_sample.append(comments.iloc[i])


In [11]:
df = Non_SATD_comment_sum_sample.drop_duplicates()

In [12]:
df['buildSysteminFile'].value_counts()

Autotool    67574
CMake       41447
Maven        3465
Ant           935
Ivy            43
Name: buildSysteminFile, dtype: int64

In [13]:
len(df['repoIndex'].unique())

4588

In [14]:
import re
def clean_url(text):
    text = ''.join([word if word not in string.punctuation else ' ' for word in text.group(0)])
    return text

def clean_string(text):
    text = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)',clean_url, text)
    text = re.sub(r'\s+',' ', text)
    if len(re.findall(r'\bdnl\b', text)) > 0:
        lines = []
        for line in text.split('\n'):
            if len(line.split()) > 0:
                if 'dnl' == line.split()[0]:
                    line = line.replace('dnl', '', 1)
            lines.append(line)
        text ='\n'.join(lines)
    text = re.sub(r'[^A-Za-z0-9.\']+',' ',text)
    text = text.lower()
    text = text.strip()
    return text

In [15]:
cleaned_comments = []
Non_SATD_comment_sample_remove_single_word = pd.DataFrame(columns = df.columns, dtype=object )
for index, row in df.iterrows():
    cleaned_comment = clean_string(row['comment'])
    word_limit = 2
    if len(cleaned_comment.split()) >= word_limit:
        cleaned_comments.append(cleaned_comment)
        Non_SATD_comment_sample_remove_single_word = Non_SATD_comment_sample_remove_single_word.append(row, ignore_index=True)
Non_SATD_comment_sample_remove_single_word['cleaned_comments'] = cleaned_comments



In [16]:
Non_SATD_comment_sample_remove_single_word['buildSysteminFile'].value_counts()

Autotool    53742
CMake       39617
Maven        3124
Ant           862
Ivy            41
Name: buildSysteminFile, dtype: int64

In [17]:
len(Non_SATD_comment_sample_remove_single_word['repoName'].unique())

4549

In [18]:
comments_embeddings = model.encode(list(Non_SATD_comment_sample_remove_single_word['cleaned_comments']))

In [19]:
csim = cosine_similarity(comments_embeddings)

In [20]:
less_than_0_8_indexes = np.where(np.count_nonzero(csim >= 0.8, axis=1) <= 1)
comments_embeddings = np.delete(comments_embeddings,less_than_0_8_indexes[0],axis = 0 )
Non_SATD_comment_sample_remove_single_word_and_less_0_8 = Non_SATD_comment_sample_remove_single_word.drop(less_than_0_8_indexes[0])
Non_SATD_comment_sample_remove_single_word_and_less_0_8.reset_index(drop=True, inplace=True)
len(Non_SATD_comment_sample_remove_single_word_and_less_0_8['repoName'].unique())

4055

In [21]:
Non_SATD_comment_sample_remove_single_word_and_less_0_8['buildSysteminFile'].value_counts()

Autotool    51509
CMake       29402
Maven        1985
Ant           560
Ivy             5
Name: buildSysteminFile, dtype: int64

In [22]:
from sklearn.cluster import DBSCAN
clustering = DBSCAN(eps=0.1, min_samples=2, n_jobs = -1, metric='cosine').fit_predict(comments_embeddings)

In [23]:
Non_SATD_comment_sample_remove_single_word_and_less_0_8['groupId'] = clustering

In [24]:
from sklearn.metrics import silhouette_score
silhouette_score(comments_embeddings, clustering, metric='cosine')

0.8937854

In [25]:
Non_SATD_comment_sample_remove_single_word_and_less_0_8_and_noise = Non_SATD_comment_sample_remove_single_word_and_less_0_8.loc[Non_SATD_comment_sample_remove_single_word_and_less_0_8['groupId'] != -1]

In [26]:
len(Non_SATD_comment_sample_remove_single_word_and_less_0_8_and_noise['repoName'].unique())

3780

In [27]:
Non_SATD_comment_sample_remove_single_word_and_less_0_8_and_noise['buildSysteminFile'].value_counts()

Autotool    51062
CMake       27712
Maven        1791
Ant           544
Name: buildSysteminFile, dtype: int64

In [28]:
Non_SATD_comments = pd.DataFrame()

for name, group in Non_SATD_comment_sample_remove_single_word_and_less_0_8_and_noise.groupby('groupId'):
    repo_diversity = len(set(list(group['repoName'])))
    group['repoDiversity'] = [repo_diversity] * group.shape[0]
    build_system_set = set(list(group['buildSysteminFile']))
    if set(['CMake','Autotool']).intersection(build_system_set) and set(['Maven','Ant']).intersection(build_system_set):
        group['isCrossLanguage'] = [True] * group.shape[0]
    else:
        group['isCrossLanguage'] = [False] * group.shape[0]
    system_diversity = len(build_system_set)
    group['systemDiversity'] = [system_diversity] * group.shape[0]
    Non_SATD_comments = pd.concat([Non_SATD_comments,group])

        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['repoDiversity'] = [repo_diversity] * group.shape[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['isCrossLanguage'] = [False] * group.shape[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['systemDiversity'] = [system_diversity] * group.shape[0]
A value is trying to be set on a c

In [29]:
Non_SATD_groups = pd.DataFrame({'count' : Non_SATD_comments.groupby(['groupId','repoDiversity','isCrossLanguage','systemDiversity']).size()}).reset_index()

In [30]:
Non_SATD_groups.loc[Non_SATD_groups['repoDiversity'] == 1]['count'].describe()

count    1211.000000
mean        3.678778
std         5.133430
min         2.000000
25%         2.000000
50%         2.000000
75%         3.000000
max        84.000000
Name: count, dtype: float64

In [31]:
sum(Non_SATD_groups.loc[Non_SATD_groups['repoDiversity'] == 1]['count'])

4455

In [32]:
Non_SATD_groups.loc[Non_SATD_groups['repoDiversity'] != 1]['count'].describe()

count    6157.000000
mean       12.449894
std        69.421757
min         2.000000
25%         2.000000
50%         3.000000
75%         6.000000
max      2427.000000
Name: count, dtype: float64

In [33]:
sum(Non_SATD_groups.loc[Non_SATD_groups['repoDiversity'] != 1]['count'])

76654

In [34]:
Non_SATD_groups.loc[Non_SATD_groups['isCrossLanguage'] == False]['count'].describe()

count    7332.000000
mean       10.743181
std        62.989773
min         2.000000
25%         2.000000
50%         3.000000
75%         5.000000
max      2427.000000
Name: count, dtype: float64

In [35]:
sum(Non_SATD_groups.loc[Non_SATD_groups['isCrossLanguage'] == False]['count'])

78769

In [36]:
Non_SATD_groups.loc[Non_SATD_groups['isCrossLanguage'] == True]['count'].describe()

count     36.000000
mean      65.000000
std      129.293907
min        2.000000
25%        3.000000
50%        7.500000
75%       44.250000
max      598.000000
Name: count, dtype: float64

In [37]:
sum(Non_SATD_groups.loc[Non_SATD_groups['isCrossLanguage'] == True]['count'])

2340

In [38]:
Non_SATD_groups.loc[Non_SATD_groups['systemDiversity'] == 1]['count'].describe()

count    7249.000000
mean       10.665747
std        63.185616
min         2.000000
25%         2.000000
50%         3.000000
75%         5.000000
max      2427.000000
Name: count, dtype: float64

In [39]:
sum(Non_SATD_groups.loc[Non_SATD_groups['systemDiversity'] == 1]['count'])

77316

In [40]:
Non_SATD_groups.loc[Non_SATD_groups['systemDiversity'] != 1]['count'].describe()

count    119.000000
mean      31.873950
std       81.724481
min        2.000000
25%        3.000000
50%        6.000000
75%       16.500000
max      598.000000
Name: count, dtype: float64

In [41]:
sum(Non_SATD_groups.loc[Non_SATD_groups['systemDiversity'] != 1]['count'])

3793

In [42]:
Non_SATD_comments[Non_SATD_comments['systemDiversity'] == 1]['buildSysteminFile'].value_counts()

Autotool    49955
CMake       25811
Maven        1178
Ant           372
Name: buildSysteminFile, dtype: int64

In [43]:
Non_SATD_comments[Non_SATD_comments['systemDiversity'] == 1].groupby(['buildSysteminFile'])['groupId'].agg(lambda x: len(set(x)))

buildSysteminFile
Ant           73
Autotool    1962
CMake       4889
Maven        325
Name: groupId, dtype: int64

In [44]:
from collections import Counter

same_system_clone = Non_SATD_comments[Non_SATD_comments['systemDiversity'] == 1].groupby(['buildSysteminFile'])['groupId'].agg(lambda x: {key:value for key, value in dict(Counter(x)).items()}).to_frame()

same_system_clone_Ant = pd.DataFrame(same_system_clone.iloc[0]['groupId'].values(),
                                              columns =['values'])
same_system_clone_Ant['values'].describe()

count    73.000000
mean      5.095890
std       5.074896
min       2.000000
25%       2.000000
50%       2.000000
75%       7.000000
max      29.000000
Name: values, dtype: float64

In [45]:
same_system_clone_Autotool = pd.DataFrame(same_system_clone.iloc[1]['groupId'].values(),
                                              columns =['values'])
same_system_clone_Autotool['values'].describe()

count    1962.000000
mean       25.461264
std       119.036454
min         2.000000
25%         2.000000
50%         4.000000
75%         9.000000
max      2427.000000
Name: values, dtype: float64

In [46]:
same_system_clone_CMake = pd.DataFrame(same_system_clone.iloc[2]['groupId'].values(),
                                              columns =['values'])
same_system_clone_CMake['values'].describe()

count    4889.000000
mean        5.279403
std        10.653891
min         2.000000
25%         2.000000
50%         3.000000
75%         5.000000
max       333.000000
Name: values, dtype: float64

In [47]:
same_system_clone_Maven = pd.DataFrame(same_system_clone.iloc[3]['groupId'].values(),
                                              columns =['values'])
same_system_clone_Maven['values'].describe()

count    325.000000
mean       3.624615
std        3.492586
min        2.000000
25%        2.000000
50%        2.000000
75%        4.000000
max       28.000000
Name: values, dtype: float64

In [48]:
Non_SATD_comments.to_csv('../data/Non_SATD_clones.csv',index=None)